Files
rdp-proxy/backend/internal/modules/cluster/service.go
T
2026-05-12 21:02:29 +03:00

12527 lines
482 KiB
Go

package cluster
import (
"context"
"crypto/rand"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"net"
"net/url"
"sort"
"strings"
"sync"
"time"
"github.com/jackc/pgx/v5"
"github.com/example/remote-access-platform/backend/internal/platform/clusterauth"
)
// Sentinel errors returned by the cluster service. Callers should compare
// against them with errors.Is.
var (
// ErrAccessDenied signals that the acting user lacks the platform admin role.
ErrAccessDenied = errors.New("platform admin role is required")
// ErrInvalidPayload signals a malformed request, e.g. a cluster without slug or name.
ErrInvalidPayload = errors.New("invalid cluster payload")
// ErrInvalidJoinToken signals a join token that is unknown or no longer valid.
ErrInvalidJoinToken = errors.New("invalid or expired join token")
// ErrInvalidNodeRole signals an unsupported node role.
ErrInvalidNodeRole = errors.New("invalid node role")
// ErrInvalidCluster signals a blank cluster ID or a cluster the store could not find.
ErrInvalidCluster = errors.New("cluster not found")
// ErrInvalidJoinRequest signals that the referenced join request does not exist.
ErrInvalidJoinRequest = errors.New("join request not found")
// ErrClusterReadOnly signals that policy mutation is not allowed on this cluster.
ErrClusterReadOnly = errors.New("cluster is not authoritative for policy mutation")
// ErrInvalidVPNConnection signals that the referenced VPN connection does not exist.
ErrInvalidVPNConnection = errors.New("vpn connection not found")
// ErrInvalidVPNLease signals that the referenced VPN connection lease does not exist.
ErrInvalidVPNLease = errors.New("vpn connection lease not found")
// ErrVPNLeaseAlreadyActive signals that the VPN connection already holds an active lease.
ErrVPNLeaseAlreadyActive = errors.New("vpn connection already has an active lease")
// ErrVPNLeaseOwnerNotAllowed signals that the requested lease owner is rejected.
ErrVPNLeaseOwnerNotAllowed = errors.New("vpn lease owner is not allowed")
// ErrVPNLeaseOwnerRoleRequired signals that the lease owner lacks an active vpn-exit or vpn-connector role.
ErrVPNLeaseOwnerRoleRequired = errors.New("vpn lease owner requires active vpn-exit or vpn-connector role")
)
// Service implements the cluster module's operations on top of a Repository.
// It embeds a sync.Mutex by value, so it must be used through a pointer.
type Service struct {
// store is the persistence backend used for every cluster read and write.
store Repository
// now supplies the current time; NewService sets it to time.Now().UTC().
now func() time.Time
// NOTE(review): presumably guards fabricServiceChannelLeaseCache; the
// locking code is outside this view — confirm.
fabricServiceChannelLeaseMu sync.Mutex
// fabricServiceChannelLeaseCache holds leases keyed by a string whose
// meaning is not visible in this part of the file.
fabricServiceChannelLeaseCache map[string]FabricServiceChannelLease
}
// fabricServiceChannelFeedbackMaxAge is a two-minute bound; judging by the
// name it limits how old service channel feedback may be — usage is outside
// this view, confirm.
const fabricServiceChannelFeedbackMaxAge = 2 * time.Minute
// fabricServiceChannelOperatorExpireCooldown is a two-minute cooldown;
// presumably applied after an operator-initiated expiry — confirm at the use site.
const fabricServiceChannelOperatorExpireCooldown = 2 * time.Minute
// NewService returns a Service that persists through store. The service clock
// reports the current time in UTC and the lease cache starts out empty.
func NewService(store Repository) *Service {
	svc := &Service{
		store:                          store,
		fabricServiceChannelLeaseCache: make(map[string]FabricServiceChannelLease),
	}
	svc.now = func() time.Time {
		return time.Now().UTC()
	}
	return svc
}
// Schema identifiers for the authority payload types declared below; each
// payload carries one of them in its schema_version field (presumably — the
// code that fills the payloads is outside this view).
const (
// clusterJoinTokenAuthoritySchema identifies join token payloads.
clusterJoinTokenAuthoritySchema = "rap.cluster.join_token.v1"
// clusterNodeApprovalAuthoritySchema identifies node approval payloads.
clusterNodeApprovalAuthoritySchema = "rap.cluster.node_approval.v1"
// clusterMeshConfigAuthoritySchema identifies mesh config snapshot payloads.
clusterMeshConfigAuthoritySchema = "rap.cluster.mesh_config_snapshot.v1"
)
// clusterJoinTokenAuthorityPayload is the JSON document describing an issued
// cluster join token.
// NOTE(review): presumably this is the payload signed by the cluster
// authority; the signing code is outside this view — confirm.
type clusterJoinTokenAuthorityPayload struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
TokenID string `json:"token_id"`
// Scope is kept as raw JSON and embedded verbatim when encoding.
Scope json.RawMessage `json:"scope"`
ExpiresAt time.Time `json:"expires_at"`
MaxUses int `json:"max_uses"`
// CreatedByUserID is omitted from the JSON output when nil.
CreatedByUserID *string `json:"created_by_user_id,omitempty"`
IssuedAt time.Time `json:"issued_at"`
ControlPlaneOnly bool `json:"control_plane_only"`
ProductionForwarding bool `json:"production_forwarding"`
}
// clusterNodeApprovalAuthorityPayload is the JSON document describing the
// approval of a node join request: which node was approved, for which cluster
// and join request, and by which user.
// NOTE(review): presumably signed by the cluster authority — confirm at the use site.
type clusterNodeApprovalAuthorityPayload struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
JoinRequestID string `json:"join_request_id"`
NodeID string `json:"node_id"`
NodeFingerprint string `json:"node_fingerprint"`
IdentityStatus string `json:"identity_status"`
HeartbeatEndpoint string `json:"heartbeat_endpoint"`
ApprovedByUserID string `json:"approved_by_user_id"`
IssuedAt time.Time `json:"issued_at"`
ControlPlaneOnly bool `json:"control_plane_only"`
ProductionForwarding bool `json:"production_forwarding"`
}
// clusterMeshConfigAuthorityPayload is the JSON document describing one mesh
// configuration snapshot for a node: its version, its SHA-256 digest and its
// validity window.
// NOTE(review): presumably signed by the cluster authority — confirm at the use site.
type clusterMeshConfigAuthorityPayload struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
LocalNodeID string `json:"local_node_id"`
ConfigVersion string `json:"config_version"`
ConfigSHA256 string `json:"config_sha256"`
IssuedAt time.Time `json:"issued_at"`
ExpiresAt time.Time `json:"expires_at"`
ControlPlaneOnly bool `json:"control_plane_only"`
ProductionForwarding bool `json:"production_forwarding"`
}
// ListClusters returns every cluster known to the store. The actor must pass
// the platform admin check; otherwise that check's error is returned.
func (s *Service) ListClusters(ctx context.Context, actorUserID string) ([]Cluster, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	return s.store.ListClusters(ctx)
}
// GetCluster loads a single cluster by ID on behalf of actorUserID.
//
// The actor must pass the platform admin check. When the store reports
// pgx.ErrNoRows the result is ErrInvalidCluster; any other store error is
// returned unchanged. On every error path the returned Cluster is the zero
// value, matching the other getters in this file, so callers never observe a
// partially populated cluster next to a non-nil error.
func (s *Service) GetCluster(ctx context.Context, actorUserID, clusterID string) (Cluster, error) {
	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return Cluster{}, err
	}
	item, err := s.store.GetCluster(ctx, clusterID)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return Cluster{}, ErrInvalidCluster
	case err != nil:
		return Cluster{}, err
	}
	return item, nil
}
// GetFabricServiceChannelRecoveryPolicy returns the recovery policy derived
// from the metadata of the cluster identified by clusterID (whitespace around
// the ID is ignored). The actor must pass the platform admin check; a cluster
// the store cannot find yields ErrInvalidCluster.
func (s *Service) GetFabricServiceChannelRecoveryPolicy(ctx context.Context, actorUserID, clusterID string) (FabricServiceChannelRecoveryPolicy, error) {
	var none FabricServiceChannelRecoveryPolicy
	if authErr := s.ensurePlatformAdmin(ctx, actorUserID); authErr != nil {
		return none, authErr
	}
	found, lookupErr := s.store.GetCluster(ctx, strings.TrimSpace(clusterID))
	switch {
	case errors.Is(lookupErr, pgx.ErrNoRows):
		return none, ErrInvalidCluster
	case lookupErr != nil:
		return none, lookupErr
	}
	return fabricServiceChannelRecoveryPolicyFromCluster(found), nil
}
// UpdateFabricServiceChannelRecoveryPolicy applies the overrides in input to
// the cluster's recovery policy, stores the result inside the cluster
// metadata and returns the policy as read back from the updated cluster.
//
// The actor must pass ensurePlatformAdmin and the cluster must pass
// ensureClusterMutable. A blank cluster ID or a cluster the store cannot find
// yields ErrInvalidCluster. Numeric inputs that are zero or negative and nil
// boolean inputs leave the current value untouched; positive numeric inputs
// are clamped to their allowed range.
func (s *Service) UpdateFabricServiceChannelRecoveryPolicy(ctx context.Context, input UpdateFabricServiceChannelRecoveryPolicyInput) (FabricServiceChannelRecoveryPolicy, error) {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return FabricServiceChannelRecoveryPolicy{}, err
	}
	input.ClusterID = strings.TrimSpace(input.ClusterID)
	if input.ClusterID == "" {
		return FabricServiceChannelRecoveryPolicy{}, ErrInvalidCluster
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return FabricServiceChannelRecoveryPolicy{}, err
	}
	cluster, err := s.store.GetCluster(ctx, input.ClusterID)
	if errors.Is(err, pgx.ErrNoRows) {
		return FabricServiceChannelRecoveryPolicy{}, ErrInvalidCluster
	}
	if err != nil {
		return FabricServiceChannelRecoveryPolicy{}, err
	}

	// Start from the currently effective policy and overlay what was supplied.
	policy := fabricServiceChannelRecoveryPolicyFromCluster(cluster)
	if input.HysteresisPenalty > 0 {
		policy.HysteresisPenalty = clampInt(input.HysteresisPenalty, 0, 10000)
	}
	if input.PromotionMinSamples > 0 {
		policy.PromotionMinSamples = clampInt(input.PromotionMinSamples, 1, 100000)
	}
	if input.DemotionFailureThreshold > 0 {
		policy.DemotionFailureThreshold = clampInt(input.DemotionFailureThreshold, 1, 100000)
	}
	if input.DemotionDropThreshold > 0 {
		policy.DemotionDropThreshold = clampInt(input.DemotionDropThreshold, 1, 100000)
	}
	if input.DemotionSlowThreshold > 0 {
		policy.DemotionSlowThreshold = clampInt(input.DemotionSlowThreshold, 1, 100000)
	}
	if input.DemotionRebuildEnabled != nil {
		policy.DemotionRebuildEnabled = *input.DemotionRebuildEnabled
	}
	if input.DemotionFencedEnabled != nil {
		policy.DemotionFencedEnabled = *input.DemotionFencedEnabled
	}

	now := s.now().UTC()
	policy.SchemaVersion = "rap.fabric_service_channel_recovery_policy.v1"
	policy.Source = "cluster_metadata"
	policy.UpdatedByUserID = &input.ActorUserID
	policy.UpdatedAt = now
	policy.ControlPlaneOnly = true
	policy.ProductionForwarding = false
	// Re-normalize before persisting so the stored (and audited) fingerprint
	// describes the new values instead of the policy that was loaded above.
	// The pool and breadcrumb window updates in this file follow the same order.
	policy = normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy())

	metadata, err := upsertFabricServiceChannelRecoveryPolicyMetadata(cluster.Metadata, policy)
	if err != nil {
		return FabricServiceChannelRecoveryPolicy{}, err
	}
	updated, err := s.store.UpdateCluster(ctx, UpdateClusterInput{
		ActorUserID: input.ActorUserID,
		ClusterID:   cluster.ID,
		Name:        cluster.Name,
		Status:      cluster.Status,
		Region:      cluster.Region,
		Metadata:    metadata,
	})
	if err != nil {
		return FabricServiceChannelRecoveryPolicy{}, err
	}
	// Auditing is best-effort: the update has already been stored, so a failed
	// audit write must not turn the call into an error.
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &cluster.ID,
		ActorUserID: &input.ActorUserID,
		EventType:   "fabric.service_channel.recovery_policy.updated",
		TargetType:  "cluster",
		TargetID:    &cluster.ID,
		Payload:     metadata,
		CreatedAt:   now,
	})
	return fabricServiceChannelRecoveryPolicyFromCluster(updated), nil
}
// GetFabricServiceChannelAdaptivePolicy returns the adaptive policy derived
// from the metadata of the cluster identified by clusterID (whitespace around
// the ID is ignored). The actor must pass the platform admin check; a cluster
// the store cannot find yields ErrInvalidCluster.
func (s *Service) GetFabricServiceChannelAdaptivePolicy(ctx context.Context, actorUserID, clusterID string) (FabricServiceChannelAdaptivePolicy, error) {
	var none FabricServiceChannelAdaptivePolicy
	if authErr := s.ensurePlatformAdmin(ctx, actorUserID); authErr != nil {
		return none, authErr
	}
	found, lookupErr := s.store.GetCluster(ctx, strings.TrimSpace(clusterID))
	switch {
	case errors.Is(lookupErr, pgx.ErrNoRows):
		return none, ErrInvalidCluster
	case lookupErr != nil:
		return none, lookupErr
	}
	return fabricServiceChannelAdaptivePolicyFromCluster(found), nil
}
// UpdateFabricServiceChannelAdaptivePolicy applies the overrides in input to
// the cluster's adaptive policy, stores the result inside the cluster
// metadata and returns the policy as read back from the updated cluster.
//
// The actor must pass ensurePlatformAdmin and the cluster must pass
// ensureClusterMutable. A blank cluster ID or a cluster the store cannot find
// yields ErrInvalidCluster. Numeric inputs that are zero or negative and an
// empty ClassWindows map leave the current value untouched.
func (s *Service) UpdateFabricServiceChannelAdaptivePolicy(ctx context.Context, input UpdateFabricServiceChannelAdaptivePolicyInput) (FabricServiceChannelAdaptivePolicy, error) {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return FabricServiceChannelAdaptivePolicy{}, err
	}
	input.ClusterID = strings.TrimSpace(input.ClusterID)
	if input.ClusterID == "" {
		return FabricServiceChannelAdaptivePolicy{}, ErrInvalidCluster
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return FabricServiceChannelAdaptivePolicy{}, err
	}
	cluster, err := s.store.GetCluster(ctx, input.ClusterID)
	if errors.Is(err, pgx.ErrNoRows) {
		return FabricServiceChannelAdaptivePolicy{}, ErrInvalidCluster
	}
	if err != nil {
		return FabricServiceChannelAdaptivePolicy{}, err
	}

	// Start from the currently effective policy and overlay what was supplied.
	policy := fabricServiceChannelAdaptivePolicyFromCluster(cluster)
	if input.MaxParallelWindow > 0 {
		policy.MaxParallelWindow = clampInt(input.MaxParallelWindow, 1, 64)
	}
	if input.BulkPressureChannelThreshold > 0 {
		policy.BulkPressureChannelThreshold = clampInt(input.BulkPressureChannelThreshold, 1, 100000)
	}
	if input.QueuePressureHighWatermark > 0 {
		policy.QueuePressureHighWatermark = clampInt(input.QueuePressureHighWatermark, 1, 100000)
	}
	if input.QueuePressureMaxInFlight > 0 {
		policy.QueuePressureMaxInFlight = clampInt(input.QueuePressureMaxInFlight, 1, 100000)
	}
	if len(input.ClassWindows) > 0 {
		// Class windows are bounded by the (possibly just updated) maximum.
		policy.ClassWindows = normalizeFabricServiceChannelAdaptiveClassWindows(input.ClassWindows, policy.MaxParallelWindow)
	}

	now := s.now().UTC()
	policy.SchemaVersion = "rap.fabric_service_channel_adaptive_policy.v1"
	policy.Source = "cluster_metadata"
	policy.UpdatedByUserID = &input.ActorUserID
	policy.UpdatedAt = now
	policy.ControlPlaneOnly = true
	policy.ProductionForwarding = false
	// Re-normalize before persisting: this re-clamps the existing class
	// windows when only MaxParallelWindow changed and refreshes the
	// fingerprint so the stored (and audited) document matches its values.
	// The pool and breadcrumb window updates in this file follow the same order.
	policy = normalizeFabricServiceChannelAdaptivePolicy(policy, defaultFabricServiceChannelAdaptivePolicy())

	metadata, err := upsertFabricServiceChannelAdaptivePolicyMetadata(cluster.Metadata, policy)
	if err != nil {
		return FabricServiceChannelAdaptivePolicy{}, err
	}
	updated, err := s.store.UpdateCluster(ctx, UpdateClusterInput{
		ActorUserID: input.ActorUserID,
		ClusterID:   cluster.ID,
		Name:        cluster.Name,
		Status:      cluster.Status,
		Region:      cluster.Region,
		Metadata:    metadata,
	})
	if err != nil {
		return FabricServiceChannelAdaptivePolicy{}, err
	}
	// Auditing is best-effort: the update has already been stored, so a failed
	// audit write must not turn the call into an error.
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &cluster.ID,
		ActorUserID: &input.ActorUserID,
		EventType:   "fabric.service_channel.adaptive_policy.updated",
		TargetType:  "cluster",
		TargetID:    &cluster.ID,
		Payload:     metadata,
		CreatedAt:   now,
	})
	return fabricServiceChannelAdaptivePolicyFromCluster(updated), nil
}
// GetFabricServiceChannelPoolPolicy returns the pool policy derived from the
// metadata of the cluster identified by clusterID (whitespace around the ID is
// ignored). The actor must pass the platform admin check; a cluster the store
// cannot find yields ErrInvalidCluster.
func (s *Service) GetFabricServiceChannelPoolPolicy(ctx context.Context, actorUserID, clusterID string) (FabricServiceChannelPoolPolicy, error) {
	var none FabricServiceChannelPoolPolicy
	if authErr := s.ensurePlatformAdmin(ctx, actorUserID); authErr != nil {
		return none, authErr
	}
	found, lookupErr := s.store.GetCluster(ctx, strings.TrimSpace(clusterID))
	switch {
	case errors.Is(lookupErr, pgx.ErrNoRows):
		return none, ErrInvalidCluster
	case lookupErr != nil:
		return none, lookupErr
	}
	return fabricServiceChannelPoolPolicyFromCluster(found), nil
}
// UpdateFabricServiceChannelPoolPolicy rewrites the cluster's pool policy from
// input, stores it inside the cluster metadata and returns the policy as read
// back from the updated cluster.
//
// The actor must pass ensurePlatformAdmin and the cluster must pass
// ensureClusterMutable. A blank cluster ID or a cluster the store cannot find
// yields ErrInvalidCluster. Pool node lists and preferred node IDs are always
// taken from input; mode strings are only replaced when non-empty and the
// boolean switches only when non-nil.
func (s *Service) UpdateFabricServiceChannelPoolPolicy(ctx context.Context, input UpdateFabricServiceChannelPoolPolicyInput) (FabricServiceChannelPoolPolicy, error) {
	var none FabricServiceChannelPoolPolicy
	if authErr := s.ensurePlatformAdmin(ctx, input.ActorUserID); authErr != nil {
		return none, authErr
	}
	clusterID := strings.TrimSpace(input.ClusterID)
	if clusterID == "" {
		return none, ErrInvalidCluster
	}
	if mutableErr := s.ensureClusterMutable(ctx, input.ActorUserID, clusterID); mutableErr != nil {
		return none, mutableErr
	}
	current, lookupErr := s.store.GetCluster(ctx, clusterID)
	switch {
	case errors.Is(lookupErr, pgx.ErrNoRows):
		return none, ErrInvalidCluster
	case lookupErr != nil:
		return none, lookupErr
	}

	next := fabricServiceChannelPoolPolicyFromCluster(current)
	next.EntryPoolNodeIDs = dedupeStrings(input.EntryPoolNodeIDs)
	next.ExitPoolNodeIDs = dedupeStrings(input.ExitPoolNodeIDs)
	next.PreferredEntryNodeID = strings.TrimSpace(input.PreferredEntryNodeID)
	next.PreferredExitNodeID = strings.TrimSpace(input.PreferredExitNodeID)

	// overrideMode replaces *target with the trimmed request unless the
	// request is the empty string.
	overrideMode := func(target *string, requested string) {
		if requested != "" {
			*target = strings.TrimSpace(requested)
		}
	}
	overrideMode(&next.SelectionStrategy, input.SelectionStrategy)
	overrideMode(&next.RouteRebuild, input.RouteRebuild)
	overrideMode(&next.EntryFailover, input.EntryFailover)
	overrideMode(&next.ExitFailover, input.ExitFailover)
	if allowed := input.BackendFallbackAllowed; allowed != nil {
		next.BackendFallbackAllowed = *allowed
	}
	if sticky := input.StickySession; sticky != nil {
		next.StickySession = *sticky
	}

	stamp := s.now().UTC()
	next.SchemaVersion = "rap.fabric_service_channel_pool_policy.v1"
	next.Source = "cluster_metadata"
	next.UpdatedByUserID = &input.ActorUserID
	next.UpdatedAt = stamp
	next.ControlPlaneOnly = true
	next.ProductionForwarding = false
	next = normalizeFabricServiceChannelPoolPolicy(next, defaultFabricServiceChannelPoolPolicy())

	merged, mergeErr := upsertFabricServiceChannelPoolPolicyMetadata(current.Metadata, next)
	if mergeErr != nil {
		return none, mergeErr
	}
	saved, saveErr := s.store.UpdateCluster(ctx, UpdateClusterInput{
		ActorUserID: input.ActorUserID,
		ClusterID:   current.ID,
		Name:        current.Name,
		Status:      current.Status,
		Region:      current.Region,
		Metadata:    merged,
	})
	if saveErr != nil {
		return none, saveErr
	}
	// Best-effort audit trail: the update is already stored, so the audit
	// error is intentionally discarded.
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &current.ID,
		ActorUserID: &input.ActorUserID,
		EventType:   "fabric.service_channel.pool_policy.updated",
		TargetType:  "cluster",
		TargetID:    &current.ID,
		Payload:     merged,
		CreatedAt:   stamp,
	})
	return fabricServiceChannelPoolPolicyFromCluster(saved), nil
}
// GetFabricServiceChannelBreadcrumbWindowPolicy returns the breadcrumb window
// policy derived from the metadata of the cluster identified by clusterID
// (whitespace around the ID is ignored). The actor must pass the platform
// admin check; a cluster the store cannot find yields ErrInvalidCluster.
func (s *Service) GetFabricServiceChannelBreadcrumbWindowPolicy(ctx context.Context, actorUserID, clusterID string) (FabricServiceChannelBreadcrumbWindowPolicy, error) {
	var none FabricServiceChannelBreadcrumbWindowPolicy
	if authErr := s.ensurePlatformAdmin(ctx, actorUserID); authErr != nil {
		return none, authErr
	}
	found, lookupErr := s.store.GetCluster(ctx, strings.TrimSpace(clusterID))
	switch {
	case errors.Is(lookupErr, pgx.ErrNoRows):
		return none, ErrInvalidCluster
	case lookupErr != nil:
		return none, lookupErr
	}
	return fabricServiceChannelBreadcrumbWindowPolicyFromCluster(found), nil
}
// UpdateFabricServiceChannelBreadcrumbWindowPolicy applies the window lengths
// in input to the cluster's breadcrumb window policy, stores the normalized
// result inside the cluster metadata and returns the policy as read back from
// the updated cluster.
//
// The actor must pass ensurePlatformAdmin and the cluster must pass
// ensureClusterMutable. A blank cluster ID or a cluster the store cannot find
// yields ErrInvalidCluster. Window values that are zero or negative leave the
// current value untouched.
func (s *Service) UpdateFabricServiceChannelBreadcrumbWindowPolicy(ctx context.Context, input UpdateFabricServiceChannelBreadcrumbWindowPolicyInput) (FabricServiceChannelBreadcrumbWindowPolicy, error) {
	var none FabricServiceChannelBreadcrumbWindowPolicy
	if authErr := s.ensurePlatformAdmin(ctx, input.ActorUserID); authErr != nil {
		return none, authErr
	}
	clusterID := strings.TrimSpace(input.ClusterID)
	if clusterID == "" {
		return none, ErrInvalidCluster
	}
	if mutableErr := s.ensureClusterMutable(ctx, input.ActorUserID, clusterID); mutableErr != nil {
		return none, mutableErr
	}
	current, lookupErr := s.store.GetCluster(ctx, clusterID)
	switch {
	case errors.Is(lookupErr, pgx.ErrNoRows):
		return none, ErrInvalidCluster
	case lookupErr != nil:
		return none, lookupErr
	}

	next := fabricServiceChannelBreadcrumbWindowPolicyFromCluster(current)
	if seconds := input.CurrentWindowSeconds; seconds > 0 {
		next.CurrentWindowSeconds = seconds
	}
	if seconds := input.HistoryWindowSeconds; seconds > 0 {
		next.HistoryWindowSeconds = seconds
	}

	stamp := s.now().UTC()
	next.SchemaVersion = "rap.fabric_service_channel_breadcrumb_window_policy.v1"
	next.Source = "cluster_metadata"
	next.UpdatedByUserID = &input.ActorUserID
	next.UpdatedAt = stamp
	next.ControlPlaneOnly = true
	next.ProductionForwarding = false
	next = normalizeFabricServiceChannelBreadcrumbWindowPolicy(next, defaultFabricServiceChannelBreadcrumbWindowPolicy())

	merged, mergeErr := upsertFabricServiceChannelBreadcrumbWindowPolicyMetadata(current.Metadata, next)
	if mergeErr != nil {
		return none, mergeErr
	}
	saved, saveErr := s.store.UpdateCluster(ctx, UpdateClusterInput{
		ActorUserID: input.ActorUserID,
		ClusterID:   current.ID,
		Name:        current.Name,
		Status:      current.Status,
		Region:      current.Region,
		Metadata:    merged,
	})
	if saveErr != nil {
		return none, saveErr
	}
	// Best-effort audit trail: the update is already stored, so the audit
	// error is intentionally discarded.
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &current.ID,
		ActorUserID: &input.ActorUserID,
		EventType:   "fabric.service_channel.breadcrumb_window_policy.updated",
		TargetType:  "cluster",
		TargetID:    &current.ID,
		Payload:     merged,
		CreatedAt:   stamp,
	})
	return fabricServiceChannelBreadcrumbWindowPolicyFromCluster(saved), nil
}
// CreateCluster validates input, creates the cluster in the store, attempts to
// provision its cluster authority and records a "cluster.created" audit event.
//
// The actor must pass the platform admin check. Slug and name are trimmed and
// must be non-empty (ErrInvalidPayload otherwise); metadata is passed through
// defaultJSON with `{}` as the default and must be valid JSON. A failure to
// obtain the cluster authority does not fail the call: the audit event is then
// recorded with an empty payload.
func (s *Service) CreateCluster(ctx context.Context, input CreateClusterInput) (Cluster, error) {
	if authErr := s.ensurePlatformAdmin(ctx, input.ActorUserID); authErr != nil {
		return Cluster{}, authErr
	}

	input.Slug, input.Name = strings.TrimSpace(input.Slug), strings.TrimSpace(input.Name)
	if len(input.Slug) == 0 || len(input.Name) == 0 {
		return Cluster{}, ErrInvalidPayload
	}
	input.Metadata = defaultJSON(input.Metadata, `{}`)
	if !json.Valid(input.Metadata) {
		return Cluster{}, errors.New("metadata must be valid json")
	}

	created, createErr := s.store.CreateCluster(ctx, input)
	if createErr != nil {
		return Cluster{}, createErr
	}

	auditPayload := json.RawMessage(`{}`)
	authorityKey, authorityErr := s.ensureClusterAuthority(ctx, created.ID, &input.ActorUserID)
	if authorityErr == nil {
		summary := map[string]any{
			"cluster_authority": map[string]any{
				"key_algorithm":          authorityKey.KeyAlgorithm,
				"public_key_fingerprint": authorityKey.PublicKeyFingerprint,
			},
		}
		// The marshal error is discarded, as the surrounding flow treats the
		// audit payload as best-effort.
		auditPayload, _ = json.Marshal(summary)
	}

	// Audit recording is best-effort as well; its error is intentionally dropped.
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &created.ID,
		ActorUserID: &input.ActorUserID,
		EventType:   "cluster.created",
		TargetType:  "cluster",
		TargetID:    &created.ID,
		Payload:     auditPayload,
		CreatedAt:   s.now(),
	})
	return created, nil
}
// ensureClusterAuthority returns the authority key stored for clusterID. Only
// when the store reports pgx.ErrNoRows does it ask the store to create one via
// EnsureClusterAuthority; every other lookup outcome is returned as-is.
func (s *Service) ensureClusterAuthority(ctx context.Context, clusterID string, actorUserID *string) (ClusterAuthorityKey, error) {
	existing, lookupErr := s.store.GetClusterAuthority(ctx, clusterID)
	if !errors.Is(lookupErr, pgx.ErrNoRows) {
		return existing, lookupErr
	}
	return s.store.EnsureClusterAuthority(ctx, clusterID, actorUserID)
}
// authorityDescriptor returns a pointer to a copy of the descriptor carried by
// authorityKey. An empty schema version on the copy is replaced with
// clusterauth.AuthoritySchemaVersion; authorityKey itself is not modified.
func authorityDescriptor(authorityKey ClusterAuthorityKey) *ClusterAuthorityDescriptor {
	out := new(ClusterAuthorityDescriptor)
	*out = authorityKey.ClusterAuthorityDescriptor
	if out.SchemaVersion == "" {
		out.SchemaVersion = clusterauth.AuthoritySchemaVersion
	}
	return out
}
// defaultFabricServiceChannelRecoveryPolicy returns the built-in recovery
// policy: package-level hysteresis and promotion constants, every demotion
// threshold set to 1, both demotion switches enabled and Source "defaults".
// The fingerprint is left empty here.
func defaultFabricServiceChannelRecoveryPolicy() FabricServiceChannelRecoveryPolicy {
	var defaults FabricServiceChannelRecoveryPolicy
	defaults.SchemaVersion = "rap.fabric_service_channel_recovery_policy.v1"
	defaults.Source = "defaults"

	defaults.HysteresisPenalty = fabricServiceChannelRecoveryHysteresisPenalty
	defaults.PromotionMinSamples = fabricServiceChannelRecoveryPromotionMinSamples

	defaults.DemotionFailureThreshold = 1
	defaults.DemotionDropThreshold = 1
	defaults.DemotionSlowThreshold = 1
	defaults.DemotionRebuildEnabled = true
	defaults.DemotionFencedEnabled = true

	defaults.ControlPlaneOnly = true
	defaults.ProductionForwarding = false
	return defaults
}
// fabricServiceChannelRecoveryPolicyFromCluster extracts the recovery policy
// stored under "fabric_service_channel_recovery_policy" in the cluster
// metadata. Missing, invalid or undecodable metadata, or an absent policy key,
// yields the built-in defaults. A stored policy is normalized against those
// defaults and reported with Source "cluster_metadata".
func fabricServiceChannelRecoveryPolicyFromCluster(cluster Cluster) FabricServiceChannelRecoveryPolicy {
	defaults := defaultFabricServiceChannelRecoveryPolicy()
	meta := cluster.Metadata
	if len(meta) == 0 {
		return defaults
	}
	if !json.Valid(meta) {
		return defaults
	}
	var envelope struct {
		Policy *FabricServiceChannelRecoveryPolicy `json:"fabric_service_channel_recovery_policy"`
	}
	if json.Unmarshal(meta, &envelope) != nil {
		return defaults
	}
	if envelope.Policy == nil {
		return defaults
	}
	resolved := normalizeFabricServiceChannelRecoveryPolicy(*envelope.Policy, defaults)
	resolved.Source = "cluster_metadata"
	return resolved
}
// normalizeFabricServiceChannelRecoveryPolicy returns input with gaps filled
// from fallback: an empty schema version gets the v1 identifier, every numeric
// field that is zero or negative takes the fallback's value, and an empty
// Source takes the fallback's Source. ControlPlaneOnly is forced to true,
// ProductionForwarding to false, and the fingerprint is recomputed last.
func normalizeFabricServiceChannelRecoveryPolicy(input FabricServiceChannelRecoveryPolicy, fallback FabricServiceChannelRecoveryPolicy) FabricServiceChannelRecoveryPolicy {
	if input.SchemaVersion == "" {
		input.SchemaVersion = "rap.fabric_service_channel_recovery_policy.v1"
	}

	// Each numeric setting falls back when it is not strictly positive.
	numeric := []struct {
		field    *int
		fallback int
	}{
		{&input.HysteresisPenalty, fallback.HysteresisPenalty},
		{&input.PromotionMinSamples, fallback.PromotionMinSamples},
		{&input.DemotionFailureThreshold, fallback.DemotionFailureThreshold},
		{&input.DemotionDropThreshold, fallback.DemotionDropThreshold},
		{&input.DemotionSlowThreshold, fallback.DemotionSlowThreshold},
	}
	for _, setting := range numeric {
		if *setting.field <= 0 {
			*setting.field = setting.fallback
		}
	}

	if input.Source == "" {
		input.Source = fallback.Source
	}
	input.ControlPlaneOnly, input.ProductionForwarding = true, false
	input.Fingerprint = fabricServiceChannelRecoveryPolicyFingerprint(input)
	return input
}
// upsertFabricServiceChannelRecoveryPolicyMetadata returns a copy of metadata
// in which the "fabric_service_channel_recovery_policy" key holds policy.
//
// Empty or syntactically invalid metadata is treated as an empty object, and so
// is the JSON literal null. Valid JSON that is not an object (an array, string,
// number, ...) is rejected with the decoding error. All other top-level keys
// are preserved.
func upsertFabricServiceChannelRecoveryPolicyMetadata(metadata json.RawMessage, policy FabricServiceChannelRecoveryPolicy) (json.RawMessage, error) {
	raw := map[string]any{}
	if len(metadata) > 0 && json.Valid(metadata) {
		if err := json.Unmarshal(metadata, &raw); err != nil {
			return nil, err
		}
	}
	// Decoding JSON null into a map sets the map to nil; writing to a nil map
	// panics, so start over with an empty object in that case.
	if raw == nil {
		raw = map[string]any{}
	}
	raw["fabric_service_channel_recovery_policy"] = policy
	out, err := json.Marshal(raw)
	if err != nil {
		return nil, err
	}
	return json.RawMessage(out), nil
}
// fabricServiceChannelRecoveryPolicyRef normalizes policy against the built-in
// defaults and returns a pointer to the normalized copy.
func fabricServiceChannelRecoveryPolicyRef(policy FabricServiceChannelRecoveryPolicy) *FabricServiceChannelRecoveryPolicy {
	ref := new(FabricServiceChannelRecoveryPolicy)
	*ref = normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy())
	return ref
}
// fabricServiceChannelRecoveryPolicyFingerprint returns the hex-encoded
// SHA-256 digest of a JSON document built from the policy's schema version,
// numeric settings and demotion switches, with control_plane_only fixed to
// true and production_forwarding fixed to false. The existing fingerprint and
// the update bookkeeping fields are not part of the digest. An empty string is
// returned if the document cannot be encoded.
func fabricServiceChannelRecoveryPolicyFingerprint(policy FabricServiceChannelRecoveryPolicy) string {
	type fingerprintView struct {
		SchemaVersion            string `json:"schema_version"`
		HysteresisPenalty        int    `json:"hysteresis_penalty"`
		PromotionMinSamples      int    `json:"promotion_min_samples"`
		DemotionFailureThreshold int    `json:"demotion_failure_threshold"`
		DemotionDropThreshold    int    `json:"demotion_drop_threshold"`
		DemotionSlowThreshold    int    `json:"demotion_slow_threshold"`
		DemotionRebuildEnabled   bool   `json:"demotion_rebuild_enabled"`
		DemotionFencedEnabled    bool   `json:"demotion_fenced_enabled"`
		ControlPlaneOnly         bool   `json:"control_plane_only"`
		ProductionForwarding     bool   `json:"production_forwarding"`
	}
	view := fingerprintView{
		SchemaVersion:            policy.SchemaVersion,
		HysteresisPenalty:        policy.HysteresisPenalty,
		PromotionMinSamples:      policy.PromotionMinSamples,
		DemotionFailureThreshold: policy.DemotionFailureThreshold,
		DemotionDropThreshold:    policy.DemotionDropThreshold,
		DemotionSlowThreshold:    policy.DemotionSlowThreshold,
		DemotionRebuildEnabled:   policy.DemotionRebuildEnabled,
		DemotionFencedEnabled:    policy.DemotionFencedEnabled,
		ControlPlaneOnly:         true,
		ProductionForwarding:     false,
	}
	encoded, encodeErr := json.Marshal(view)
	if encodeErr != nil {
		return ""
	}
	digest := sha256.Sum256(encoded)
	return hex.EncodeToString(digest[:])
}
// defaultFabricServiceChannelAdaptivePolicy returns the built-in adaptive
// policy (parallel window 4, pressure thresholds 16, per-class windows) after
// running it through the normalizer with an empty fallback.
func defaultFabricServiceChannelAdaptivePolicy() FabricServiceChannelAdaptivePolicy {
	var seed FabricServiceChannelAdaptivePolicy
	seed.SchemaVersion = "rap.fabric_service_channel_adaptive_policy.v1"
	seed.Source = "defaults"

	seed.MaxParallelWindow = 4
	seed.BulkPressureChannelThreshold = 16
	seed.QueuePressureHighWatermark = 16
	seed.QueuePressureMaxInFlight = 16
	seed.ClassWindows = map[string]int{
		"control":     4,
		"interactive": 4,
		"reliable":    3,
		"bulk":        1,
		"droppable":   1,
	}

	seed.ControlPlaneOnly = true
	seed.ProductionForwarding = false
	return normalizeFabricServiceChannelAdaptivePolicy(seed, FabricServiceChannelAdaptivePolicy{})
}
// fabricServiceChannelAdaptivePolicyFromCluster extracts the adaptive policy
// stored under "fabric_service_channel_adaptive_policy" in the cluster
// metadata. Missing, invalid or undecodable metadata, or an absent policy key,
// yields the built-in defaults. A stored policy is normalized against those
// defaults and reported with Source "cluster_metadata".
func fabricServiceChannelAdaptivePolicyFromCluster(cluster Cluster) FabricServiceChannelAdaptivePolicy {
	defaults := defaultFabricServiceChannelAdaptivePolicy()
	meta := cluster.Metadata
	if len(meta) == 0 {
		return defaults
	}
	if !json.Valid(meta) {
		return defaults
	}
	var envelope struct {
		Policy *FabricServiceChannelAdaptivePolicy `json:"fabric_service_channel_adaptive_policy"`
	}
	if json.Unmarshal(meta, &envelope) != nil {
		return defaults
	}
	if envelope.Policy == nil {
		return defaults
	}
	resolved := normalizeFabricServiceChannelAdaptivePolicy(*envelope.Policy, defaults)
	resolved.Source = "cluster_metadata"
	return resolved
}
// normalizeFabricServiceChannelAdaptivePolicy returns input with gaps filled
// from fallback. A non-positive parallel window takes the fallback's window
// (4 when that is non-positive too) and is clamped to 1..64. Non-positive
// pressure thresholds take firstPositive(fallback value, 16). Class windows
// are re-normalized against the resulting parallel window. An empty Source
// takes the fallback's Source, or "defaults" when that is empty as well.
// ControlPlaneOnly is forced to true, ProductionForwarding to false, and the
// fingerprint is recomputed last.
func normalizeFabricServiceChannelAdaptivePolicy(input FabricServiceChannelAdaptivePolicy, fallback FabricServiceChannelAdaptivePolicy) FabricServiceChannelAdaptivePolicy {
	if input.SchemaVersion == "" {
		input.SchemaVersion = "rap.fabric_service_channel_adaptive_policy.v1"
	}

	window := input.MaxParallelWindow
	if window <= 0 {
		window = fallback.MaxParallelWindow
		if window <= 0 {
			window = 4
		}
	}
	input.MaxParallelWindow = clampInt(window, 1, 64)

	if input.BulkPressureChannelThreshold <= 0 {
		input.BulkPressureChannelThreshold = firstPositive(fallback.BulkPressureChannelThreshold, 16)
	}
	if input.QueuePressureHighWatermark <= 0 {
		input.QueuePressureHighWatermark = firstPositive(fallback.QueuePressureHighWatermark, 16)
	}
	if input.QueuePressureMaxInFlight <= 0 {
		input.QueuePressureMaxInFlight = firstPositive(fallback.QueuePressureMaxInFlight, 16)
	}

	classSource := firstNonNilStringIntMap(input.ClassWindows, fallback.ClassWindows)
	input.ClassWindows = normalizeFabricServiceChannelAdaptiveClassWindows(classSource, input.MaxParallelWindow)

	switch {
	case input.Source != "":
		// Keep the caller's source label.
	case fallback.Source != "":
		input.Source = fallback.Source
	default:
		input.Source = "defaults"
	}

	input.ControlPlaneOnly, input.ProductionForwarding = true, false
	input.Fingerprint = fabricServiceChannelAdaptivePolicyFingerprint(input)
	return input
}
// normalizeFabricServiceChannelAdaptiveClassWindows returns a fresh map with
// exactly the five known traffic classes ("control", "interactive",
// "reliable", "bulk", "droppable"). A class whose entry in values is missing
// or non-positive gets its default; every result is clamped to 1..maxWindow.
// A non-positive maxWindow is treated as 4. Unknown keys in values are dropped.
func normalizeFabricServiceChannelAdaptiveClassWindows(values map[string]int, maxWindow int) map[string]int {
	if maxWindow <= 0 {
		maxWindow = 4
	}
	type classDefault struct {
		name   string
		window int
	}
	table := []classDefault{
		{"control", maxWindow},
		{"interactive", maxWindow},
		{"reliable", boundedMinInt(maxWindow, 3)},
		{"bulk", 1},
		{"droppable", 1},
	}
	normalized := make(map[string]int, len(table))
	for _, entry := range table {
		window := entry.window
		if configured := values[entry.name]; configured > 0 {
			window = configured
		}
		normalized[entry.name] = clampInt(window, 1, maxWindow)
	}
	return normalized
}
// upsertFabricServiceChannelAdaptivePolicyMetadata returns a copy of metadata
// in which the "fabric_service_channel_adaptive_policy" key holds policy.
//
// Empty or syntactically invalid metadata is treated as an empty object, and so
// is the JSON literal null. Valid JSON that is not an object (an array, string,
// number, ...) is rejected with the decoding error. All other top-level keys
// are preserved.
func upsertFabricServiceChannelAdaptivePolicyMetadata(metadata json.RawMessage, policy FabricServiceChannelAdaptivePolicy) (json.RawMessage, error) {
	raw := map[string]any{}
	if len(metadata) > 0 && json.Valid(metadata) {
		if err := json.Unmarshal(metadata, &raw); err != nil {
			return nil, err
		}
	}
	// Decoding JSON null into a map sets the map to nil; writing to a nil map
	// panics, so start over with an empty object in that case.
	if raw == nil {
		raw = map[string]any{}
	}
	raw["fabric_service_channel_adaptive_policy"] = policy
	out, err := json.Marshal(raw)
	if err != nil {
		return nil, err
	}
	return json.RawMessage(out), nil
}
// fabricServiceChannelAdaptivePolicyFingerprint returns the hex-encoded
// SHA-256 digest of a JSON document built from the policy's schema version,
// window and pressure settings and class windows, with control_plane_only
// fixed to true and production_forwarding fixed to false. encoding/json emits
// map keys in sorted order, so the class window map encodes deterministically.
// An empty string is returned if the document cannot be encoded.
func fabricServiceChannelAdaptivePolicyFingerprint(policy FabricServiceChannelAdaptivePolicy) string {
	type fingerprintView struct {
		SchemaVersion                string         `json:"schema_version"`
		MaxParallelWindow            int            `json:"max_parallel_window"`
		BulkPressureChannelThreshold int            `json:"bulk_pressure_channel_threshold"`
		QueuePressureHighWatermark   int            `json:"queue_pressure_high_watermark"`
		QueuePressureMaxInFlight     int            `json:"queue_pressure_max_in_flight"`
		ClassWindows                 map[string]int `json:"class_windows"`
		ControlPlaneOnly             bool           `json:"control_plane_only"`
		ProductionForwarding         bool           `json:"production_forwarding"`
	}
	view := fingerprintView{
		SchemaVersion:                policy.SchemaVersion,
		MaxParallelWindow:            policy.MaxParallelWindow,
		BulkPressureChannelThreshold: policy.BulkPressureChannelThreshold,
		QueuePressureHighWatermark:   policy.QueuePressureHighWatermark,
		QueuePressureMaxInFlight:     policy.QueuePressureMaxInFlight,
		ClassWindows:                 policy.ClassWindows,
		ControlPlaneOnly:             true,
		ProductionForwarding:         false,
	}
	encoded, encodeErr := json.Marshal(view)
	if encodeErr != nil {
		return ""
	}
	digest := sha256.Sum256(encoded)
	return hex.EncodeToString(digest[:])
}
// defaultFabricServiceChannelPoolPolicy returns the built-in pool policy
// ("fastest_healthy" selection, automatic rebuild and failover, backend
// fallback and sticky sessions enabled) after running it through the
// normalizer with an empty fallback.
func defaultFabricServiceChannelPoolPolicy() FabricServiceChannelPoolPolicy {
	var seed FabricServiceChannelPoolPolicy
	seed.SchemaVersion = "rap.fabric_service_channel_pool_policy.v1"
	seed.Source = "defaults"

	seed.SelectionStrategy = "fastest_healthy"
	seed.RouteRebuild = "automatic"
	seed.EntryFailover = "automatic"
	seed.ExitFailover = "automatic"
	seed.BackendFallbackAllowed = true
	seed.StickySession = true

	seed.ControlPlaneOnly = true
	seed.ProductionForwarding = false
	return normalizeFabricServiceChannelPoolPolicy(seed, FabricServiceChannelPoolPolicy{})
}
// fabricServiceChannelPoolPolicyFromCluster extracts the pool policy stored
// under "fabric_service_channel_pool_policy" in the cluster metadata. Missing,
// invalid or undecodable metadata, or an absent policy key, yields the
// built-in defaults. A stored policy is normalized against those defaults and
// reported with Source "cluster_metadata".
func fabricServiceChannelPoolPolicyFromCluster(cluster Cluster) FabricServiceChannelPoolPolicy {
	defaults := defaultFabricServiceChannelPoolPolicy()
	meta := cluster.Metadata
	if len(meta) == 0 {
		return defaults
	}
	if !json.Valid(meta) {
		return defaults
	}
	var envelope struct {
		Policy *FabricServiceChannelPoolPolicy `json:"fabric_service_channel_pool_policy"`
	}
	if json.Unmarshal(meta, &envelope) != nil {
		return defaults
	}
	if envelope.Policy == nil {
		return defaults
	}
	resolved := normalizeFabricServiceChannelPoolPolicy(*envelope.Policy, defaults)
	resolved.Source = "cluster_metadata"
	return resolved
}
// normalizeFabricServiceChannelPoolPolicy returns input with gaps filled from
// fallback. Empty pool lists and preferred node IDs take the fallback's
// values; node lists are de-duplicated and preferred IDs trimmed. Each mode
// string is resolved through normalizeFabricServiceChannelPoolPolicyMode
// against its allowed set ("fastest_healthy" / "automatic" when unrecognized).
// ControlPlaneOnly is forced to true, ProductionForwarding to false, and the
// fingerprint is recomputed last.
func normalizeFabricServiceChannelPoolPolicy(input FabricServiceChannelPoolPolicy, fallback FabricServiceChannelPoolPolicy) FabricServiceChannelPoolPolicy {
	if input.SchemaVersion == "" {
		input.SchemaVersion = firstNonEmptyString(fallback.SchemaVersion, "rap.fabric_service_channel_pool_policy.v1")
	}

	entryPool := firstNonEmptyStringSlice(input.EntryPoolNodeIDs, fallback.EntryPoolNodeIDs)
	input.EntryPoolNodeIDs = dedupeStrings(entryPool)
	exitPool := firstNonEmptyStringSlice(input.ExitPoolNodeIDs, fallback.ExitPoolNodeIDs)
	input.ExitPoolNodeIDs = dedupeStrings(exitPool)

	preferredEntry := firstNonEmptyString(input.PreferredEntryNodeID, fallback.PreferredEntryNodeID)
	input.PreferredEntryNodeID = strings.TrimSpace(preferredEntry)
	preferredExit := firstNonEmptyString(input.PreferredExitNodeID, fallback.PreferredExitNodeID)
	input.PreferredExitNodeID = strings.TrimSpace(preferredExit)

	strategy := firstNonEmptyString(input.SelectionStrategy, fallback.SelectionStrategy)
	input.SelectionStrategy = normalizeFabricServiceChannelPoolPolicyMode(strategy, []string{"fastest_healthy", "preferred_first", "stable_first"}, "fastest_healthy")
	rebuild := firstNonEmptyString(input.RouteRebuild, fallback.RouteRebuild)
	input.RouteRebuild = normalizeFabricServiceChannelPoolPolicyMode(rebuild, []string{"automatic", "manual", "disabled"}, "automatic")
	entryFailover := firstNonEmptyString(input.EntryFailover, fallback.EntryFailover)
	input.EntryFailover = normalizeFabricServiceChannelPoolPolicyMode(entryFailover, []string{"automatic", "manual", "disabled"}, "automatic")
	exitFailover := firstNonEmptyString(input.ExitFailover, fallback.ExitFailover)
	input.ExitFailover = normalizeFabricServiceChannelPoolPolicyMode(exitFailover, []string{"automatic", "manual", "disabled"}, "automatic")

	if input.Source == "" {
		input.Source = firstNonEmptyString(fallback.Source, "defaults")
	}
	input.ControlPlaneOnly, input.ProductionForwarding = true, false
	input.Fingerprint = fabricServiceChannelPoolPolicyFingerprint(input)
	return input
}
// normalizeFabricServiceChannelPoolPolicyMode lower-cases and trims value and
// returns it when it matches one of allowed exactly; otherwise it returns
// fallback.
func normalizeFabricServiceChannelPoolPolicyMode(value string, allowed []string, fallback string) string {
	candidate := strings.TrimSpace(strings.ToLower(value))
	for i := range allowed {
		if allowed[i] == candidate {
			return candidate
		}
	}
	return fallback
}
// upsertFabricServiceChannelPoolPolicyMetadata returns a copy of metadata in
// which the "fabric_service_channel_pool_policy" key holds policy.
//
// Empty or syntactically invalid metadata is treated as an empty object, and so
// is the JSON literal null. Valid JSON that is not an object (an array, string,
// number, ...) is rejected with the decoding error. All other top-level keys
// are preserved.
func upsertFabricServiceChannelPoolPolicyMetadata(metadata json.RawMessage, policy FabricServiceChannelPoolPolicy) (json.RawMessage, error) {
	raw := map[string]any{}
	if len(metadata) > 0 && json.Valid(metadata) {
		if err := json.Unmarshal(metadata, &raw); err != nil {
			return nil, err
		}
	}
	// Decoding JSON null into a map sets the map to nil; writing to a nil map
	// panics, so start over with an empty object in that case.
	if raw == nil {
		raw = map[string]any{}
	}
	raw["fabric_service_channel_pool_policy"] = policy
	out, err := json.Marshal(raw)
	if err != nil {
		return nil, err
	}
	return json.RawMessage(out), nil
}
// fabricServiceChannelPoolPolicyRef normalizes policy against the built-in
// defaults and returns a pointer to the normalized copy.
func fabricServiceChannelPoolPolicyRef(policy FabricServiceChannelPoolPolicy) *FabricServiceChannelPoolPolicy {
	ref := new(FabricServiceChannelPoolPolicy)
	*ref = normalizeFabricServiceChannelPoolPolicy(policy, defaultFabricServiceChannelPoolPolicy())
	return ref
}
// fabricServiceChannelPoolPolicyFingerprint returns the hex-encoded SHA-256
// digest of a JSON document built from the policy's schema version, pool and
// preferred node settings, mode strings and boolean switches, with
// control_plane_only fixed to true and production_forwarding fixed to false.
// Empty node lists and preferred IDs are omitted from the document. An empty
// string is returned if the document cannot be encoded.
func fabricServiceChannelPoolPolicyFingerprint(policy FabricServiceChannelPoolPolicy) string {
	type fingerprintView struct {
		SchemaVersion          string   `json:"schema_version"`
		EntryPoolNodeIDs       []string `json:"entry_pool_node_ids,omitempty"`
		ExitPoolNodeIDs        []string `json:"exit_pool_node_ids,omitempty"`
		PreferredEntryNodeID   string   `json:"preferred_entry_node_id,omitempty"`
		PreferredExitNodeID    string   `json:"preferred_exit_node_id,omitempty"`
		SelectionStrategy      string   `json:"selection_strategy"`
		RouteRebuild           string   `json:"route_rebuild"`
		EntryFailover          string   `json:"entry_failover"`
		ExitFailover           string   `json:"exit_failover"`
		BackendFallbackAllowed bool     `json:"backend_fallback_allowed"`
		StickySession          bool     `json:"sticky_session"`
		ControlPlaneOnly       bool     `json:"control_plane_only"`
		ProductionForwarding   bool     `json:"production_forwarding"`
	}
	view := fingerprintView{
		SchemaVersion:          policy.SchemaVersion,
		EntryPoolNodeIDs:       policy.EntryPoolNodeIDs,
		ExitPoolNodeIDs:        policy.ExitPoolNodeIDs,
		PreferredEntryNodeID:   policy.PreferredEntryNodeID,
		PreferredExitNodeID:    policy.PreferredExitNodeID,
		SelectionStrategy:      policy.SelectionStrategy,
		RouteRebuild:           policy.RouteRebuild,
		EntryFailover:          policy.EntryFailover,
		ExitFailover:           policy.ExitFailover,
		BackendFallbackAllowed: policy.BackendFallbackAllowed,
		StickySession:          policy.StickySession,
		ControlPlaneOnly:       true,
		ProductionForwarding:   false,
	}
	encoded, encodeErr := json.Marshal(view)
	if encodeErr != nil {
		return ""
	}
	digest := sha256.Sum256(encoded)
	return hex.EncodeToString(digest[:])
}
// defaultFabricServiceChannelBreadcrumbWindowPolicy builds the built-in
// breadcrumb window policy (30 minute current window, 24 hour history window,
// source "defaults") and passes it through the normalizer with an empty
// fallback.
func defaultFabricServiceChannelBreadcrumbWindowPolicy() FabricServiceChannelBreadcrumbWindowPolicy {
	currentWindow := 30 * time.Minute
	historyWindow := 24 * time.Hour
	seed := FabricServiceChannelBreadcrumbWindowPolicy{
		SchemaVersion:        "rap.fabric_service_channel_breadcrumb_window_policy.v1",
		CurrentWindowSeconds: int64(currentWindow.Seconds()),
		HistoryWindowSeconds: int64(historyWindow.Seconds()),
		Source:               "defaults",
		ControlPlaneOnly:     true,
		ProductionForwarding: false,
	}
	var noFallback FabricServiceChannelBreadcrumbWindowPolicy
	return normalizeFabricServiceChannelBreadcrumbWindowPolicy(seed, noFallback)
}
// fabricServiceChannelBreadcrumbWindowPolicyFromCluster reads the breadcrumb
// window policy stored under the
// "fabric_service_channel_breadcrumb_window_policy" key of the cluster
// metadata. The default policy is returned when the metadata is empty, is not
// valid JSON, cannot be decoded, or does not carry the key. A policy that is
// found is normalized against the defaults and tagged with source
// "cluster_metadata".
func fabricServiceChannelBreadcrumbWindowPolicyFromCluster(cluster Cluster) FabricServiceChannelBreadcrumbWindowPolicy {
	defaults := defaultFabricServiceChannelBreadcrumbWindowPolicy()
	if len(cluster.Metadata) == 0 || !json.Valid(cluster.Metadata) {
		return defaults
	}

	var envelope struct {
		Policy *FabricServiceChannelBreadcrumbWindowPolicy `json:"fabric_service_channel_breadcrumb_window_policy"`
	}
	err := json.Unmarshal(cluster.Metadata, &envelope)
	switch {
	case err != nil, envelope.Policy == nil:
		return defaults
	}

	resolved := normalizeFabricServiceChannelBreadcrumbWindowPolicy(*envelope.Policy, defaults)
	resolved.Source = "cluster_metadata"
	return resolved
}
// normalizeFabricServiceChannelBreadcrumbWindowPolicy fills unset fields of
// input from fallback (then from built-in values), bounds the windows, forces
// the control-plane flags, and recomputes the fingerprint.
//
// The current window is clamped to [60 seconds, 7 days]. The history window is
// clamped to [current window, 30 days].
func normalizeFabricServiceChannelBreadcrumbWindowPolicy(input FabricServiceChannelBreadcrumbWindowPolicy, fallback FabricServiceChannelBreadcrumbWindowPolicy) FabricServiceChannelBreadcrumbWindowPolicy {
	var (
		builtinCurrent = int64((30 * time.Minute).Seconds())
		builtinHistory = int64((24 * time.Hour).Seconds())
		currentCeiling = int64((7 * 24 * time.Hour).Seconds())
		historyCeiling = int64((30 * 24 * time.Hour).Seconds())
	)

	if input.SchemaVersion == "" {
		input.SchemaVersion = firstNonEmptyString(fallback.SchemaVersion, "rap.fabric_service_channel_breadcrumb_window_policy.v1")
	}
	if input.CurrentWindowSeconds <= 0 {
		input.CurrentWindowSeconds = firstPositiveInt64(fallback.CurrentWindowSeconds, builtinCurrent)
	}
	if input.HistoryWindowSeconds <= 0 {
		input.HistoryWindowSeconds = firstPositiveInt64(fallback.HistoryWindowSeconds, builtinHistory)
	}

	// The history window is bounded below by the already-clamped current
	// window, so the current window must be clamped first.
	input.CurrentWindowSeconds = clampInt64(input.CurrentWindowSeconds, 60, currentCeiling)
	input.HistoryWindowSeconds = clampInt64(input.HistoryWindowSeconds, input.CurrentWindowSeconds, historyCeiling)

	if input.Source == "" {
		input.Source = firstNonEmptyString(fallback.Source, "defaults")
	}
	input.ControlPlaneOnly = true
	input.ProductionForwarding = false
	input.Fingerprint = fabricServiceChannelBreadcrumbWindowPolicyFingerprint(input)
	return input
}
// upsertFabricServiceChannelBreadcrumbWindowPolicyMetadata returns the
// metadata document with its "fabric_service_channel_breadcrumb_window_policy"
// key set to policy. All other top-level keys found in metadata are carried
// over.
//
// Empty or syntactically invalid metadata is treated as an empty object, and
// so is a literal JSON null. Valid JSON that is not an object (for example an
// array) is rejected with the decoder's error.
func upsertFabricServiceChannelBreadcrumbWindowPolicyMetadata(metadata json.RawMessage, policy FabricServiceChannelBreadcrumbWindowPolicy) (json.RawMessage, error) {
	raw := map[string]any{}
	if len(metadata) > 0 && json.Valid(metadata) {
		if err := json.Unmarshal(metadata, &raw); err != nil {
			return nil, err
		}
	}
	// Decoding a JSON null into a map resets the map to nil, and assigning to
	// a nil map panics; restore an empty map before inserting the policy.
	if raw == nil {
		raw = map[string]any{}
	}
	raw["fabric_service_channel_breadcrumb_window_policy"] = policy
	out, err := json.Marshal(raw)
	if err != nil {
		return nil, err
	}
	return json.RawMessage(out), nil
}
// fabricServiceChannelBreadcrumbWindowPolicyFingerprint returns the
// hex-encoded SHA-256 digest of a JSON projection of policy containing the
// schema version and both window lengths. The projection always encodes
// control_plane_only as true and production_forwarding as false. An empty
// string is returned if the projection cannot be marshaled.
func fabricServiceChannelBreadcrumbWindowPolicyFingerprint(policy FabricServiceChannelBreadcrumbWindowPolicy) string {
	// The field order and JSON tags below determine the hashed bytes; changing
	// either changes every fingerprint.
	type fingerprintProjection struct {
		SchemaVersion        string `json:"schema_version"`
		CurrentWindowSeconds int64  `json:"current_window_seconds"`
		HistoryWindowSeconds int64  `json:"history_window_seconds"`
		ControlPlaneOnly     bool   `json:"control_plane_only"`
		ProductionForwarding bool   `json:"production_forwarding"`
	}
	projection := fingerprintProjection{
		SchemaVersion:        policy.SchemaVersion,
		CurrentWindowSeconds: policy.CurrentWindowSeconds,
		HistoryWindowSeconds: policy.HistoryWindowSeconds,
		ControlPlaneOnly:     true,
		ProductionForwarding: false,
	}
	encoded, err := json.Marshal(projection)
	if err != nil {
		return ""
	}
	digest := sha256.Sum256(encoded)
	return hex.EncodeToString(digest[:])
}
// firstNonEmptyStringSlice returns the first argument that has at least one
// element, or nil when every argument is empty.
func firstNonEmptyStringSlice(values ...[]string) []string {
	for i := range values {
		if candidate := values[i]; len(candidate) != 0 {
			return candidate
		}
	}
	return nil
}
// firstPositive returns the first argument greater than zero, or 0 when there
// is none.
func firstPositive(values ...int) int {
	for i := 0; i < len(values); i++ {
		if values[i] >= 1 {
			return values[i]
		}
	}
	return 0
}
// firstPositiveInt64 returns the first argument greater than zero, or 0 when
// there is none.
func firstPositiveInt64(values ...int64) int64 {
	for i := 0; i < len(values); i++ {
		if values[i] >= 1 {
			return values[i]
		}
	}
	return 0
}
// firstNonNilStringIntMap returns the first argument that holds at least one
// entry, or nil when there is none. Despite the name, a non-nil but empty map
// is skipped as well.
func firstNonNilStringIntMap(values ...map[string]int) map[string]int {
	for i := range values {
		if candidate := values[i]; len(candidate) != 0 {
			return candidate
		}
	}
	return nil
}
// boundedMinInt returns the smaller of a and b.
func boundedMinInt(a, b int) int {
	smaller := b
	if a < b {
		smaller = a
	}
	return smaller
}
// clampInt limits value to the range [minValue, maxValue]. The lower bound is
// checked first, so it wins if the bounds are passed inverted.
func clampInt(value, minValue, maxValue int) int {
	switch {
	case value < minValue:
		return minValue
	case value > maxValue:
		return maxValue
	default:
		return value
	}
}
// clampInt64 limits value to the range [minValue, maxValue]. The lower bound
// is checked first, so it wins if the bounds are passed inverted.
func clampInt64(value, minValue, maxValue int64) int64 {
	switch {
	case value < minValue:
		return minValue
	case value > maxValue:
		return maxValue
	default:
		return value
	}
}
// UpdateCluster validates input and persists the cluster update through the
// store.
//
// The actor must pass the platform-admin check and the cluster must pass the
// mutability check. The name is trimmed and must be non-empty; an empty status
// defaults to ClusterStatusActive and any status other than active or disabled
// is rejected with ErrInvalidPayload. Metadata defaults to `{}` and must be
// valid JSON. A pgx.ErrNoRows from the store is reported as ErrInvalidCluster.
// After a successful update a "cluster.updated" audit event is recorded.
func (s *Service) UpdateCluster(ctx context.Context, input UpdateClusterInput) (Cluster, error) {
	var none Cluster
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}
	if input.ClusterID == "" {
		return none, ErrInvalidCluster
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return none, err
	}

	input.Name = strings.TrimSpace(input.Name)
	input.Status = strings.TrimSpace(input.Status)
	if input.Name == "" {
		return none, ErrInvalidPayload
	}
	if input.Status == "" {
		input.Status = ClusterStatusActive
	}
	statusAllowed := input.Status == ClusterStatusActive || input.Status == ClusterStatusDisabled
	if !statusAllowed {
		return none, ErrInvalidPayload
	}

	input.Metadata = defaultJSON(input.Metadata, `{}`)
	if !json.Valid(input.Metadata) {
		return none, errors.New("metadata must be valid json")
	}

	updated, err := s.store.UpdateCluster(ctx, input)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return none, ErrInvalidCluster
	case err != nil:
		return none, err
	}

	// Auditing is best-effort: marshal and store errors are deliberately
	// ignored so they cannot fail an update that has already been persisted.
	auditPayload, _ := json.Marshal(map[string]any{
		"name":   updated.Name,
		"status": updated.Status,
		"region": updated.Region,
	})
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &updated.ID,
		ActorUserID: &input.ActorUserID,
		EventType:   "cluster.updated",
		TargetType:  "cluster",
		TargetID:    &updated.ID,
		Payload:     auditPayload,
		CreatedAt:   s.now(),
	})
	return updated, nil
}
// ListClusterNodes returns the store's node listing for clusterID. If the
// platform-admin check on actorUserID fails, its error is returned and the
// store is not queried.
func (s *Service) ListClusterNodes(ctx context.Context, actorUserID, clusterID string) ([]ClusterNode, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	nodes, err := s.store.ListClusterNodes(ctx, clusterID)
	return nodes, err
}
// ListNodeGroups returns the store's node-group listing for clusterID. If the
// platform-admin check on actorUserID fails, its error is returned and the
// store is not queried.
func (s *Service) ListNodeGroups(ctx context.Context, actorUserID, clusterID string) ([]ClusterNodeGroup, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	groups, err := s.store.ListNodeGroups(ctx, clusterID)
	return groups, err
}
// CreateNodeGroup validates input and creates a node group through the store.
//
// The actor must pass the platform-admin check and the cluster must pass the
// mutability check. The name is trimmed; an empty cluster ID or name yields
// ErrInvalidPayload. A provided description is trimmed. Metadata defaults to
// `{}` and must be valid JSON. A pgx.ErrNoRows from the store is reported as
// ErrInvalidPayload; any other store result is returned unchanged.
func (s *Service) CreateNodeGroup(ctx context.Context, input CreateNodeGroupInput) (ClusterNodeGroup, error) {
	var none ClusterNodeGroup
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return none, err
	}

	input.Name = strings.TrimSpace(input.Name)
	if !(input.ClusterID != "" && input.Name != "") {
		return none, ErrInvalidPayload
	}
	if description := input.Description; description != nil {
		cleaned := strings.TrimSpace(*description)
		input.Description = &cleaned
	}

	input.Metadata = defaultJSON(input.Metadata, `{}`)
	if !json.Valid(input.Metadata) {
		return none, errors.New("node group metadata must be valid json")
	}

	group, err := s.store.CreateNodeGroup(ctx, input)
	if errors.Is(err, pgx.ErrNoRows) {
		return none, ErrInvalidPayload
	}
	return group, err
}
// CreateJoinToken issues a new node join token for a cluster.
//
// The actor must pass the platform-admin check and the cluster ID must be set.
// Scope defaults to `{}` and must be valid JSON; a zero expiry is replaced by
// defaultJoinTokenExpiry; an expiry before the current time is rejected;
// a non-positive MaxUses becomes 1. A raw token is generated locally and only
// its hash is handed to the store. The stored record is then signed via
// signJoinToken and a "node_join_token.created" audit event is recorded. The
// raw token is returned alongside the signed record.
func (s *Service) CreateJoinToken(ctx context.Context, input CreateJoinTokenInput) (CreatedJoinToken, error) {
	var none CreatedJoinToken
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}
	if input.ClusterID == "" {
		return none, ErrInvalidCluster
	}

	input.Scope = defaultJSON(input.Scope, `{}`)
	if !json.Valid(input.Scope) {
		return none, errors.New("scope must be valid json")
	}
	if input.ExpiresAt.IsZero() {
		input.ExpiresAt = defaultJoinTokenExpiry(s.now())
	}
	if input.ExpiresAt.Before(s.now()) {
		return none, errors.New("expires_at must be in the future")
	}
	if input.MaxUses <= 0 {
		input.MaxUses = 1
	}

	secret, err := generateJoinToken()
	if err != nil {
		return none, err
	}
	secretHash, err := hashJoinToken(secret)
	if err != nil {
		return none, err
	}
	stored, err := s.store.CreateJoinToken(ctx, input, secretHash)
	if err != nil {
		return none, err
	}
	signed, err := s.signJoinToken(ctx, input, stored)
	if err != nil {
		return none, err
	}

	// Auditing is best-effort; its error is deliberately ignored.
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "node_join_token.created",
		TargetType:  "node_join_token",
		TargetID:    &signed.ID,
		Payload:     json.RawMessage(`{"raw_token_returned_once":true}`),
		CreatedAt:   s.now(),
	})
	return CreatedJoinToken{NodeJoinToken: signed, Token: secret}, nil
}
// ListJoinTokens returns the store's join-token listing for clusterID. The
// actor must pass the platform-admin check. The store's ExpireJoinTokens is
// invoked before listing, and its error aborts the call.
func (s *Service) ListJoinTokens(ctx context.Context, actorUserID, clusterID string) ([]NodeJoinToken, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	expireErr := s.store.ExpireJoinTokens(ctx, clusterID)
	if expireErr != nil {
		return nil, expireErr
	}
	tokens, err := s.store.ListJoinTokens(ctx, clusterID)
	return tokens, err
}
// GetDockerInstallProfile resolves a Docker install profile from the scope of
// the join token presented as input.InstallToken.
//
// The cluster ID and install token are trimmed and must be non-empty
// (ErrInvalidPayload otherwise). The store's ExpireJoinTokens is invoked, the
// token is hashed, and the valid token is looked up by hash; a hashing failure
// or pgx.ErrNoRows yields ErrInvalidJoinToken. The returned profile carries
// the trimmed cluster ID and the presented token as its join token.
func (s *Service) GetDockerInstallProfile(ctx context.Context, input DockerInstallProfileRequest) (DockerInstallProfile, error) {
	var none DockerInstallProfile
	clusterID := strings.TrimSpace(input.ClusterID)
	installToken := strings.TrimSpace(input.InstallToken)
	input.ClusterID, input.InstallToken = clusterID, installToken
	if clusterID == "" || installToken == "" {
		return none, ErrInvalidPayload
	}
	if err := s.store.ExpireJoinTokens(ctx, clusterID); err != nil {
		return none, err
	}

	hashed, err := hashJoinToken(installToken)
	if err != nil {
		return none, ErrInvalidJoinToken
	}
	joinToken, err := s.store.GetValidJoinTokenByHash(ctx, clusterID, hashed)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return none, ErrInvalidJoinToken
	case err != nil:
		return none, err
	}

	profile, err := dockerInstallProfileFromScope(input, joinToken.Scope)
	if err != nil {
		return none, err
	}
	profile.ClusterID = clusterID
	profile.JoinToken = installToken
	return profile, nil
}
// GetWindowsInstallProfile resolves a Windows install profile from the scope
// of the join token presented as input.InstallToken.
//
// The cluster ID and install token are trimmed and must be non-empty
// (ErrInvalidPayload otherwise). The store's ExpireJoinTokens is invoked, the
// token is hashed, and the valid token is looked up by hash; a hashing failure
// or pgx.ErrNoRows yields ErrInvalidJoinToken. The returned profile carries
// the trimmed cluster ID and the presented token as its join token.
func (s *Service) GetWindowsInstallProfile(ctx context.Context, input DockerInstallProfileRequest) (WindowsInstallProfile, error) {
	var none WindowsInstallProfile
	clusterID := strings.TrimSpace(input.ClusterID)
	installToken := strings.TrimSpace(input.InstallToken)
	input.ClusterID, input.InstallToken = clusterID, installToken
	if clusterID == "" || installToken == "" {
		return none, ErrInvalidPayload
	}
	if err := s.store.ExpireJoinTokens(ctx, clusterID); err != nil {
		return none, err
	}

	hashed, err := hashJoinToken(installToken)
	if err != nil {
		return none, ErrInvalidJoinToken
	}
	joinToken, err := s.store.GetValidJoinTokenByHash(ctx, clusterID, hashed)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return none, ErrInvalidJoinToken
	case err != nil:
		return none, err
	}

	profile, err := windowsInstallProfileFromScope(input, joinToken.Scope)
	if err != nil {
		return none, err
	}
	profile.ClusterID = clusterID
	profile.JoinToken = installToken
	return profile, nil
}
// GetLinuxInstallProfile resolves a Linux install profile from the scope of
// the join token presented as input.InstallToken.
//
// The cluster ID and install token are trimmed and must be non-empty
// (ErrInvalidPayload otherwise). The store's ExpireJoinTokens is invoked, the
// token is hashed, and the valid token is looked up by hash; a hashing failure
// or pgx.ErrNoRows yields ErrInvalidJoinToken. The returned profile carries
// the trimmed cluster ID and the presented token as its join token.
func (s *Service) GetLinuxInstallProfile(ctx context.Context, input DockerInstallProfileRequest) (LinuxInstallProfile, error) {
	var none LinuxInstallProfile
	clusterID := strings.TrimSpace(input.ClusterID)
	installToken := strings.TrimSpace(input.InstallToken)
	input.ClusterID, input.InstallToken = clusterID, installToken
	if clusterID == "" || installToken == "" {
		return none, ErrInvalidPayload
	}
	if err := s.store.ExpireJoinTokens(ctx, clusterID); err != nil {
		return none, err
	}

	hashed, err := hashJoinToken(installToken)
	if err != nil {
		return none, ErrInvalidJoinToken
	}
	joinToken, err := s.store.GetValidJoinTokenByHash(ctx, clusterID, hashed)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return none, ErrInvalidJoinToken
	case err != nil:
		return none, err
	}

	profile, err := linuxInstallProfileFromScope(input, joinToken.Scope)
	if err != nil {
		return none, err
	}
	profile.ClusterID = clusterID
	profile.JoinToken = installToken
	return profile, nil
}
// signJoinToken signs a stored join token with the cluster authority key and
// persists the signed payload and signature through the store, returning the
// store's updated token record.
//
// The signed payload is built from the stored token's fields (ID, scope,
// expiry, max uses, creator, creation time) plus the cluster ID from input.
func (s *Service) signJoinToken(ctx context.Context, input CreateJoinTokenInput, item NodeJoinToken) (NodeJoinToken, error) {
	var none NodeJoinToken
	key, err := s.ensureClusterAuthority(ctx, input.ClusterID, &input.ActorUserID)
	if err != nil {
		return none, err
	}

	encoded, sig, err := clusterauth.SignPayload(key.PrivateKey, clusterJoinTokenAuthorityPayload{
		SchemaVersion:        clusterJoinTokenAuthoritySchema,
		ClusterID:            input.ClusterID,
		TokenID:              item.ID,
		Scope:                item.Scope,
		ExpiresAt:            item.ExpiresAt,
		MaxUses:              item.MaxUses,
		CreatedByUserID:      item.CreatedByUserID,
		IssuedAt:             item.CreatedAt,
		ControlPlaneOnly:     true,
		ProductionForwarding: false,
	}, s.now())
	if err != nil {
		return none, err
	}

	signed, err := s.store.SetJoinTokenAuthority(ctx, input.ClusterID, item.ID, encoded, sig)
	return signed, err
}
// CreateJoinRequest registers a node's request to join a cluster using a join
// token. No platform-admin check is performed here; the join token is the
// credential.
//
// The cluster ID must be set (ErrInvalidCluster). Node name, fingerprint and
// public key are trimmed and must be non-empty (ErrInvalidPayload). Reported
// capabilities and facts default to `{}`, requested roles to `[]`, and all
// three must be valid JSON. A hashing failure, an unknown token, or
// pgx.ErrNoRows from creating the request yields ErrInvalidJoinToken. On
// success a "node_join_request.created" audit event is recorded.
func (s *Service) CreateJoinRequest(ctx context.Context, input CreateJoinRequestInput) (NodeJoinRequest, error) {
	var none NodeJoinRequest
	if input.ClusterID == "" {
		return none, ErrInvalidCluster
	}
	if err := s.store.ExpireJoinTokens(ctx, input.ClusterID); err != nil {
		return none, err
	}

	input.NodeName = strings.TrimSpace(input.NodeName)
	input.NodeFingerprint = strings.TrimSpace(input.NodeFingerprint)
	input.PublicKey = strings.TrimSpace(input.PublicKey)
	for _, required := range []string{input.NodeName, input.NodeFingerprint, input.PublicKey} {
		if required == "" {
			return none, ErrInvalidPayload
		}
	}

	input.ReportedCapabilities = defaultJSON(input.ReportedCapabilities, `{}`)
	input.ReportedFacts = defaultJSON(input.ReportedFacts, `{}`)
	input.RequestedRoles = defaultJSON(input.RequestedRoles, `[]`)
	for _, document := range []json.RawMessage{input.ReportedCapabilities, input.ReportedFacts, input.RequestedRoles} {
		if !json.Valid(document) {
			return none, errors.New("reported_capabilities, reported_facts, and requested_roles must be valid json")
		}
	}

	hashed, err := hashJoinToken(input.JoinToken)
	if err != nil {
		return none, ErrInvalidJoinToken
	}
	joinToken, err := s.store.GetValidJoinTokenByHash(ctx, input.ClusterID, hashed)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return none, ErrInvalidJoinToken
	case err != nil:
		return none, err
	}

	request, err := s.store.CreateJoinRequest(ctx, input, joinToken.ID)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return none, ErrInvalidJoinToken
	case err != nil:
		return none, err
	}

	// Auditing is best-effort; its error is deliberately ignored.
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:  &input.ClusterID,
		EventType:  "node_join_request.created",
		TargetType: "node_join_request",
		TargetID:   &request.ID,
		Payload:    json.RawMessage(`{"source":"node_agent"}`),
		CreatedAt:  s.now(),
	})
	return request, nil
}
// ListJoinRequests returns the store's join-request listing for clusterID. If
// the platform-admin check on actorUserID fails, its error is returned and the
// store is not queried.
func (s *Service) ListJoinRequests(ctx context.Context, actorUserID, clusterID string) ([]NodeJoinRequest, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	requests, err := s.store.ListJoinRequests(ctx, clusterID)
	return requests, err
}
// GetJoinRequestBootstrap reports the status of a join request and, once the
// request is approved, the bootstrap material for the node.
//
// Cluster ID, join request ID, node fingerprint and public key are trimmed and
// must all be non-empty; otherwise, or when the store reports pgx.ErrNoRows,
// ErrInvalidJoinRequest is returned. For a request that is not approved the
// result contains only the status and the request. For an approved request the
// result additionally carries the bootstrap and the request as returned by
// bootstrapForApprovedJoinRequest.
func (s *Service) GetJoinRequestBootstrap(ctx context.Context, input GetJoinRequestBootstrapInput) (JoinRequestBootstrapResult, error) {
	var none JoinRequestBootstrapResult
	input.ClusterID = strings.TrimSpace(input.ClusterID)
	input.JoinRequestID = strings.TrimSpace(input.JoinRequestID)
	input.NodeFingerprint = strings.TrimSpace(input.NodeFingerprint)
	input.PublicKey = strings.TrimSpace(input.PublicKey)
	for _, required := range []string{input.ClusterID, input.JoinRequestID, input.NodeFingerprint, input.PublicKey} {
		if required == "" {
			return none, ErrInvalidJoinRequest
		}
	}

	request, err := s.store.GetJoinRequestForBootstrap(ctx, input)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return none, ErrInvalidJoinRequest
	case err != nil:
		return none, err
	}

	if request.Status != JoinRequestStatusApproved {
		return JoinRequestBootstrapResult{Status: request.Status, JoinRequest: request}, nil
	}

	bootstrap, refreshed, err := s.bootstrapForApprovedJoinRequest(ctx, request)
	if err != nil {
		return none, err
	}
	return JoinRequestBootstrapResult{
		Status:      request.Status,
		JoinRequest: refreshed,
		Bootstrap:   &bootstrap,
	}, nil
}
// RevokeJoinToken revokes a join token through the store. The actor must pass
// the platform-admin check. A pgx.ErrNoRows from the store is reported as
// ErrInvalidJoinToken. On success a "node_join_token.revoked" audit event is
// recorded.
func (s *Service) RevokeJoinToken(ctx context.Context, input RevokeJoinTokenInput) (NodeJoinToken, error) {
	var none NodeJoinToken
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}

	revoked, err := s.store.RevokeJoinToken(ctx, input)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return none, ErrInvalidJoinToken
	case err != nil:
		return none, err
	}

	// Auditing is best-effort; its error is deliberately ignored.
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "node_join_token.revoked",
		TargetType:  "node_join_token",
		TargetID:    &input.TokenID,
		Payload:     json.RawMessage(`{}`),
		CreatedAt:   s.now(),
	})
	return revoked, nil
}
// ApproveJoinRequest approves a join request through the store and signs the
// approval with the cluster authority.
//
// The actor must pass the platform-admin check and the cluster must pass the
// mutability check. An empty cluster ID or join request ID, or pgx.ErrNoRows
// from the store, yields ErrInvalidJoinRequest.
func (s *Service) ApproveJoinRequest(ctx context.Context, input ApproveJoinRequestInput) (ApprovedJoinRequest, error) {
	var none ApprovedJoinRequest
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return none, err
	}
	if !(input.ClusterID != "" && input.JoinRequestID != "") {
		return none, ErrInvalidJoinRequest
	}

	approved, err := s.store.ApproveJoinRequest(ctx, input)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return none, ErrInvalidJoinRequest
	case err != nil:
		return none, err
	}

	signed, err := s.signApprovedJoinRequest(ctx, input, approved)
	if err != nil {
		return none, err
	}
	return signed, nil
}
// signApprovedJoinRequest signs the approval of a join request with the
// cluster authority key, stores the signed payload and signature on the join
// request, and returns item with the updated request and the authority
// material attached to its bootstrap.
//
// A bootstrap without a heartbeat endpoint gets the endpoint derived from the
// cluster and node IDs before signing.
func (s *Service) signApprovedJoinRequest(ctx context.Context, input ApproveJoinRequestInput, item ApprovedJoinRequest) (ApprovedJoinRequest, error) {
	var none ApprovedJoinRequest
	key, err := s.ensureClusterAuthority(ctx, input.ClusterID, &input.ActorUserID)
	if err != nil {
		return none, err
	}

	if item.Bootstrap.HeartbeatEndpoint == "" {
		item.Bootstrap.HeartbeatEndpoint = nodeHeartbeatEndpoint(input.ClusterID, item.Bootstrap.NodeID)
	}

	approval := clusterNodeApprovalAuthorityPayload{
		SchemaVersion:        clusterNodeApprovalAuthoritySchema,
		ClusterID:            input.ClusterID,
		JoinRequestID:        item.JoinRequest.ID,
		NodeID:               item.Bootstrap.NodeID,
		NodeFingerprint:      item.JoinRequest.NodeFingerprint,
		IdentityStatus:       item.Bootstrap.IdentityStatus,
		HeartbeatEndpoint:    item.Bootstrap.HeartbeatEndpoint,
		ApprovedByUserID:     input.ActorUserID,
		IssuedAt:             s.now(),
		ControlPlaneOnly:     true,
		ProductionForwarding: false,
	}
	encoded, sig, err := clusterauth.SignPayload(key.PrivateKey, approval, s.now())
	if err != nil {
		return none, err
	}

	refreshed, err := s.store.SetJoinRequestApprovalAuthority(ctx, input.ClusterID, item.JoinRequest.ID, encoded, sig)
	if err != nil {
		return none, err
	}

	item.JoinRequest = refreshed
	item.Bootstrap.ClusterAuthority = authorityDescriptor(key)
	item.Bootstrap.AuthorityPayload = encoded
	item.Bootstrap.AuthoritySignature = &sig
	return item, nil
}
// bootstrapForApprovedJoinRequest builds the node bootstrap for an approved
// join request and returns it together with the (possibly updated) request.
//
// The request must be approved and carry a non-blank approved node ID;
// otherwise ErrInvalidJoinRequest is returned.
//
// If the request has no stored approval payload or signature, a new approval
// payload is signed with the cluster authority key and stored, and the request
// returned by the store replaces item. The approver recorded in that payload
// is the reviewer's user ID, or "system" when none is set. If the request is
// already signed, the stored signature is verified against the stored payload
// with the authority public key.
//
// In both cases the bootstrap carries the signature decoded from the request's
// stored approval signature.
func (s *Service) bootstrapForApprovedJoinRequest(ctx context.Context, item NodeJoinRequest) (NodeBootstrap, NodeJoinRequest, error) {
	if item.Status != JoinRequestStatusApproved || item.ApprovedNodeID == nil || strings.TrimSpace(*item.ApprovedNodeID) == "" {
		return NodeBootstrap{}, NodeJoinRequest{}, ErrInvalidJoinRequest
	}
	authorityKey, err := s.ensureClusterAuthority(ctx, item.ClusterID, item.ReviewedByUserID)
	if err != nil {
		return NodeBootstrap{}, NodeJoinRequest{}, err
	}
	heartbeatEndpoint := nodeHeartbeatEndpoint(item.ClusterID, *item.ApprovedNodeID)
	identityStatus := NodeRegistrationActive

	needsSigning := rawMessageEmpty(item.ApprovalPayload) || rawMessageEmpty(item.ApprovalSignature)
	if needsSigning {
		approvedBy := "system"
		if item.ReviewedByUserID != nil && strings.TrimSpace(*item.ReviewedByUserID) != "" {
			approvedBy = strings.TrimSpace(*item.ReviewedByUserID)
		}
		payload := clusterNodeApprovalAuthorityPayload{
			SchemaVersion:        clusterNodeApprovalAuthoritySchema,
			ClusterID:            item.ClusterID,
			JoinRequestID:        item.ID,
			NodeID:               *item.ApprovedNodeID,
			NodeFingerprint:      item.NodeFingerprint,
			IdentityStatus:       identityStatus,
			HeartbeatEndpoint:    heartbeatEndpoint,
			ApprovedByUserID:     approvedBy,
			IssuedAt:             s.now(),
			ControlPlaneOnly:     true,
			ProductionForwarding: false,
		}
		rawPayload, freshSignature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, s.now())
		if err != nil {
			return NodeBootstrap{}, NodeJoinRequest{}, err
		}
		item, err = s.store.SetJoinRequestApprovalAuthority(ctx, item.ClusterID, item.ID, rawPayload, freshSignature)
		if err != nil {
			return NodeBootstrap{}, NodeJoinRequest{}, err
		}
	}

	// Decode the stored signature exactly once. On the signing path this reads
	// back what the store returned; on the verification path the same decoded
	// value is used both for verification and for the bootstrap.
	var signature ClusterSignature
	if err := json.Unmarshal(item.ApprovalSignature, &signature); err != nil {
		return NodeBootstrap{}, NodeJoinRequest{}, err
	}
	if !needsSigning {
		if err := clusterauth.VerifyRaw(authorityKey.PublicKey, item.ApprovalPayload, signature); err != nil {
			return NodeBootstrap{}, NodeJoinRequest{}, err
		}
	}

	bootstrap := NodeBootstrap{
		NodeID:         *item.ApprovedNodeID,
		ClusterID:      item.ClusterID,
		IdentityStatus: identityStatus,
		Certificate: map[string]any{
			"status": "pending_issuer_integration",
		},
		HeartbeatEndpoint:  heartbeatEndpoint,
		ClusterAuthority:   authorityDescriptor(authorityKey),
		AuthorityPayload:   item.ApprovalPayload,
		AuthoritySignature: &signature,
	}
	return bootstrap, item, nil
}
// nodeHeartbeatEndpoint returns the heartbeat API path for a node within a
// cluster. The IDs are inserted verbatim, without URL escaping.
func nodeHeartbeatEndpoint(clusterID, nodeID string) string {
	const clustersPrefix = "/api/v1/clusters/"
	endpoint := clustersPrefix + clusterID
	endpoint += "/nodes/" + nodeID
	return endpoint + "/heartbeats"
}
// rawMessageEmpty reports whether raw, after trimming surrounding whitespace,
// is empty, the literal `{}`, or the literal `null`. Other spellings of an
// empty object (such as `{ }`) are not treated as empty.
func rawMessageEmpty(raw json.RawMessage) bool {
	switch strings.TrimSpace(string(raw)) {
	case "", "{}", "null":
		return true
	default:
		return false
	}
}
// RejectJoinRequest rejects a join request through the store. The actor must
// pass the platform-admin check. The reason is trimmed and, when blank,
// replaced by a default message. A pgx.ErrNoRows from the store is reported as
// ErrInvalidJoinRequest; any other store result is returned unchanged.
func (s *Service) RejectJoinRequest(ctx context.Context, input RejectJoinRequestInput) (NodeJoinRequest, error) {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return NodeJoinRequest{}, err
	}

	if reason := strings.TrimSpace(input.Reason); reason != "" {
		input.Reason = reason
	} else {
		input.Reason = "Rejected by platform administrator."
	}

	rejected, err := s.store.RejectJoinRequest(ctx, input)
	if errors.Is(err, pgx.ErrNoRows) {
		return NodeJoinRequest{}, ErrInvalidJoinRequest
	}
	return rejected, err
}
// AssignNodeRole assigns a role to a node through the store.
//
// The actor must pass the platform-admin check and the cluster must pass the
// mutability check. The role must satisfy isAllowedNodeRole
// (ErrInvalidNodeRole otherwise). An empty status defaults to "active"; only
// "active", "disabled" and "revoked" are accepted (ErrInvalidPayload
// otherwise). Policy defaults to `{}` and must be valid JSON. On success an
// audit event named "node_role.<status>" is recorded.
func (s *Service) AssignNodeRole(ctx context.Context, input AssignNodeRoleInput) (NodeRoleAssignment, error) {
	var none NodeRoleAssignment
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return none, err
	}
	if !isAllowedNodeRole(input.Role) {
		return none, ErrInvalidNodeRole
	}

	if input.Status == "" {
		input.Status = "active"
	}
	switch input.Status {
	case "active", "disabled", "revoked":
		// accepted
	default:
		return none, ErrInvalidPayload
	}

	input.Policy = defaultJSON(input.Policy, `{}`)
	if !json.Valid(input.Policy) {
		return none, errors.New("policy must be valid json")
	}

	assignment, err := s.store.AssignNodeRole(ctx, input)
	if err != nil {
		return none, err
	}

	// Auditing is best-effort; its error is deliberately ignored.
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "node_role." + input.Status,
		TargetType:  "node",
		TargetID:    &input.NodeID,
		Payload:     json.RawMessage(`{"capability_is_not_permission":true}`),
		CreatedAt:   s.now(),
	})
	return assignment, nil
}
// ListNodeRoleAssignments returns the store's role-assignment listing for the
// given cluster and node. If the platform-admin check on actorUserID fails,
// its error is returned and the store is not queried.
func (s *Service) ListNodeRoleAssignments(ctx context.Context, actorUserID, clusterID, nodeID string) ([]NodeRoleAssignment, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	assignments, err := s.store.ListNodeRoleAssignments(ctx, clusterID, nodeID)
	return assignments, err
}
// AttachExistingNodeToCluster attaches an existing node to a cluster through
// the store.
//
// The actor must pass the platform-admin check and the cluster must pass the
// mutability check. An empty cluster ID or node ID yields ErrInvalidPayload;
// any requested role that fails isAllowedNodeRole yields ErrInvalidNodeRole.
// A pgx.ErrNoRows from the store is reported as ErrInvalidPayload; any other
// store result is returned unchanged.
func (s *Service) AttachExistingNodeToCluster(ctx context.Context, input AttachExistingNodeInput) (ClusterNode, error) {
	var none ClusterNode
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return none, err
	}
	if !(input.ClusterID != "" && input.NodeID != "") {
		return none, ErrInvalidPayload
	}
	for i := range input.Roles {
		if !isAllowedNodeRole(input.Roles[i]) {
			return none, ErrInvalidNodeRole
		}
	}

	node, err := s.store.AttachExistingNodeToCluster(ctx, input)
	if errors.Is(err, pgx.ErrNoRows) {
		return none, ErrInvalidPayload
	}
	return node, err
}
// AssignNodeToGroup sets the node's group through the store.
//
// The actor must pass the platform-admin check and the cluster must pass the
// mutability check. An empty cluster ID or node ID yields ErrInvalidPayload.
// A provided group ID is trimmed; a blank one is converted to nil before the
// store call. A pgx.ErrNoRows from the store is reported as ErrInvalidPayload;
// any other store result is returned unchanged.
func (s *Service) AssignNodeToGroup(ctx context.Context, input AssignNodeGroupInput) (ClusterNode, error) {
	var none ClusterNode
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return none, err
	}
	if !(input.ClusterID != "" && input.NodeID != "") {
		return none, ErrInvalidPayload
	}

	if input.GroupID != nil {
		if groupID := strings.TrimSpace(*input.GroupID); groupID != "" {
			input.GroupID = &groupID
		} else {
			input.GroupID = nil
		}
	}

	node, err := s.store.AssignNodeToGroup(ctx, input)
	if errors.Is(err, pgx.ErrNoRows) {
		return none, ErrInvalidPayload
	}
	return node, err
}
// RevokeNodeIdentity revokes a node identity through the store. The actor must
// pass the platform-admin check. The reason is trimmed and, when blank,
// replaced by a default message. A pgx.ErrNoRows from the store is reported as
// ErrInvalidPayload.
func (s *Service) RevokeNodeIdentity(ctx context.Context, input RevokeNodeIdentityInput) error {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return err
	}

	if reason := strings.TrimSpace(input.Reason); reason != "" {
		input.Reason = reason
	} else {
		input.Reason = "revoked by platform administrator"
	}

	err := s.store.RevokeNodeIdentity(ctx, input)
	switch {
	case err == nil:
		return nil
	case errors.Is(err, pgx.ErrNoRows):
		return ErrInvalidPayload
	default:
		return err
	}
}
// DisableClusterMembership disables a cluster membership through the store.
// The actor must pass the platform-admin check. The reason is trimmed and,
// when blank, replaced by a default message. A pgx.ErrNoRows from the store is
// reported as ErrInvalidPayload.
func (s *Service) DisableClusterMembership(ctx context.Context, input DisableMembershipInput) error {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return err
	}

	if reason := strings.TrimSpace(input.Reason); reason != "" {
		input.Reason = reason
	} else {
		input.Reason = "disabled by platform administrator"
	}

	err := s.store.DisableClusterMembership(ctx, input)
	switch {
	case err == nil:
		return nil
	case errors.Is(err, pgx.ErrNoRows):
		return ErrInvalidPayload
	default:
		return err
	}
}
// DeleteClusterNode deletes a node from a cluster through the store.
//
// The actor must pass the platform-admin check and the cluster must pass the
// mutability check. An empty cluster ID or node ID yields ErrInvalidPayload.
// The reason is trimmed and, when blank, replaced by a default message. A
// pgx.ErrNoRows from the store is reported as ErrInvalidPayload.
func (s *Service) DeleteClusterNode(ctx context.Context, input DeleteClusterNodeInput) error {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return err
	}
	if !(input.ClusterID != "" && input.NodeID != "") {
		return ErrInvalidPayload
	}

	if reason := strings.TrimSpace(input.Reason); reason != "" {
		input.Reason = reason
	} else {
		input.Reason = "deleted by platform administrator"
	}

	err := s.store.DeleteClusterNode(ctx, input)
	switch {
	case err == nil:
		return nil
	case errors.Is(err, pgx.ErrNoRows):
		return ErrInvalidPayload
	default:
		return err
	}
}
// RecordHeartbeat stores a node heartbeat. No platform-admin check is
// performed here.
//
// An empty cluster ID or node ID yields ErrInvalidPayload. An empty health
// status defaults to "unknown"; capabilities, service states and metadata each
// default to `{}`. After the heartbeat is stored, route feedback recording and
// route-rebuild snapshot warming are triggered from it.
func (s *Service) RecordHeartbeat(ctx context.Context, input RecordHeartbeatInput) (NodeHeartbeat, error) {
	if !(input.ClusterID != "" && input.NodeID != "") {
		return NodeHeartbeat{}, ErrInvalidPayload
	}
	if input.HealthStatus == "" {
		input.HealthStatus = "unknown"
	}
	input.Capabilities = defaultJSON(input.Capabilities, `{}`)
	input.ServiceStates = defaultJSON(input.ServiceStates, `{}`)
	input.Metadata = defaultJSON(input.Metadata, `{}`)

	recorded, err := s.store.RecordHeartbeat(ctx, input)
	if err != nil {
		return NodeHeartbeat{}, err
	}

	// Both follow-up steps are best-effort: their errors are deliberately
	// ignored so they cannot fail a heartbeat that has already been stored.
	_ = s.recordFabricServiceChannelRouteFeedback(ctx, recorded)
	_ = s.autoWarmFabricServiceChannelRouteRebuildSnapshotsAfterHeartbeat(ctx, recorded)
	return recorded, nil
}
// ListNodeHeartbeats returns the store's heartbeat listing for the given
// cluster and node, passing limit through unchanged. If the platform-admin
// check on actorUserID fails, its error is returned and the store is not
// queried.
func (s *Service) ListNodeHeartbeats(ctx context.Context, actorUserID, clusterID, nodeID string, limit int) ([]NodeHeartbeat, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	heartbeats, err := s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, limit)
	return heartbeats, err
}
// ListFabricServiceChannelRouteFeedback returns route feedback observations
// for a cluster.
//
// The actor must pass the platform-admin check. The string filters in input
// are trimmed and the cluster ID must be non-empty (ErrInvalidPayload). A zero
// input.Now is replaced by the service clock. The stored observations are
// turned into a report using the cluster's recovery policy and the provenance
// derived from the cluster's route intents; the report's observations are
// returned, or nil when no report is produced.
func (s *Service) ListFabricServiceChannelRouteFeedback(ctx context.Context, actorUserID string, input ListFabricServiceChannelRouteFeedbackInput) ([]FabricServiceChannelRouteFeedbackObservation, error) {
	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return nil, err
	}

	for _, field := range []*string{
		&input.ClusterID,
		&input.ReporterNodeID,
		&input.RouteID,
		&input.ServiceClass,
		&input.FeedbackStatus,
	} {
		*field = strings.TrimSpace(*field)
	}
	if input.ClusterID == "" {
		return nil, ErrInvalidPayload
	}
	if input.Now.IsZero() {
		input.Now = s.now()
	}

	stored, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, input)
	if err != nil {
		return nil, err
	}
	recoveryPolicy := s.fabricServiceChannelRecoveryPolicy(ctx, input.ClusterID)
	routeIntents, err := s.store.ListRouteIntents(ctx, input.ClusterID)
	if err != nil {
		return nil, err
	}

	provenance := fabricServiceChannelRouteProvenanceFromIntents(routeIntents)
	report := serviceChannelRouteFeedbackReportWithPolicyAndProvenance(stored, input.Now, recoveryPolicy, provenance)
	if report == nil {
		return nil, nil
	}
	return report.Observations, nil
}
// ListFabricServiceChannelRouteRebuildAttempts returns route rebuild attempts
// for a cluster.
//
// The actor must pass the platform-admin check. The string filters in input
// are trimmed and the cluster ID must be non-empty (ErrInvalidPayload). A
// negative offset is reset to 0 and an empty enrichment mode defaults to
// "summary". Only the "deep" mode returns enriched attempts; every other mode
// returns the stored attempts after passing them through
// stripFabricServiceChannelRouteRebuildCorrelation.
func (s *Service) ListFabricServiceChannelRouteRebuildAttempts(ctx context.Context, actorUserID string, input ListFabricServiceChannelRouteRebuildAttemptsInput) ([]FabricServiceChannelRouteRebuildAttempt, error) {
	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return nil, err
	}

	for _, field := range []*string{
		&input.ClusterID,
		&input.ReporterNodeID,
		&input.RouteID,
		&input.ReplacementRouteID,
		&input.ServiceClass,
		&input.RebuildStatus,
		&input.RebuildRequestID,
		&input.Generation,
		&input.FeedbackSource,
		&input.FeedbackChannelID,
		&input.FeedbackViolationStatus,
		&input.EnrichmentMode,
	} {
		*field = strings.TrimSpace(*field)
	}
	if input.ClusterID == "" {
		return nil, ErrInvalidPayload
	}
	if input.Offset < 0 {
		input.Offset = 0
	}
	if input.EnrichmentMode == "" {
		input.EnrichmentMode = "summary"
	}

	attempts, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, input)
	if err != nil {
		return nil, err
	}
	if input.EnrichmentMode == "deep" {
		return s.enrichFabricServiceChannelRouteRebuildAttempts(ctx, input.ClusterID, attempts, s.now()), nil
	}
	return stripFabricServiceChannelRouteRebuildCorrelation(attempts), nil
}
// GetFabricServiceChannelRouteRebuildHealthSummary aggregates recent route
// rebuild attempts of a cluster into a health summary.
//
// The actor must pass the platform-admin check. ClusterID is trimmed and must
// be non-empty, otherwise ErrInvalidPayload is returned. A Limit that is not
// in 1..500 is replaced by 200. Attempts are enriched and have alert silences
// applied before they are tallied by guard severity and guard status. Access
// telemetry incidents are folded into the same counters on a best-effort
// basis: if the telemetry lookup fails, the summary is returned without them.
func (s *Service) GetFabricServiceChannelRouteRebuildHealthSummary(ctx context.Context, actorUserID string, input GetFabricServiceChannelRouteRebuildHealthSummaryInput) (FabricServiceChannelRouteRebuildHealthSummary, error) {
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
return FabricServiceChannelRouteRebuildHealthSummary{}, err
}
input.ClusterID = strings.TrimSpace(input.ClusterID)
if input.ClusterID == "" {
return FabricServiceChannelRouteRebuildHealthSummary{}, ErrInvalidPayload
}
// Out-of-range limits fall back to 200 rather than being clamped to 500.
if input.Limit <= 0 || input.Limit > 500 {
input.Limit = 200
}
now := s.now()
if now.IsZero() {
now = time.Now().UTC()
}
items, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, ListFabricServiceChannelRouteRebuildAttemptsInput{
ClusterID: input.ClusterID,
Limit: input.Limit,
UseCachedSnapshot: true,
})
if err != nil {
return FabricServiceChannelRouteRebuildHealthSummary{}, err
}
items = s.enrichFabricServiceChannelRouteRebuildAttempts(ctx, input.ClusterID, items, now)
silences, err := s.store.ListFabricServiceChannelRouteRebuildAlertSilences(ctx, input.ClusterID, now)
if err != nil {
return FabricServiceChannelRouteRebuildHealthSummary{}, err
}
items = applyFabricServiceChannelRouteRebuildAlertSilences(items, silences)
summary := FabricServiceChannelRouteRebuildHealthSummary{
ClusterID: input.ClusterID,
ObservedAt: now.UTC(),
WindowLimit: input.Limit,
TotalAttempts: len(items),
CountsByGuardStatus: map[string]int{},
CountsByGuardSeverity: map[string]int{},
}
// Sets of reporter nodes / routes that have at least one unsilenced warn or bad entry.
affectedNodes := map[string]struct{}{}
affectedRoutes := map[string]struct{}{}
feedbackBreakdowns := map[string]*fabricServiceChannelRebuildFeedbackBreakdownAccumulator{}
// First pass: tally the rebuild attempts themselves.
for _, item := range items {
// Empty guard fields are counted under the "unknown" bucket.
severity := firstNonEmptyString(item.GuardSeverity, "unknown")
status := firstNonEmptyString(item.GuardStatus, "unknown")
summary.CountsByGuardSeverity[severity]++
summary.CountsByGuardStatus[status]++
switch severity {
case "good":
summary.GoodCount++
case "warn":
summary.WarnCount++
// "Active" counters exclude silenced alerts.
if !item.AlertSilenced {
summary.ActiveWarnCount++
}
case "bad":
summary.BadCount++
if !item.AlertSilenced {
summary.ActiveBadCount++
}
default:
summary.UnknownCount++
}
if item.AlertSilenced {
summary.SilencedCount++
}
if item.AlertResurfaced {
summary.ResurfacedCount++
}
// Any non-empty status other than "applied" counts as pending; an empty status counts as neither.
if item.RebuildStatus == "applied" {
summary.AppliedCount++
} else if item.RebuildStatus != "" {
summary.PendingCount++
}
if (severity == "bad" || severity == "warn") && !item.AlertSilenced {
if item.ReporterNodeID != "" {
affectedNodes[item.ReporterNodeID] = struct{}{}
}
if item.RouteID != "" {
affectedRoutes[item.RouteID] = struct{}{}
}
}
// Keep at most 10 unsilenced bad attempts, in the order the items are iterated.
if severity == "bad" && !item.AlertSilenced && len(summary.MostRecentBadAttempts) < 10 {
summary.MostRecentBadAttempts = append(summary.MostRecentBadAttempts, item)
}
// Keep at most 10 resurfaced attempts, in the order the items are iterated.
if item.AlertResurfaced && len(summary.ResurfacedAttempts) < 10 {
summary.ResurfacedAttempts = append(summary.ResurfacedAttempts, item)
}
addFabricServiceChannelRebuildFeedbackBreakdown(feedbackBreakdowns, item, severity)
}
// Second pass: fold access telemetry incidents into the same counters.
// A telemetry error is deliberately ignored; the attempt-based summary is still returned.
if accessTelemetry, err := s.GetFabricServiceChannelAccessTelemetry(ctx, actorUserID, GetFabricServiceChannelAccessTelemetryInput{
ClusterID: input.ClusterID,
Limit: input.Limit,
Now: now,
}); err == nil {
summary.AccessRouteDecisionCount = accessTelemetry.RouteDecisionChannelCount
summary.AccessReplacementCount = accessTelemetry.ReplacementDecisionCount
summary.AccessAppliedCount = accessTelemetry.AppliedRebuildDecisionCount
summary.AccessRecoveryCount = accessTelemetry.RecoveryDecisionCount
summary.AccessNoSafeCount = accessTelemetry.NoSafeRecoveryDecisionCount
accessIncidents := append(
fabricServiceChannelAccessDecisionIncidents(input.ClusterID, accessTelemetry),
fabricServiceChannelDataPlaneContractIncidents(input.ClusterID, accessTelemetry)...,
)
for _, incident := range applyFabricServiceChannelAccessDecisionIncidentSilences(accessIncidents, silences) {
// Unlike the attempt pass, incident guard fields are used as-is (no "unknown" substitution for the map keys).
summary.CountsByGuardStatus[incident.GuardStatus]++
summary.CountsByGuardSeverity[incident.GuardSeverity]++
if incident.AlertSilenced {
summary.SilencedCount++
}
if incident.AlertResurfaced {
summary.ResurfacedCount++
}
switch incident.GuardSeverity {
case "good":
summary.GoodCount++
case "warn":
summary.WarnCount++
if !incident.AlertSilenced {
summary.ActiveWarnCount++
}
case "bad":
summary.BadCount++
if !incident.AlertSilenced {
summary.ActiveBadCount++
}
default:
summary.UnknownCount++
}
if (incident.GuardSeverity == "bad" || incident.GuardSeverity == "warn") && !incident.AlertSilenced {
if incident.ReporterNodeID != "" {
affectedNodes[incident.ReporterNodeID] = struct{}{}
}
if incident.RouteID != "" {
affectedRoutes[incident.RouteID] = struct{}{}
}
}
}
}
summary.AffectedReporterNodeIDs = sortedStringSetKeys(affectedNodes)
summary.AffectedRouteIDs = sortedStringSetKeys(affectedRoutes)
summary.FeedbackBreakdowns = sortedFabricServiceChannelRebuildFeedbackBreakdowns(feedbackBreakdowns)
// The recommendation is derived last so it sees the fully populated summary.
summary.RecommendedOperatorAction = fabricServiceChannelRebuildRecommendedAction(summary)
return summary, nil
}
// fabricServiceChannelRebuildFeedbackBreakdownAccumulator collects one
// feedback breakdown row together with the distinct reporter nodes and routes
// seen for it while rebuild attempts are being tallied.
type fabricServiceChannelRebuildFeedbackBreakdownAccumulator struct {
// item holds the identifying feedback fields and the running counters.
item FabricServiceChannelRouteRebuildFeedbackHealthBreakdown
// nodes is the set of reporter node IDs seen for this breakdown.
nodes map[string]struct{}
// routes is the set of route IDs seen for this breakdown.
routes map[string]struct{}
}
// addFabricServiceChannelRebuildFeedbackBreakdown folds one rebuild attempt
// into the breakdown accumulator keyed by (feedback source, feedback channel,
// feedback violation status).
//
// Each of the three key parts is read from the attempt's typed field first and
// from the attempt payload second. Attempts for which all three parts are
// empty are ignored. severity is the already-normalized guard severity of the
// attempt and selects which counter is incremented.
func addFabricServiceChannelRebuildFeedbackBreakdown(out map[string]*fabricServiceChannelRebuildFeedbackBreakdownAccumulator, attempt FabricServiceChannelRouteRebuildAttempt, severity string) {
	payload := jsonObject(attempt.Payload)
	source := firstNonEmptyString(attempt.FeedbackSource, jsonString(payload, "feedback_source"))
	channelID := firstNonEmptyString(attempt.FeedbackChannelID, jsonString(payload, "feedback_channel_id"))
	violationStatus := firstNonEmptyString(attempt.FeedbackViolationStatus, jsonString(payload, "feedback_violation_status"))
	if source == "" && channelID == "" && violationStatus == "" {
		return
	}

	// NUL-separated so that different field combinations never share a key.
	key := strings.Join([]string{source, channelID, violationStatus}, "\x00")
	bucket := out[key]
	if bucket == nil {
		bucket = &fabricServiceChannelRebuildFeedbackBreakdownAccumulator{
			item: FabricServiceChannelRouteRebuildFeedbackHealthBreakdown{
				FeedbackSource:          source,
				FeedbackChannelID:       channelID,
				FeedbackViolationStatus: violationStatus,
			},
			nodes:  map[string]struct{}{},
			routes: map[string]struct{}{},
		}
		out[key] = bucket
	}

	silenced := attempt.AlertSilenced
	bucket.item.TotalCount++
	switch severity {
	case "good":
		bucket.item.GoodCount++
	case "warn":
		bucket.item.WarnCount++
		if !silenced {
			bucket.item.ActiveWarnCount++
		}
	case "bad":
		bucket.item.BadCount++
		if !silenced {
			bucket.item.ActiveBadCount++
		}
	default:
		bucket.item.UnknownCount++
	}
	if silenced {
		bucket.item.SilencedCount++
	}

	// Observation time: typed field, then the payload's RFC 3339 string, then
	// the attempt's UpdatedAt as the last resort.
	var seenAt time.Time
	if attempt.FeedbackObservedAt != nil {
		seenAt = attempt.FeedbackObservedAt.UTC()
	} else if raw := strings.TrimSpace(jsonString(payload, "feedback_observed_at")); raw != "" {
		parsed, err := time.Parse(time.RFC3339Nano, raw)
		if err == nil {
			seenAt = parsed.UTC()
		}
	}
	if seenAt.IsZero() {
		seenAt = attempt.UpdatedAt.UTC()
	}
	if seenAt.After(bucket.item.LatestObservedAt) {
		bucket.item.LatestObservedAt = seenAt
	}

	if id := attempt.ReporterNodeID; id != "" {
		bucket.nodes[id] = struct{}{}
	}
	if id := attempt.RouteID; id != "" {
		bucket.routes[id] = struct{}{}
	}
}
// sortedFabricServiceChannelRebuildFeedbackBreakdowns flattens the breakdown
// accumulators into a slice ordered for display and capped at 100 rows.
//
// Ordering, most significant first:
//  1. a weighted activity score (active bad, then active warn, then total),
//     highest first;
//  2. the latest observation time, newest first;
//  3. the NUL-separated (source, channel, violation status) key, ascending.
//
// The last key matches the map key built by
// addFabricServiceChannelRebuildFeedbackBreakdown, so two distinct breakdowns
// can never compare equal. Because the input is a map (random iteration
// order), a total order is required for the result, and for the rows that
// survive the 100-row cap, to be deterministic.
func sortedFabricServiceChannelRebuildFeedbackBreakdowns(input map[string]*fabricServiceChannelRebuildFeedbackBreakdownAccumulator) []FabricServiceChannelRouteRebuildFeedbackHealthBreakdown {
	out := make([]FabricServiceChannelRouteRebuildFeedbackHealthBreakdown, 0, len(input))
	for _, acc := range input {
		item := acc.item
		item.AffectedReporterNodeIDs = sortedStringSetKeys(acc.nodes)
		item.AffectedRouteIDs = sortedStringSetKeys(acc.routes)
		out = append(out, item)
	}
	sort.SliceStable(out, func(i, j int) bool {
		leftActive := out[i].ActiveBadCount*100000 + out[i].ActiveWarnCount*1000 + out[i].TotalCount
		rightActive := out[j].ActiveBadCount*100000 + out[j].ActiveWarnCount*1000 + out[j].TotalCount
		if leftActive != rightActive {
			return leftActive > rightActive
		}
		if !out[i].LatestObservedAt.Equal(out[j].LatestObservedAt) {
			return out[i].LatestObservedAt.After(out[j].LatestObservedAt)
		}
		// Separate the parts with NUL: plain concatenation would make
		// ("a", "bc", "") and ("ab", "c", "") indistinguishable and leave
		// their relative order to map iteration order.
		left := out[i].FeedbackSource + "\x00" + out[i].FeedbackChannelID + "\x00" + out[i].FeedbackViolationStatus
		right := out[j].FeedbackSource + "\x00" + out[j].FeedbackChannelID + "\x00" + out[j].FeedbackViolationStatus
		return left < right
	})
	if len(out) > 100 {
		out = out[:100]
	}
	return out
}
// GetFabricServiceChannelReadiness derives a readiness verdict from the route
// rebuild health summary of the cluster.
//
// The summary window is at most 5 attempts: a Limit that is not in 1..5 is
// replaced by 5. Authorization and ClusterID validation are performed by
// GetFabricServiceChannelRouteRebuildHealthSummary, whose errors are returned
// unchanged.
func (s *Service) GetFabricServiceChannelReadiness(ctx context.Context, actorUserID string, input GetFabricServiceChannelReadinessInput) (FabricServiceChannelReadiness, error) {
	const maxWindow = 5

	window := input.Limit
	if window <= 0 || window > maxWindow {
		window = maxWindow
	}

	request := GetFabricServiceChannelRouteRebuildHealthSummaryInput{
		ClusterID: input.ClusterID,
		Limit:     window,
	}
	health, err := s.GetFabricServiceChannelRouteRebuildHealthSummary(ctx, actorUserID, request)
	if err != nil {
		return FabricServiceChannelReadiness{}, err
	}
	return fabricServiceChannelReadinessFromRebuildHealth(health), nil
}
// GetFabricServiceChannelSchemaStatus returns the store's schema status for a
// cluster.
//
// The actor must pass the platform-admin check. ClusterID is trimmed before it
// is forwarded to the store; an empty ClusterID yields ErrInvalidPayload.
func (s *Service) GetFabricServiceChannelSchemaStatus(ctx context.Context, actorUserID string, input GetFabricServiceChannelSchemaStatusInput) (FabricServiceChannelSchemaStatus, error) {
	empty := FabricServiceChannelSchemaStatus{}

	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return empty, err
	}
	if input.ClusterID = strings.TrimSpace(input.ClusterID); input.ClusterID == "" {
		return empty, ErrInvalidPayload
	}
	return s.store.GetFabricServiceChannelSchemaStatus(ctx, input)
}
// GetFabricServiceChannelRebuildSnapshotMaintenanceHealth reports how well
// correlation snapshots are being produced for recent rebuild attempts, both
// cluster-wide and per reporter node.
//
// The actor must pass the platform-admin check and ClusterID must be non-empty
// after trimming (ErrInvalidPayload otherwise). Input bounds:
//   - Limit: defaults to 50, capped at 100.
//   - MinAgeSeconds: defaults to 60, capped at 3600.
//   - HeartbeatThreshold: defaults to 2, capped at 10.
//
// An attempt without a snapshot is "overdue" when it is at least MinAgeSeconds
// old and its reporter node has produced at least HeartbeatThreshold
// heartbeats at or after the attempt's UpdatedAt. Auto-warmup audit events are
// summed into the result as well. Status starts as "ready" and becomes
// "degraded" when auto-warmup errors or overdue snapshots are present.
func (s *Service) GetFabricServiceChannelRebuildSnapshotMaintenanceHealth(ctx context.Context, actorUserID string, input GetFabricServiceChannelRebuildSnapshotMaintenanceHealthInput) (FabricServiceChannelRebuildSnapshotMaintenanceHealth, error) {
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
return FabricServiceChannelRebuildSnapshotMaintenanceHealth{}, err
}
input.ClusterID = strings.TrimSpace(input.ClusterID)
if input.ClusterID == "" {
return FabricServiceChannelRebuildSnapshotMaintenanceHealth{}, ErrInvalidPayload
}
// Unlike some sibling methods, out-of-range values here are clamped to the bound, not reset to the default.
if input.Limit <= 0 {
input.Limit = 50
}
if input.Limit > 100 {
input.Limit = 100
}
if input.MinAgeSeconds <= 0 {
input.MinAgeSeconds = 60
}
if input.MinAgeSeconds > 3600 {
input.MinAgeSeconds = 3600
}
if input.HeartbeatThreshold <= 0 {
input.HeartbeatThreshold = 2
}
if input.HeartbeatThreshold > 10 {
input.HeartbeatThreshold = 10
}
now := s.now()
if now.IsZero() {
now = time.Now().UTC()
}
out := FabricServiceChannelRebuildSnapshotMaintenanceHealth{
ClusterID: input.ClusterID,
ObservedAt: now.UTC(),
Status: "ready",
Reason: "snapshot_maintenance_ready",
WindowLimit: input.Limit,
MinAgeSeconds: input.MinAgeSeconds,
HeartbeatThreshold: input.HeartbeatThreshold,
}
attempts, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, ListFabricServiceChannelRouteRebuildAttemptsInput{
ClusterID: input.ClusterID,
Limit: input.Limit,
})
if err != nil {
return FabricServiceChannelRebuildSnapshotMaintenanceHealth{}, err
}
// Per-node heartbeat cache so each reporter node is queried at most once.
heartbeatsByNode := map[string][]NodeHeartbeat{}
nodes := map[string]*FabricServiceChannelRebuildSnapshotNodeHealth{}
// nodeHealth returns the per-node record for nodeID, creating it on first use.
// A blank node ID is tracked under the literal ID "unknown".
nodeHealth := func(nodeID string) *FabricServiceChannelRebuildSnapshotNodeHealth {
nodeID = strings.TrimSpace(nodeID)
if nodeID == "" {
nodeID = "unknown"
}
if item, ok := nodes[nodeID]; ok {
return item
}
item := &FabricServiceChannelRebuildSnapshotNodeHealth{NodeID: nodeID}
nodes[nodeID] = item
return item
}
for _, attempt := range attempts {
out.RecentAttemptCount++
node := nodeHealth(attempt.ReporterNodeID)
node.RecentAttemptCount++
// Attempts that already carry a snapshot need no further inspection.
if fabricServiceChannelRouteRebuildHasCorrelationSnapshot(attempt) {
out.ValidSnapshotCount++
node.ValidSnapshotCount++
continue
}
out.MissingSnapshotCount++
node.MissingSnapshotCount++
// Attempts younger than MinAgeSeconds are counted as missing but never as overdue.
ageSeconds := int64(now.Sub(attempt.UpdatedAt).Seconds())
if ageSeconds < input.MinAgeSeconds {
continue
}
// Without a reporter node there are no heartbeats to compare against.
reporterNodeID := strings.TrimSpace(attempt.ReporterNodeID)
if reporterNodeID == "" {
continue
}
heartbeats, ok := heartbeatsByNode[reporterNodeID]
if !ok {
heartbeats, err = s.store.ListNodeHeartbeats(ctx, input.ClusterID, reporterNodeID, input.HeartbeatThreshold+5)
// A heartbeat lookup failure is not fatal: the node is treated as having no heartbeats,
// and that empty result is cached for the rest of the loop.
if err != nil {
heartbeats = nil
}
heartbeatsByNode[reporterNodeID] = heartbeats
}
// Count heartbeats observed at or after the attempt was last updated.
heartbeatAfterAttemptCount := 0
for _, heartbeat := range heartbeats {
observedAt := heartbeat.ObservedAt
if node.LastHeartbeatAt == nil || observedAt.After(*node.LastHeartbeatAt) {
// Copy into a fresh variable so the stored pointer does not alias the loop variable.
value := observedAt
node.LastHeartbeatAt = &value
}
if observedAt.After(attempt.UpdatedAt) || observedAt.Equal(attempt.UpdatedAt) {
heartbeatAfterAttemptCount++
}
}
// The per-node figure is the maximum seen across that node's attempts.
if heartbeatAfterAttemptCount > node.HeartbeatAfterAttemptCount {
node.HeartbeatAfterAttemptCount = heartbeatAfterAttemptCount
}
if heartbeatAfterAttemptCount >= input.HeartbeatThreshold {
out.OverdueMissingSnapshotCount++
node.OverdueMissingSnapshotCount++
// Only the first 10 overdue attempts are returned as samples.
if len(out.OverdueMissingSnapshotAttempts) < 10 {
out.OverdueMissingSnapshotAttempts = append(out.OverdueMissingSnapshotAttempts, attempt)
}
}
}
// Unlike heartbeat lookups, an audit-event lookup failure aborts the whole call.
events, err := s.store.ListAuditEvents(ctx, ListAuditEventsInput{
ClusterID: input.ClusterID,
EventTypes: []string{"fabric.service_channel_rebuild_snapshot.auto_warmup"},
Limit: 100,
})
if err != nil {
return FabricServiceChannelRebuildSnapshotMaintenanceHealth{}, err
}
for _, event := range events {
// Re-check the type even though the query already filters on it.
if event.EventType != "fabric.service_channel_rebuild_snapshot.auto_warmup" {
continue
}
payload := jsonObject(event.Payload)
nodeID := jsonString(payload, "reporter_node_id")
node := nodeHealth(nodeID)
out.AutoWarmupEventCount++
out.AutoWarmupWarmedCount += jsonInt(payload, "warmed_count")
out.AutoWarmupAlreadyFreshCount += jsonInt(payload, "already_fresh_count")
out.AutoWarmupErrorCount += jsonInt(payload, "error_count")
// The per-node record tracks warmed and error counts; already_fresh_count is only summed cluster-wide.
node.AutoWarmupEventCount++
node.AutoWarmupWarmedCount += jsonInt(payload, "warmed_count")
node.AutoWarmupErrorCount += jsonInt(payload, "error_count")
createdAt := event.CreatedAt
if out.LatestAutoWarmupAt == nil || createdAt.After(*out.LatestAutoWarmupAt) {
value := createdAt
out.LatestAutoWarmupAt = &value
}
if node.LatestAutoWarmupAt == nil || createdAt.After(*node.LatestAutoWarmupAt) {
value := createdAt
node.LatestAutoWarmupAt = &value
}
}
out.Nodes = make([]FabricServiceChannelRebuildSnapshotNodeHealth, 0, len(nodes))
for _, item := range nodes {
out.Nodes = append(out.Nodes, *item)
}
// Order nodes: most overdue first, then most missing, then by node ID ascending.
sort.Slice(out.Nodes, func(i, j int) bool {
if out.Nodes[i].OverdueMissingSnapshotCount != out.Nodes[j].OverdueMissingSnapshotCount {
return out.Nodes[i].OverdueMissingSnapshotCount > out.Nodes[j].OverdueMissingSnapshotCount
}
if out.Nodes[i].MissingSnapshotCount != out.Nodes[j].MissingSnapshotCount {
return out.Nodes[i].MissingSnapshotCount > out.Nodes[j].MissingSnapshotCount
}
return out.Nodes[i].NodeID < out.Nodes[j].NodeID
})
// The checks below run in increasing priority: an overdue snapshot overwrites
// the reason and action set for auto-warmup errors.
if out.AutoWarmupErrorCount > 0 {
out.Status = "degraded"
out.Reason = "auto_warmup_errors_seen"
out.RecommendedOperatorAction = "Check backend logs and heartbeat metadata for nodes with auto-warmup errors."
}
if out.OverdueMissingSnapshotCount > 0 {
out.Status = "degraded"
out.Reason = "snapshot_warmup_overdue"
out.RecommendedOperatorAction = "Run warm snapshots or inspect reporter nodes whose heartbeat evidence is not producing rebuild snapshots."
}
// Missing-but-not-overdue snapshots only add a hint; they do not change Status or Reason.
if out.MissingSnapshotCount > 0 && out.OverdueMissingSnapshotCount == 0 && out.RecommendedOperatorAction == "" {
out.RecommendedOperatorAction = "Recent attempts are still waiting for runtime heartbeat evidence."
}
return out, nil
}
// WarmupFabricServiceChannelRebuildSnapshots computes and persists correlation
// snapshots for recent rebuild attempts that do not have one yet.
//
// The actor (input.ActorUserID) must pass the platform-admin check and
// ClusterID must be non-empty after trimming (ErrInvalidPayload otherwise).
// A Limit that is not in 1..50 is replaced by 10; a StaleAfterSeconds that is
// not in 1..86400 is replaced by 60. The reference time is input.Now, then
// s.now(), then the wall clock.
//
// Every scanned attempt falls into exactly one bucket:
//   - already fresh: has a snapshot that is not stale; left untouched;
//   - stale: has a snapshot older than StaleAfterSeconds; counted and
//     deferred, never rewritten here;
//   - missing: has no snapshot; it is enriched from the reporter node's
//     heartbeats and persisted.
//
// A missing attempt ends up in either WarmedCount or ErrorCount, never both.
// When the heartbeat lookup for a node fails, no snapshot is derived for any
// attempt of that node in this run: a snapshot built from an empty heartbeat
// list would record the absence of evidence that was never actually checked.
//
// The returned error is non-nil only for authorization, validation, or the
// initial attempt listing; per-attempt failures are reported through
// ErrorCount and a "degraded" status.
func (s *Service) WarmupFabricServiceChannelRebuildSnapshots(ctx context.Context, input WarmupFabricServiceChannelRebuildSnapshotsInput) (FabricServiceChannelRebuildSnapshotWarmup, error) {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return FabricServiceChannelRebuildSnapshotWarmup{}, err
	}
	input.ClusterID = strings.TrimSpace(input.ClusterID)
	if input.ClusterID == "" {
		return FabricServiceChannelRebuildSnapshotWarmup{}, ErrInvalidPayload
	}
	if input.Limit <= 0 || input.Limit > 50 {
		input.Limit = 10
	}
	if input.StaleAfterSeconds <= 0 || input.StaleAfterSeconds > int64((24*time.Hour).Seconds()) {
		input.StaleAfterSeconds = 60
	}
	now := input.Now
	if now.IsZero() {
		now = s.now()
	}
	if now.IsZero() {
		now = time.Now().UTC()
	}
	result := FabricServiceChannelRebuildSnapshotWarmup{
		ClusterID:         input.ClusterID,
		ObservedAt:        now.UTC(),
		WindowLimit:       input.Limit,
		StaleAfterSeconds: input.StaleAfterSeconds,
		Status:            "ready",
		Reason:            "snapshots_warmed",
	}
	items, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, ListFabricServiceChannelRouteRebuildAttemptsInput{
		ClusterID: input.ClusterID,
		Limit:     input.Limit,
	})
	if err != nil {
		return FabricServiceChannelRebuildSnapshotWarmup{}, err
	}
	result.ScannedCount = len(items)

	// Heartbeats are fetched once per reporter node; nodes whose lookup failed
	// are remembered separately so their attempts are not warmed from nothing.
	heartbeatsByNode := map[string][]NodeHeartbeat{}
	failedNodes := map[string]struct{}{}
	staleAfter := time.Duration(input.StaleAfterSeconds) * time.Second

	for _, item := range items {
		switch {
		case !fabricServiceChannelRouteRebuildHasCorrelationSnapshot(item):
			result.MissingSnapshotCount++
		case fabricServiceChannelRouteRebuildSnapshotIsStale(item, now, staleAfter):
			result.StaleSnapshotCount++
			result.DeferredStaleCount++
			continue
		default:
			result.AlreadyFreshCount++
			continue
		}

		nodeID := strings.TrimSpace(item.ReporterNodeID)
		if nodeID == "" {
			result.ErrorCount++
			continue
		}
		if _, failed := failedNodes[nodeID]; failed {
			result.ErrorCount++
			continue
		}
		heartbeats, cached := heartbeatsByNode[nodeID]
		if !cached {
			fetched, err := s.store.ListNodeHeartbeats(ctx, input.ClusterID, nodeID, 120)
			if err != nil {
				// Count the failure once for this attempt and skip it; do not
				// fall through to persist a snapshot and count it as warmed.
				failedNodes[nodeID] = struct{}{}
				result.ErrorCount++
				continue
			}
			heartbeats = fetched
			heartbeatsByNode[nodeID] = heartbeats
		}

		item = enrichFabricServiceChannelRouteRebuildAttempt(item, heartbeats, now)
		item.CorrelationSnapshotAt = &now
		if err := s.store.UpdateFabricServiceChannelRouteRebuildCorrelationSnapshot(ctx, fabricServiceChannelRouteRebuildCorrelationSnapshotInput(item, now)); err != nil {
			result.ErrorCount++
			continue
		}
		result.WarmedCount++
	}

	if result.ErrorCount > 0 {
		result.Status = "degraded"
		result.Reason = "snapshot_warmup_partial"
		result.RecommendedOperatorAction = "Check node heartbeat history and backend logs for rebuild snapshot warmup failures."
	} else if result.DeferredStaleCount > 0 {
		result.Status = "ready"
		result.Reason = "missing_snapshots_warmed_stale_deferred"
		result.RecommendedOperatorAction = "Stale snapshots were detected and left cached; age-sensitive guard state is recomputed on read."
	}
	return result, nil
}
// ListFabricServiceChannelRouteRebuildIncidents returns the current rebuild
// incidents of a cluster, derived from recent rebuild attempts and, when
// available, from access telemetry.
//
// The actor must pass the platform-admin check and ClusterID must be non-empty
// after trimming (ErrInvalidPayload otherwise). A Limit that is not in 1..5 is
// replaced by 5; the same limit bounds the attempt window, the telemetry
// window and the number of incidents returned. Access telemetry is best
// effort: when it cannot be loaded, only attempt-based incidents are returned
// and the combined re-sort is skipped.
func (s *Service) ListFabricServiceChannelRouteRebuildIncidents(ctx context.Context, actorUserID string, input ListFabricServiceChannelRouteRebuildIncidentsInput) ([]FabricServiceChannelRouteRebuildIncident, error) {
	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return nil, err
	}

	clusterID := strings.TrimSpace(input.ClusterID)
	if clusterID == "" {
		return nil, ErrInvalidPayload
	}

	const maxIncidents = 5
	limit := input.Limit
	if limit <= 0 || limit > maxIncidents {
		limit = maxIncidents
	}

	now := s.now()
	if now.IsZero() {
		now = time.Now().UTC()
	}

	attempts, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, ListFabricServiceChannelRouteRebuildAttemptsInput{
		ClusterID:         clusterID,
		Limit:             limit,
		UseCachedSnapshot: true,
	})
	if err != nil {
		return nil, err
	}
	attempts = s.enrichFabricServiceChannelRouteRebuildAttempts(ctx, clusterID, attempts, now)

	silences, err := s.store.ListFabricServiceChannelRouteRebuildAlertSilences(ctx, clusterID, now)
	if err != nil {
		return nil, err
	}
	attempts = applyFabricServiceChannelRouteRebuildAlertSilences(attempts, silences)
	incidents := fabricServiceChannelRouteRebuildIncidentsFromAttempts(clusterID, attempts)

	telemetry, telemetryErr := s.GetFabricServiceChannelAccessTelemetry(ctx, actorUserID, GetFabricServiceChannelAccessTelemetryInput{
		ClusterID: clusterID,
		Limit:     limit,
		Now:       now,
	})
	if telemetryErr == nil {
		// Access-decision incidents first, data-plane contract incidents after.
		fromAccess := fabricServiceChannelAccessDecisionIncidents(clusterID, telemetry)
		fromAccess = append(fromAccess, fabricServiceChannelDataPlaneContractIncidents(clusterID, telemetry)...)
		incidents = append(incidents, applyFabricServiceChannelAccessDecisionIncidentSilences(fromAccess, silences)...)
		fabricServiceChannelSortRouteRebuildIncidents(incidents)
	}

	if len(incidents) > limit {
		return incidents[:limit], nil
	}
	return incidents, nil
}
// RecordFabricServiceChannelRouteRebuildInvestigation writes an audit event
// stating that an operator opened an investigation.
//
// The actor (input.ActorUserID) must pass the platform-admin check. All string
// fields are whitespace-trimmed. ErrInvalidPayload is returned when ClusterID
// is empty or when none of ReporterNodeID, RouteID, FeedbackSource,
// FeedbackChannelID and FeedbackViolationStatus is set.
//
// The event is recorded as a feedback-breakdown investigation when the
// drilldown source is "rebuild_health_feedback_breakdown" or any feedback
// field is present, and as a rebuild-incident investigation otherwise. The
// timestamp is input.Now, then s.now(), then the wall clock. The audit store's
// error is returned as-is.
func (s *Service) RecordFabricServiceChannelRouteRebuildInvestigation(ctx context.Context, input RecordFabricServiceChannelRouteRebuildInvestigationInput) error {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return err
	}

	fields := []*string{
		&input.ClusterID,
		&input.ReporterNodeID,
		&input.RouteID,
		&input.ServiceClass,
		&input.Generation,
		&input.GuardStatus,
		&input.IncidentID,
		&input.FeedbackSource,
		&input.FeedbackChannelID,
		&input.FeedbackViolationStatus,
		&input.DrilldownSource,
		&input.Reason,
	}
	for _, field := range fields {
		*field = strings.TrimSpace(*field)
	}

	hasFeedback := input.FeedbackSource != "" || input.FeedbackChannelID != "" || input.FeedbackViolationStatus != ""
	hasTarget := input.ReporterNodeID != "" || input.RouteID != "" || hasFeedback
	if input.ClusterID == "" || !hasTarget {
		return ErrInvalidPayload
	}

	now := input.Now
	if now.IsZero() {
		if now = s.now(); now.IsZero() {
			now = time.Now().UTC()
		}
	}

	eventType, targetType := "fabric.service_channel_rebuild_incident.investigation_opened", "fabric_service_channel_route_rebuild_incident"
	if hasFeedback || input.DrilldownSource == "rebuild_health_feedback_breakdown" {
		eventType, targetType = "fabric.service_channel_rebuild_feedback_breakdown.investigation_opened", "fabric_service_channel_rebuild_feedback_breakdown"
	}
	// The most specific identifier available becomes the audit target ID.
	targetID := firstNonEmptyString(input.RouteID, input.FeedbackChannelID, input.FeedbackViolationStatus, input.FeedbackSource, input.ReporterNodeID)

	details := map[string]any{
		"incident_id":               input.IncidentID,
		"reporter_node_id":          input.ReporterNodeID,
		"route_id":                  input.RouteID,
		"service_class":             input.ServiceClass,
		"generation":                input.Generation,
		"guard_status":              input.GuardStatus,
		"feedback_source":           input.FeedbackSource,
		"feedback_channel_id":       input.FeedbackChannelID,
		"feedback_violation_status": input.FeedbackViolationStatus,
		"drilldown_source":          input.DrilldownSource,
		"reason":                    input.Reason,
	}
	event := ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   eventType,
		TargetType:  targetType,
		TargetID:    &targetID,
		Payload:     mustJSONRaw(details),
		CreatedAt:   now.UTC(),
	}
	return s.store.RecordAudit(ctx, event)
}
// SilenceFabricServiceChannelRouteRebuildAlert creates or refreshes a silence
// for one rebuild alert and records an audit event for it.
//
// The actor (input.ActorUserID) must pass the platform-admin check. String
// fields are whitespace-trimmed; ClusterID, ReporterNodeID, RouteID and
// GuardStatus are mandatory (ErrInvalidPayload otherwise). For the incident
// sources "access_decision" and "data_plane_contract" a ChannelID is also
// mandatory, and the route ID that is stored is derived from the channel and
// the requested route. A TTL that is not in (0, 7 days] is replaced by 6 hours.
// The silence expires TTL after input.Now, falling back to s.now() and then
// the wall clock.
//
// The audit write is best effort: its error is discarded and does not affect
// the returned silence.
func (s *Service) SilenceFabricServiceChannelRouteRebuildAlert(ctx context.Context, input SilenceFabricServiceChannelRouteRebuildAlertInput) (FabricServiceChannelRouteRebuildAlertSilence, error) {
	const (
		maxTTL     = 7 * 24 * time.Hour
		defaultTTL = 6 * time.Hour
	)
	none := FabricServiceChannelRouteRebuildAlertSilence{}

	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}

	fields := []*string{
		&input.ClusterID,
		&input.ReporterNodeID,
		&input.RouteID,
		&input.GuardStatus,
		&input.Generation,
		&input.Reason,
		&input.IncidentSource,
		&input.ChannelID,
	}
	for _, field := range fields {
		*field = strings.TrimSpace(*field)
	}
	for _, required := range []string{input.ClusterID, input.ReporterNodeID, input.RouteID, input.GuardStatus} {
		if required == "" {
			return none, ErrInvalidPayload
		}
	}

	// Keep the caller's route ID for the audit trail; the stored one may differ.
	requestedRouteID := input.RouteID
	switch input.IncidentSource {
	case "access_decision", "data_plane_contract":
		if input.ChannelID == "" {
			return none, ErrInvalidPayload
		}
		input.RouteID = fabricServiceChannelAccessDecisionSilenceRouteID(input.ChannelID, input.RouteID)
	}

	if input.TTL <= 0 || input.TTL > maxTTL {
		input.TTL = defaultTTL
	}

	now := input.Now
	if now.IsZero() {
		if now = s.now(); now.IsZero() {
			now = time.Now().UTC()
		}
	}
	expiresAt := now.UTC().Add(input.TTL)

	silence, err := s.store.UpsertFabricServiceChannelRouteRebuildAlertSilence(ctx, input, expiresAt)
	if err != nil {
		return none, err
	}

	details := map[string]any{
		"reporter_node_id": input.ReporterNodeID,
		"route_id":         requestedRouteID,
		"stored_route_id":  input.RouteID,
		"incident_source":  input.IncidentSource,
		"channel_id":       input.ChannelID,
		"guard_status":     input.GuardStatus,
		"generation":       input.Generation,
		"reason":           input.Reason,
		"expires_at":       expiresAt.UTC().Format(time.RFC3339Nano),
	}
	// Best effort: the silence is already stored, so an audit failure is ignored.
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "fabric.service_channel_rebuild_alert.silenced",
		TargetType:  "fabric_service_channel_route_rebuild_alert",
		TargetID:    &input.RouteID,
		Payload:     mustJSONRaw(details),
		CreatedAt:   now.UTC(),
	})
	return silence, nil
}
// ListFabricServiceChannelRouteRebuildAlertSilences returns the alert silences
// the store reports for clusterID at the given reference time.
//
// The actor must pass the platform-admin check and clusterID must be non-empty
// after trimming (ErrInvalidPayload otherwise). A zero now is replaced by
// s.now(), and by the wall clock if that is zero as well.
func (s *Service) ListFabricServiceChannelRouteRebuildAlertSilences(ctx context.Context, actorUserID string, clusterID string, now time.Time) ([]FabricServiceChannelRouteRebuildAlertSilence, error) {
	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return nil, err
	}
	if clusterID = strings.TrimSpace(clusterID); clusterID == "" {
		return nil, ErrInvalidPayload
	}
	if now.IsZero() {
		if now = s.now(); now.IsZero() {
			now = time.Now().UTC()
		}
	}
	return s.store.ListFabricServiceChannelRouteRebuildAlertSilences(ctx, clusterID, now)
}
// UnsilenceFabricServiceChannelRouteRebuildAlert deletes an alert silence and
// records an audit event describing the silence that was removed.
//
// The actor (input.ActorUserID) must pass the platform-admin check. ClusterID,
// SilenceID and Reason are whitespace-trimmed; ClusterID and SilenceID are
// mandatory (ErrInvalidPayload otherwise). The audit timestamp is input.Now,
// then s.now(), then the wall clock.
//
// The audit write is best effort: its error is discarded and does not affect
// the returned silence.
func (s *Service) UnsilenceFabricServiceChannelRouteRebuildAlert(ctx context.Context, input UnsilenceFabricServiceChannelRouteRebuildAlertInput) (FabricServiceChannelRouteRebuildAlertSilence, error) {
	none := FabricServiceChannelRouteRebuildAlertSilence{}

	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}
	for _, field := range []*string{&input.ClusterID, &input.SilenceID, &input.Reason} {
		*field = strings.TrimSpace(*field)
	}
	if input.ClusterID == "" || input.SilenceID == "" {
		return none, ErrInvalidPayload
	}

	now := input.Now
	if now.IsZero() {
		if now = s.now(); now.IsZero() {
			now = time.Now().UTC()
		}
	}

	removed, err := s.store.DeleteFabricServiceChannelRouteRebuildAlertSilence(ctx, input)
	if err != nil {
		return none, err
	}

	details := map[string]any{
		"reporter_node_id": removed.ReporterNodeID,
		"route_id":         removed.DisplayRouteID,
		"stored_route_id":  removed.RouteID,
		"incident_source":  removed.IncidentSource,
		"channel_id":       removed.ChannelID,
		"guard_status":     removed.GuardStatus,
		"generation":       removed.Generation,
		"reason":           input.Reason,
		"unsilenced_at":    now.UTC().Format(time.RFC3339Nano),
	}
	// Best effort: the silence is already deleted, so an audit failure is ignored.
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "fabric.service_channel_rebuild_alert.unsilenced",
		TargetType:  "fabric_service_channel_route_rebuild_alert_silence",
		TargetID:    &input.SilenceID,
		Payload:     mustJSONRaw(details),
		CreatedAt:   now.UTC(),
	})
	return removed, nil
}
// enrichFabricServiceChannelRouteRebuildAttempts fills in correlation and
// guard data for each attempt, modifying items in place and returning it.
//
// Attempts that already carry a correlation snapshot only have their guard
// re-applied for the given time. Attempts without a reporter node are left
// untouched. All other attempts are correlated against up to 120 heartbeats of
// their reporter node (fetched once per node; a failed fetch is treated as no
// heartbeats). When that correlation yields runtime evidence, the snapshot is
// stamped with now and written back to the store on a best-effort basis.
// A zero now is replaced by the wall clock.
func (s *Service) enrichFabricServiceChannelRouteRebuildAttempts(ctx context.Context, clusterID string, items []FabricServiceChannelRouteRebuildAttempt, now time.Time) []FabricServiceChannelRouteRebuildAttempt {
	if len(items) == 0 {
		return items
	}
	if now.IsZero() {
		now = time.Now().UTC()
	}

	cache := map[string][]NodeHeartbeat{}
	for i := range items {
		attempt := &items[i]

		if fabricServiceChannelRouteRebuildHasCorrelationSnapshot(*attempt) {
			*attempt = applyFabricServiceChannelRouteRebuildGuard(*attempt, now)
			continue
		}

		reporter := strings.TrimSpace(attempt.ReporterNodeID)
		if reporter == "" {
			continue
		}

		heartbeats, known := cache[reporter]
		if !known {
			var err error
			heartbeats, err = s.store.ListNodeHeartbeats(ctx, clusterID, reporter, 120)
			if err != nil {
				heartbeats = nil
			}
			cache[reporter] = heartbeats
		}

		*attempt = enrichFabricServiceChannelRouteRebuildAttempt(*attempt, heartbeats, now)
		if !fabricServiceChannelRouteRebuildHasRuntimeEvidence(*attempt) {
			continue
		}
		attempt.CorrelationSnapshotAt = &now
		// Best effort: a failed snapshot write only means it is recomputed on a later read.
		_ = s.store.UpdateFabricServiceChannelRouteRebuildCorrelationSnapshot(ctx, fabricServiceChannelRouteRebuildCorrelationSnapshotInput(*attempt, now))
	}
	return items
}
// fabricServiceChannelRouteRebuildHasCorrelationSnapshot reports whether the
// attempt carries a usable correlation snapshot: a snapshot timestamp must be
// set and the attempt must also show runtime evidence.
func fabricServiceChannelRouteRebuildHasCorrelationSnapshot(item FabricServiceChannelRouteRebuildAttempt) bool {
	if item.CorrelationSnapshotAt == nil {
		return false
	}
	return fabricServiceChannelRouteRebuildHasRuntimeEvidence(item)
}
// fabricServiceChannelRouteRebuildHasRuntimeEvidence reports whether any
// node-side signal has been correlated with the attempt: a matched transition,
// a matched route generation, a post-rebuild selected route, or a positive
// post-rebuild packet or flow-packet count.
func fabricServiceChannelRouteRebuildHasRuntimeEvidence(item FabricServiceChannelRouteRebuildAttempt) bool {
	switch {
	case item.NodeTransitionMatched:
		return true
	case item.NodeRouteGenerationMatched:
		return true
	case item.PostRebuildSelectedRouteID != "":
		return true
	case item.PostRebuildSendPackets > 0:
		return true
	case item.PostRebuildSendFlowPackets > 0:
		return true
	default:
		return false
	}
}
// fabricServiceChannelRouteRebuildSnapshotIsStale reports whether the
// attempt's correlation snapshot is older than staleAfter relative to now.
//
// A missing snapshot timestamp is always stale. Otherwise a non-positive
// staleAfter disables the check (never stale). A zero-valued timestamp is
// stale. A zero now is replaced by the wall clock.
func fabricServiceChannelRouteRebuildSnapshotIsStale(item FabricServiceChannelRouteRebuildAttempt, now time.Time, staleAfter time.Duration) bool {
	stamp := item.CorrelationSnapshotAt
	switch {
	case stamp == nil:
		return true
	case staleAfter <= 0:
		return false
	}

	takenAt := stamp.UTC()
	if takenAt.IsZero() {
		return true
	}
	if now.IsZero() {
		now = time.Now().UTC()
	}
	age := now.UTC().Sub(takenAt)
	return age > staleAfter
}
// stripFabricServiceChannelRouteRebuildCorrelation clears every correlation,
// post-rebuild, guard and timeline field of each attempt, leaving only the
// stored decision data. The slice is modified in place and returned.
func stripFabricServiceChannelRouteRebuildCorrelation(items []FabricServiceChannelRouteRebuildAttempt) []FabricServiceChannelRouteRebuildAttempt {
	for i := range items {
		attempt := &items[i]

		// Node route-manager transition correlation.
		attempt.NodeTransitionStatus, attempt.NodeTransitionGeneration, attempt.NodeTransitionObservedAt = "", "", ""
		attempt.NodeTransitionMatched = false

		// Node route-generation correlation.
		attempt.NodeRouteGenerationStatus, attempt.NodeRouteGenerationAppliedAt, attempt.NodeRouteGenerationWithdrawnAt = "", "", ""
		attempt.NodeRouteGenerationMatched = false

		// Post-rebuild traffic counters.
		attempt.PostRebuildSelectedRouteID = ""
		attempt.PostRebuildSendPackets, attempt.PostRebuildSendFailures = 0, 0
		attempt.PostRebuildSendFlowPackets, attempt.PostRebuildSendFlowDropped = 0, 0

		// Guard verdict and its deadlines.
		attempt.GuardStatus, attempt.GuardSeverity, attempt.GuardReason = "", "", ""
		attempt.GuardAgeSeconds = 0
		attempt.GuardTransitionDeadlineSeconds, attempt.GuardTrafficDeadlineSeconds = 0, 0

		attempt.Timeline = nil
		attempt.CorrelationSnapshotAt = nil
	}
	return items
}
// fabricServiceChannelRouteRebuildCorrelationSnapshotInput copies the
// correlation, post-rebuild, guard and timeline fields of an enriched attempt
// into the store's snapshot-update input.
//
// The snapshot is stamped with now in UTC; a zero now is replaced by the wall
// clock. GuardAgeSeconds is not part of the snapshot.
func fabricServiceChannelRouteRebuildCorrelationSnapshotInput(item FabricServiceChannelRouteRebuildAttempt, now time.Time) UpdateFabricServiceChannelRouteRebuildCorrelationSnapshotInput {
	stamp := now
	if stamp.IsZero() {
		stamp = time.Now().UTC()
	}

	snapshot := UpdateFabricServiceChannelRouteRebuildCorrelationSnapshotInput{ID: item.ID}

	snapshot.NodeTransitionStatus = item.NodeTransitionStatus
	snapshot.NodeTransitionGeneration = item.NodeTransitionGeneration
	snapshot.NodeTransitionObservedAt = item.NodeTransitionObservedAt
	snapshot.NodeTransitionMatched = item.NodeTransitionMatched

	snapshot.NodeRouteGenerationStatus = item.NodeRouteGenerationStatus
	snapshot.NodeRouteGenerationAppliedAt = item.NodeRouteGenerationAppliedAt
	snapshot.NodeRouteGenerationWithdrawnAt = item.NodeRouteGenerationWithdrawnAt
	snapshot.NodeRouteGenerationMatched = item.NodeRouteGenerationMatched

	snapshot.PostRebuildSelectedRouteID = item.PostRebuildSelectedRouteID
	snapshot.PostRebuildSendPackets = item.PostRebuildSendPackets
	snapshot.PostRebuildSendFailures = item.PostRebuildSendFailures
	snapshot.PostRebuildSendFlowPackets = item.PostRebuildSendFlowPackets
	snapshot.PostRebuildSendFlowDropped = item.PostRebuildSendFlowDropped

	snapshot.GuardStatus = item.GuardStatus
	snapshot.GuardSeverity = item.GuardSeverity
	snapshot.GuardReason = item.GuardReason
	snapshot.GuardTransitionDeadlineSeconds = item.GuardTransitionDeadlineSeconds
	snapshot.GuardTrafficDeadlineSeconds = item.GuardTrafficDeadlineSeconds

	snapshot.Timeline = item.Timeline
	snapshot.CorrelationSnapshotAt = stamp.UTC()
	return snapshot
}
// enrichFabricServiceChannelRouteRebuildAttempt correlates a backend rebuild
// attempt with the supplied node heartbeats and returns the enriched copy.
//
// It always appends a "backend_decision" timeline event, then scans heartbeats
// in the order given and records, at most once each:
//   - the route-manager transition reported under
//     fabric_service_channel_runtime_report.ingress.route_manager_transition,
//   - the route-generation decision from mesh_route_generation_report,
//   - post-rebuild traffic counters from the ingress report of a heartbeat
//     observed at or after item.UpdatedAt.
//
// Scanning stops once all three have been found. The timeline is then sorted
// by timestamp (falling back to stage name when timestamps are equal or cannot
// be parsed) and the guard verdict is computed against now.
func enrichFabricServiceChannelRouteRebuildAttempt(item FabricServiceChannelRouteRebuildAttempt, heartbeats []NodeHeartbeat, now time.Time) FabricServiceChannelRouteRebuildAttempt {
	item.Timeline = append(item.Timeline, FabricServiceChannelRouteRebuildTimelineEvent{
		Stage:      "backend_decision",
		Status:     firstNonEmptyString(item.RebuildStatus, "unknown"),
		At:         item.UpdatedAt.UTC().Format(time.RFC3339Nano),
		RouteID:    item.RouteID,
		Generation: item.Generation,
		Payload: mustJSONRaw(map[string]any{
			"rebuild_request_id":   item.RebuildRequestID,
			"decision_source":      item.DecisionSource,
			"outcome":              item.Outcome,
			"replacement_route_id": item.ReplacementRouteID,
			"rebuild_reason":       item.RebuildReason,
		}),
	})
	for _, heartbeat := range heartbeats {
		heartbeatAt := heartbeat.ObservedAt.UTC().Format(time.RFC3339Nano)
		metadata := jsonObject(heartbeat.Metadata)
		runtimeReport := jsonMapPath(metadata, "fabric_service_channel_runtime_report")
		ingress := jsonMapPath(runtimeReport, "ingress")

		transition := jsonMapPath(ingress, "route_manager_transition")
		if !item.NodeTransitionMatched && transitionMatchesRebuildAttempt(transition, item) {
			item.NodeTransitionMatched = true
			item.NodeTransitionStatus = jsonString(transition, "status")
			item.NodeTransitionGeneration = jsonString(transition, "generation")
			item.NodeTransitionObservedAt = firstNonEmptyString(jsonString(transition, "observed_at"), heartbeatAt)
			item.Timeline = append(item.Timeline, FabricServiceChannelRouteRebuildTimelineEvent{
				Stage:      "node_route_manager_transition",
				Status:     item.NodeTransitionStatus,
				At:         item.NodeTransitionObservedAt,
				RouteID:    item.RouteID,
				Generation: item.NodeTransitionGeneration,
				Payload:    mustJSONRaw(transition),
			})
		}

		if !item.NodeRouteGenerationMatched {
			routeGeneration := jsonMapPath(metadata, "mesh_route_generation_report")
			if decision, ok := routeGenerationDecisionForAttempt(routeGeneration, item); ok {
				item.NodeRouteGenerationMatched = true
				item.NodeRouteGenerationStatus = firstNonEmptyString(jsonString(decision, "status"), jsonString(decision, "apply_status"), jsonString(decision, "withdraw_status"))
				item.NodeRouteGenerationAppliedAt = jsonString(decision, "applied_at")
				item.NodeRouteGenerationWithdrawnAt = jsonString(decision, "withdrawn_at")
				item.Timeline = append(item.Timeline, FabricServiceChannelRouteRebuildTimelineEvent{
					Stage:      "node_route_generation_apply",
					Status:     item.NodeRouteGenerationStatus,
					At:         firstNonEmptyString(item.NodeRouteGenerationAppliedAt, item.NodeRouteGenerationWithdrawnAt, heartbeatAt),
					RouteID:    item.RouteID,
					Generation: jsonString(decision, "generation"),
					Payload:    mustJSONRaw(decision),
				})
			}
		}

		if item.PostRebuildSelectedRouteID == "" && !heartbeat.ObservedAt.Before(item.UpdatedAt) {
			// Only a heartbeat that actually names a selected route counts as
			// observed traffic. The previous condition also accepted an empty
			// route ID whenever the attempt's own route IDs were empty, which
			// appended one "observed" event per heartbeat while the guard kept
			// reporting that no post-rebuild traffic had been seen.
			if selectedRouteID := jsonString(ingress, "last_selected_route_id"); selectedRouteID != "" {
				item.PostRebuildSelectedRouteID = selectedRouteID
				item.PostRebuildSendPackets = jsonUint64(ingress, "send_packets")
				item.PostRebuildSendFailures = jsonUint64(ingress, "send_route_failures")
				item.PostRebuildSendFlowPackets = jsonUint64(ingress, "send_flow_packets")
				item.PostRebuildSendFlowDropped = jsonUint64(ingress, "send_flow_dropped")
				item.Timeline = append(item.Timeline, FabricServiceChannelRouteRebuildTimelineEvent{
					Stage:      "post_rebuild_traffic",
					Status:     "observed",
					At:         heartbeatAt,
					RouteID:    selectedRouteID,
					Generation: jsonString(runtimeReport, "config_version"),
					Payload: mustJSONRaw(map[string]any{
						"last_selected_route_id": selectedRouteID,
						"send_packets":           item.PostRebuildSendPackets,
						"send_route_failures":    item.PostRebuildSendFailures,
						"send_flow_packets":      item.PostRebuildSendFlowPackets,
						"send_flow_dropped":      item.PostRebuildSendFlowDropped,
						"recommended_parallel":   jsonUint64(ingress, "recommended_parallel_flow_sends"),
					}),
				})
			}
		}

		// Everything has been correlated; later heartbeats cannot add anything.
		if item.NodeTransitionMatched && item.NodeRouteGenerationMatched && item.PostRebuildSelectedRouteID != "" {
			break
		}
	}
	sort.SliceStable(item.Timeline, func(i, j int) bool {
		left, leftErr := time.Parse(time.RFC3339Nano, item.Timeline[i].At)
		right, rightErr := time.Parse(time.RFC3339Nano, item.Timeline[j].At)
		if leftErr == nil && rightErr == nil && !left.Equal(right) {
			return left.Before(right)
		}
		// Equal or unparseable timestamps: order by stage name.
		return item.Timeline[i].Stage < item.Timeline[j].Stage
	})
	return applyFabricServiceChannelRouteRebuildGuard(item, now)
}
// Deadlines, measured from a rebuild attempt's UpdatedAt, that
// applyFabricServiceChannelRouteRebuildGuard uses to escalate an unconfirmed
// attempt from a "warn" (pending) verdict to a "bad" (missing) verdict.
const (
// Time allowed for the node transition and the route generation to be correlated.
fabricServiceChannelRebuildTransitionDeadline = 90 * time.Second
// Time allowed for post-rebuild traffic to be observed.
fabricServiceChannelRebuildTrafficDeadline = 180 * time.Second
)
// applyFabricServiceChannelRouteRebuildGuard computes the guard verdict
// (status, severity, reason) for a rebuild attempt and returns the updated
// copy. The attempt's age is now minus item.UpdatedAt, clamped at zero; a zero
// now falls back to the current time.
//
// Checks run in order and the first that applies wins: missing rebuild status,
// pending degraded fallback, non-applied status, missing node transition,
// missing route generation, missing post-rebuild traffic, unexpected selected
// route, traffic with failures or drops, and finally "ok".
func applyFabricServiceChannelRouteRebuildGuard(item FabricServiceChannelRouteRebuildAttempt, now time.Time) FabricServiceChannelRouteRebuildAttempt {
	if now.IsZero() {
		now = time.Now().UTC()
	}
	elapsed := now.Sub(item.UpdatedAt)
	if elapsed < 0 {
		elapsed = 0
	}
	item.GuardAgeSeconds = int64(elapsed / time.Second)
	item.GuardTransitionDeadlineSeconds = int64(fabricServiceChannelRebuildTransitionDeadline / time.Second)
	item.GuardTrafficDeadlineSeconds = int64(fabricServiceChannelRebuildTrafficDeadline / time.Second)

	// verdict stamps the three guard fields and hands back the attempt.
	verdict := func(status, severity, reason string) FabricServiceChannelRouteRebuildAttempt {
		item.GuardStatus = status
		item.GuardSeverity = severity
		item.GuardReason = reason
		return item
	}

	transitionOverdue := elapsed > fabricServiceChannelRebuildTransitionDeadline
	trafficOverdue := elapsed > fabricServiceChannelRebuildTrafficDeadline
	pendingFallback := item.RebuildStatus == "pending_degraded_fallback"

	switch {
	case item.RebuildStatus == "":
		return verdict("unknown", "warn", "missing_backend_rebuild_status")

	case pendingFallback && item.NodeTransitionMatched:
		return verdict("pending_degraded_fallback_seen", "warn", "node_confirmed_pending_degraded_fallback")
	case pendingFallback && transitionOverdue:
		return verdict("missing_node_transition", "bad", "node_did_not_report_pending_fallback_transition")
	case pendingFallback:
		return verdict("pending_node_transition", "warn", "waiting_for_node_pending_fallback_transition")

	case item.RebuildStatus != "applied":
		return verdict("not_applied", "warn", "backend_rebuild_not_applied")

	case !item.NodeTransitionMatched && transitionOverdue:
		return verdict("missing_node_transition", "bad", "node_did_not_report_applied_rebuild_transition")
	case !item.NodeTransitionMatched:
		return verdict("pending_node_transition", "warn", "waiting_for_node_applied_rebuild_transition")

	case !item.NodeRouteGenerationMatched && transitionOverdue:
		return verdict("missing_route_generation", "bad", "node_transition_seen_but_route_generation_not_correlated")
	case !item.NodeRouteGenerationMatched:
		return verdict("pending_route_generation", "warn", "waiting_for_route_generation_correlation")

	case item.PostRebuildSelectedRouteID == "" && trafficOverdue:
		return verdict("missing_post_rebuild_traffic", "bad", "no_post_rebuild_traffic_observed")
	case item.PostRebuildSelectedRouteID == "":
		return verdict("pending_post_rebuild_traffic", "warn", "waiting_for_post_rebuild_traffic")

	case item.ReplacementRouteID != "" && item.PostRebuildSelectedRouteID != item.ReplacementRouteID:
		return verdict("unexpected_post_rebuild_route", "bad", "post_rebuild_selected_route_differs_from_replacement")

	case item.PostRebuildSendFailures > 0 || item.PostRebuildSendFlowDropped > 0:
		return verdict("post_rebuild_degraded", "warn", "post_rebuild_traffic_has_failures_or_drops")

	default:
		return verdict("ok", "good", "backend_decision_node_transition_and_post_rebuild_traffic_correlated")
	}
}
// sortedStringSetKeys returns the members of a string set in ascending order,
// or nil when the set is nil or empty.
func sortedStringSetKeys(values map[string]struct{}) []string {
	size := len(values)
	if size == 0 {
		return nil
	}
	keys := make([]string, size)
	next := 0
	for key := range values {
		keys[next] = key
		next++
	}
	sort.Strings(keys)
	return keys
}
// applyFabricServiceChannelRouteRebuildAlertSilences annotates rebuild attempts
// in place with silence information and returns the same slice.
//
// An attempt is marked silenced when a silence matches its reporter node,
// route, guard status and generation. An attempt that is not silenced and has
// "bad" or "warn" severity is marked resurfaced when the most recently created
// silence for the same reporter, route and guard status was recorded for a
// different generation.
func applyFabricServiceChannelRouteRebuildAlertSilences(items []FabricServiceChannelRouteRebuildAttempt, silences []FabricServiceChannelRouteRebuildAlertSilence) []FabricServiceChannelRouteRebuildAttempt {
	if len(items) == 0 || len(silences) == 0 {
		return items
	}

	exact := make(map[string]FabricServiceChannelRouteRebuildAlertSilence, len(silences))
	latest := make(map[string]FabricServiceChannelRouteRebuildAlertSilence, len(silences))
	for _, candidate := range silences {
		// Exact index: later entries overwrite earlier ones with the same key.
		exact[fabricServiceChannelRebuildAlertSilenceKey(candidate.ReporterNodeID, candidate.RouteID, candidate.GuardStatus, candidate.Generation)] = candidate

		// Generation-agnostic index: keep the most recently created silence.
		resurfaceKey := fabricServiceChannelRebuildAlertResurfaceKey(candidate.ReporterNodeID, candidate.RouteID, candidate.GuardStatus)
		if known, seen := latest[resurfaceKey]; !seen || candidate.CreatedAt.After(known.CreatedAt) {
			latest[resurfaceKey] = candidate
		}
	}

	for idx := range items {
		attempt := &items[idx]
		if matched, found := exact[fabricServiceChannelRebuildAlertSilenceKey(attempt.ReporterNodeID, attempt.RouteID, attempt.GuardStatus, attempt.Generation)]; found {
			attempt.AlertSilenced = true
			attempt.AlertSilenceID = matched.ID
			attempt.AlertSilenceReason = matched.Reason
			attempt.AlertSilencedUntil = &matched.ExpiresAt
		}
		// Silenced attempts (including ones already flagged by the caller) and
		// attempts without an alerting severity never resurface.
		if attempt.AlertSilenced {
			continue
		}
		if severity := attempt.GuardSeverity; severity != "bad" && severity != "warn" {
			continue
		}
		previous, found := latest[fabricServiceChannelRebuildAlertResurfaceKey(attempt.ReporterNodeID, attempt.RouteID, attempt.GuardStatus)]
		if !found {
			continue
		}
		if strings.TrimSpace(previous.Generation) == strings.TrimSpace(attempt.Generation) {
			continue
		}
		attempt.AlertResurfaced = true
		attempt.AlertResurfacedFromSilenceID = previous.ID
		attempt.AlertResurfacedPreviousGeneration = previous.Generation
		attempt.AlertResurfacedPreviousUntil = &previous.ExpiresAt
	}
	return items
}
// fabricServiceChannelRebuildAlertSilenceKey builds the exact-match silence
// lookup key: the four whitespace-trimmed components joined with "|".
func fabricServiceChannelRebuildAlertSilenceKey(reporterNodeID, routeID, guardStatus, generation string) string {
	parts := [...]string{reporterNodeID, routeID, guardStatus, generation}
	for i := range parts {
		parts[i] = strings.TrimSpace(parts[i])
	}
	return strings.Join(parts[:], "|")
}
// fabricServiceChannelRebuildAlertResurfaceKey builds the generation-agnostic
// silence lookup key: the three whitespace-trimmed components joined with "|".
func fabricServiceChannelRebuildAlertResurfaceKey(reporterNodeID, routeID, guardStatus string) string {
	parts := [...]string{reporterNodeID, routeID, guardStatus}
	for i := range parts {
		parts[i] = strings.TrimSpace(parts[i])
	}
	return strings.Join(parts[:], "|")
}
// fabricServiceChannelReadinessFromRebuildHealth derives a readiness verdict
// from a rebuild health summary.
//
// Status starts as "clean". It becomes "blocked" when any blocking reason is
// present (resurfaced alerts, active bad alerts, or missing/unexpected guard
// statuses) and otherwise "degraded" when any degraded reason is present
// (degraded post-rebuild traffic, active warnings, pending attempts, silenced
// alerts). Reason is the first entry of the list that decided the status.
func fabricServiceChannelReadinessFromRebuildHealth(summary FabricServiceChannelRouteRebuildHealthSummary) FabricServiceChannelReadiness {
	guardCounts := summary.CountsByGuardStatus
	readiness := FabricServiceChannelReadiness{
		ClusterID:                   summary.ClusterID,
		ObservedAt:                  summary.ObservedAt,
		Status:                      "clean",
		Reason:                      "no_active_service_channel_rebuild_alerts",
		ActiveAlertCount:            summary.ActiveBadCount + summary.ActiveWarnCount,
		ActiveBadCount:              summary.ActiveBadCount,
		ActiveWarnCount:             summary.ActiveWarnCount,
		ResurfacedCount:             summary.ResurfacedCount,
		SilencedCount:               summary.SilencedCount,
		MissingTransitionCount:      guardCounts["missing_node_transition"],
		MissingRouteGenerationCount: guardCounts["missing_route_generation"],
		MissingPostTrafficCount:     guardCounts["missing_post_rebuild_traffic"],
		UnexpectedRouteCount:        guardCounts["unexpected_post_rebuild_route"],
		PostRebuildDegradedCount:    guardCounts["post_rebuild_degraded"],
		RecommendedOperatorAction:   summary.RecommendedOperatorAction,
	}

	type reasonCheck struct {
		present bool
		reason  string
	}

	// Order matters: the first present reason becomes readiness.Reason.
	blocking := []reasonCheck{
		{summary.ResurfacedCount > 0, "resurfaced_rebuild_alert"},
		{summary.ActiveBadCount > 0, "active_bad_rebuild_alert"},
		{readiness.MissingTransitionCount > 0, "missing_node_transition"},
		{readiness.MissingRouteGenerationCount > 0, "missing_route_generation"},
		{readiness.MissingPostTrafficCount > 0, "missing_post_rebuild_traffic"},
		{readiness.UnexpectedRouteCount > 0, "unexpected_post_rebuild_route"},
	}
	for _, check := range blocking {
		if check.present {
			readiness.BlockingReasons = append(readiness.BlockingReasons, check.reason)
		}
	}

	degraded := []reasonCheck{
		{readiness.PostRebuildDegradedCount > 0, "post_rebuild_degraded"},
		{summary.ActiveWarnCount > 0, "active_warn_rebuild_alert"},
		{summary.PendingCount > 0, "pending_rebuild_attempt"},
		{summary.SilencedCount > 0, "silenced_alert_under_observation"},
	}
	for _, check := range degraded {
		if check.present {
			readiness.DegradedReasons = append(readiness.DegradedReasons, check.reason)
		}
	}

	switch {
	case len(readiness.BlockingReasons) > 0:
		readiness.Status = "blocked"
		readiness.Reason = readiness.BlockingReasons[0]
	case len(readiness.DegradedReasons) > 0:
		readiness.Status = "degraded"
		readiness.Reason = readiness.DegradedReasons[0]
	}
	return readiness
}
// fabricServiceChannelRebuildRecommendedAction maps a rebuild health summary to
// a single operator action code. The first matching condition wins, in this
// order: access "no safe recovery" incidents, active bad alerts (with a
// dedicated code when some have resurfaced), active warnings, silenced alerts,
// no attempts at all, and finally "no action required".
func fabricServiceChannelRebuildRecommendedAction(summary FabricServiceChannelRouteRebuildHealthSummary) string {
	hasBad := summary.ActiveBadCount > 0
	switch {
	case summary.AccessNoSafeCount > 0:
		return "inspect_access_no_safe_recovery_route_pool_and_signed_policy"
	case hasBad && summary.ResurfacedCount > 0:
		return "resurfaced_rebuild_alerts_need_reinspection_new_generation_or_route_changed"
	case hasBad:
		return "inspect_bad_rebuild_attempts_check_reporter_node_heartbeats_route_generation_and_post_rebuild_traffic"
	case summary.ActiveWarnCount > 0:
		return "watch_pending_rebuild_attempts_until_node_transition_and_post_rebuild_traffic_arrive"
	case summary.SilencedCount > 0:
		return "no_active_rebuild_alerts_silenced_alerts_remain_under_observation"
	case summary.TotalAttempts == 0:
		return "no_rebuild_attempts_observed"
	default:
		return "no_operator_action_required"
	}
}
// fabricServiceChannelRouteRebuildIncidentsFromAttempts groups rebuild attempts
// into incidents keyed by reporter node, route, service class, generation and
// guard status (an empty guard status or severity is reported as "unknown").
//
// Each incident carries the attempt count, the earliest CreatedAt and latest
// UpdatedAt of its attempts, and the severity, reason, replacement route,
// rebuild status and outcome of the most recently updated attempt. Silence and
// resurface flags are sticky across the group. The result is never nil and is
// ordered by fabricServiceChannelSortRouteRebuildIncidents.
//
// Incidents are collected in first-seen order rather than by ranging over a
// map, so incidents that tie on rank and LastSeenAt keep a deterministic
// relative order after the stable sort.
func fabricServiceChannelRouteRebuildIncidentsFromAttempts(clusterID string, items []FabricServiceChannelRouteRebuildAttempt) []FabricServiceChannelRouteRebuildIncident {
	positions := make(map[string]int, len(items)) // group key -> index in out
	out := make([]FabricServiceChannelRouteRebuildIncident, 0, len(items))
	for idx := range items {
		item := &items[idx]
		guardStatus := firstNonEmptyString(item.GuardStatus, "unknown")
		guardSeverity := firstNonEmptyString(item.GuardSeverity, "unknown")
		key := strings.Join([]string{item.ReporterNodeID, item.RouteID, item.ServiceClass, item.Generation, guardStatus}, "|")
		pos, ok := positions[key]
		if !ok {
			pos = len(out)
			positions[key] = pos
			out = append(out, FabricServiceChannelRouteRebuildIncident{
				Fingerprint:              hashStringHex(key),
				ClusterID:                clusterID,
				ReporterNodeID:           item.ReporterNodeID,
				RouteID:                  item.RouteID,
				ServiceClass:             item.ServiceClass,
				Generation:               item.Generation,
				GuardStatus:              guardStatus,
				GuardSeverity:            guardSeverity,
				GuardReason:              item.GuardReason,
				FirstSeenAt:              item.CreatedAt,
				LastSeenAt:               item.UpdatedAt,
				LatestReplacementRouteID: item.ReplacementRouteID,
				LatestRebuildStatus:      item.RebuildStatus,
				LatestOutcome:            item.Outcome,
				AlertSilenced:            item.AlertSilenced,
				AlertResurfaced:          item.AlertResurfaced,
			})
		}
		// Take the pointer after any append so it always refers to the live element.
		incident := &out[pos]
		incident.AttemptCount++
		if item.CreatedAt.Before(incident.FirstSeenAt) {
			incident.FirstSeenAt = item.CreatedAt
		}
		if item.UpdatedAt.After(incident.LastSeenAt) {
			// A more recent attempt replaces the "latest" view of the incident.
			incident.LastSeenAt = item.UpdatedAt
			incident.GuardSeverity = guardSeverity
			incident.GuardReason = item.GuardReason
			incident.LatestReplacementRouteID = item.ReplacementRouteID
			incident.LatestRebuildStatus = item.RebuildStatus
			incident.LatestOutcome = item.Outcome
		}
		incident.AlertSilenced = incident.AlertSilenced || item.AlertSilenced
		if item.AlertResurfaced {
			incident.AlertResurfaced = true
			incident.AlertResurfacedFromSilenceID = item.AlertResurfacedFromSilenceID
			incident.AlertResurfacedCause = item.AlertResurfacedCause
			incident.AlertResurfacedPreviousRouteID = item.AlertResurfacedPreviousRouteID
			incident.AlertResurfacedPreviousChannelID = item.AlertResurfacedPreviousChannelID
			incident.AlertResurfacedPreviousGeneration = item.AlertResurfacedPreviousGeneration
			incident.AlertResurfacedPreviousUntil = item.AlertResurfacedPreviousUntil
		}
	}
	// The action depends on the final aggregated state, so compute it once here.
	for idx := range out {
		out[idx].RecommendedOperatorAction = fabricServiceChannelRebuildIncidentRecommendedAction(out[idx])
	}
	fabricServiceChannelSortRouteRebuildIncidents(out)
	return out
}
// fabricServiceChannelSortRouteRebuildIncidents stably sorts incidents in
// place: higher severity rank first, then most recently seen first.
func fabricServiceChannelSortRouteRebuildIncidents(out []FabricServiceChannelRouteRebuildIncident) {
	moreUrgent := func(i, j int) bool {
		rankI := fabricServiceChannelRebuildIncidentSeverityRank(out[i])
		rankJ := fabricServiceChannelRebuildIncidentSeverityRank(out[j])
		if rankI == rankJ {
			return out[i].LastSeenAt.After(out[j].LastSeenAt)
		}
		return rankI > rankJ
	}
	sort.SliceStable(out, moreUrgent)
}
// fabricServiceChannelAccessDecisionIncidents turns the route decisions of
// active channels into incidents with source "access_decision". Channels
// without a route decision source, or whose decision maps to no incident
// state, are skipped. Every incident counts as a single attempt first and last
// seen at telemetry.ObservedAt. The result is never nil and is ordered by
// fabricServiceChannelSortRouteRebuildIncidents.
func fabricServiceChannelAccessDecisionIncidents(clusterID string, telemetry FabricServiceChannelAccessTelemetry) []FabricServiceChannelRouteRebuildIncident {
	incidents := make([]FabricServiceChannelRouteRebuildIncident, 0)
	for _, channel := range telemetry.ActiveChannels {
		if channel.RouteDecisionSource == "" {
			continue
		}
		status, severity, reason := fabricServiceChannelAccessDecisionIncidentState(channel)
		if status == "" {
			continue
		}
		// The fingerprint uses the raw decision route ID, while the displayed
		// route ID falls back to the channel's primary route.
		fingerprintInput := "access_decision" + "|" + channel.ChannelID + "|" + channel.RouteDecisionRouteID + "|" + status + "|" + channel.RouteDecisionGeneration
		incident := FabricServiceChannelRouteRebuildIncident{
			Fingerprint:              hashStringHex(fingerprintInput),
			ClusterID:                clusterID,
			ReporterNodeID:           channel.SelectedEntryNodeID,
			RouteID:                  firstNonEmptyString(channel.RouteDecisionRouteID, channel.PrimaryRouteID),
			ServiceClass:             channel.ServiceClass,
			Generation:               channel.RouteDecisionGeneration,
			IncidentSource:           "access_decision",
			ChannelID:                channel.ChannelID,
			GuardStatus:              status,
			GuardSeverity:            severity,
			GuardReason:              reason,
			AttemptCount:             1,
			FirstSeenAt:              telemetry.ObservedAt,
			LastSeenAt:               telemetry.ObservedAt,
			LatestReplacementRouteID: channel.RouteDecisionReplacementRouteID,
			LatestRebuildStatus:      channel.RouteDecisionRebuildStatus,
			LatestOutcome:            channel.RouteDecisionSource,
		}
		incident.RecommendedOperatorAction = fabricServiceChannelRebuildIncidentRecommendedAction(incident)
		incidents = append(incidents, incident)
	}
	fabricServiceChannelSortRouteRebuildIncidents(incidents)
	return incidents
}
// fabricServiceChannelDataPlaneContractIncidents turns data-plane contract
// violations of active channels into incidents with source
// "data_plane_contract". Channels with no violation state are skipped. Every
// incident counts as a single attempt first and last seen at
// telemetry.ObservedAt. The result is never nil and is ordered by
// fabricServiceChannelSortRouteRebuildIncidents.
func fabricServiceChannelDataPlaneContractIncidents(clusterID string, telemetry FabricServiceChannelAccessTelemetry) []FabricServiceChannelRouteRebuildIncident {
	incidents := make([]FabricServiceChannelRouteRebuildIncident, 0)
	for _, channel := range telemetry.ActiveChannels {
		status, severity, reason := fabricServiceChannelDataPlaneContractIncidentState(channel)
		if status == "" {
			continue
		}
		// Both identifiers fall back through progressively less specific
		// fields so that they are never empty.
		routeID := firstNonEmptyString(channel.RouteDecisionRouteID, channel.PrimaryRouteID, "data_plane")
		generation := firstNonEmptyString(channel.RouteDecisionGeneration, channel.PrimaryRouteID, channel.DataPlane.BackendRelayPolicy, channel.ChannelID)
		fingerprintInput := "data_plane_contract" + "|" + channel.ChannelID + "|" + routeID + "|" + status + "|" + generation
		relayPolicy := firstNonEmptyString(
			channel.EntryNodeLastBackendRelayPolicy,
			channel.DataPlane.BackendRelayPolicy,
		)
		workingTransport := firstNonEmptyString(channel.EntryNodeLastWorkingDataTransport, channel.DataPlane.WorkingDataTransport, "unknown")
		incident := FabricServiceChannelRouteRebuildIncident{
			Fingerprint:         hashStringHex(fingerprintInput),
			ClusterID:           clusterID,
			ReporterNodeID:      channel.SelectedEntryNodeID,
			RouteID:             routeID,
			ServiceClass:        channel.ServiceClass,
			Generation:          generation,
			IncidentSource:      "data_plane_contract",
			ChannelID:           channel.ChannelID,
			GuardStatus:         status,
			GuardSeverity:       severity,
			GuardReason:         reason,
			AttemptCount:        1,
			FirstSeenAt:         telemetry.ObservedAt,
			LastSeenAt:          telemetry.ObservedAt,
			LatestOutcome:       workingTransport,
			LatestRebuildStatus: relayPolicy,
		}
		incident.RecommendedOperatorAction = fabricServiceChannelRebuildIncidentRecommendedAction(incident)
		incidents = append(incidents, incident)
	}
	fabricServiceChannelSortRouteRebuildIncidents(incidents)
	return incidents
}
// applyFabricServiceChannelAccessDecisionIncidentSilences annotates
// access-decision incidents in place with silence information and returns the
// same slice.
//
// An incident is marked silenced when a silence matches its reporter node,
// channel-scoped route ID ("access:<channel>:<route>"), guard status and
// generation. Otherwise, for "bad" or "warn" incidents, the most recently
// created silence is looked up in three steps and the first usable one marks
// the incident as resurfaced:
//  1. same reporter, channel-scoped route ID and guard status, with a
//     different generation;
//  2. same reporter, plain route ID and guard status (from any channel-scoped
//     silence), with a different generation;
//  3. same reporter and guard status on any channel-scoped silence whose
//     channel, display route or generation differs from the incident.
func applyFabricServiceChannelAccessDecisionIncidentSilences(items []FabricServiceChannelRouteRebuildIncident, silences []FabricServiceChannelRouteRebuildAlertSilence) []FabricServiceChannelRouteRebuildIncident {
	if len(items) == 0 || len(silences) == 0 {
		return items
	}

	type silenceIndex map[string]FabricServiceChannelRouteRebuildAlertSilence
	exact := silenceIndex{}
	latestScoped := silenceIndex{}
	latestByRoute := silenceIndex{}
	latestByReporter := silenceIndex{}

	// keepLatest stores candidate under key unless a silence created at the
	// same time or later is already there.
	keepLatest := func(index silenceIndex, key string, candidate FabricServiceChannelRouteRebuildAlertSilence) {
		if known, seen := index[key]; !seen || candidate.CreatedAt.After(known.CreatedAt) {
			index[key] = candidate
		}
	}

	for _, candidate := range silences {
		exact[fabricServiceChannelRebuildAlertSilenceKey(candidate.ReporterNodeID, candidate.RouteID, candidate.GuardStatus, candidate.Generation)] = candidate
		keepLatest(latestScoped, fabricServiceChannelRebuildAlertResurfaceKey(candidate.ReporterNodeID, candidate.RouteID, candidate.GuardStatus), candidate)

		_, plainRouteID, scoped := fabricServiceChannelParseAccessDecisionSilenceRouteID(candidate.RouteID)
		if !scoped {
			continue
		}
		keepLatest(latestByRoute, fabricServiceChannelRebuildAlertResurfaceKey(candidate.ReporterNodeID, plainRouteID, candidate.GuardStatus), candidate)
		keepLatest(latestByReporter, fabricServiceChannelRebuildAlertResurfaceKey(candidate.ReporterNodeID, "access_decision", candidate.GuardStatus), candidate)
	}

	for idx := range items {
		incident := &items[idx]
		scopedRouteID := fabricServiceChannelAccessDecisionSilenceRouteID(incident.ChannelID, incident.RouteID)
		if _, silenced := exact[fabricServiceChannelRebuildAlertSilenceKey(incident.ReporterNodeID, scopedRouteID, incident.GuardStatus, incident.Generation)]; silenced {
			incident.AlertSilenced = true
			continue
		}
		if incident.GuardSeverity != "bad" && incident.GuardSeverity != "warn" {
			continue
		}

		currentGeneration := strings.TrimSpace(incident.Generation)
		unusable := func(silence FabricServiceChannelRouteRebuildAlertSilence, found bool) bool {
			return !found || strings.TrimSpace(silence.Generation) == currentGeneration
		}

		previous, found := latestScoped[fabricServiceChannelRebuildAlertResurfaceKey(incident.ReporterNodeID, scopedRouteID, incident.GuardStatus)]
		if unusable(previous, found) {
			previous, found = latestByRoute[fabricServiceChannelRebuildAlertResurfaceKey(incident.ReporterNodeID, incident.RouteID, incident.GuardStatus)]
		}
		if unusable(previous, found) {
			previous, found = latestByReporter[fabricServiceChannelRebuildAlertResurfaceKey(incident.ReporterNodeID, "access_decision", incident.GuardStatus)]
			if !found || !fabricServiceChannelAccessDecisionSilenceDiffers(*incident, previous) {
				continue
			}
		}

		incident.AlertResurfaced = true
		incident.AlertResurfacedFromSilenceID = previous.ID
		incident.AlertResurfacedCause = fabricServiceChannelAccessDecisionResurfaceCause(*incident, previous)
		incident.AlertResurfacedPreviousRouteID = previous.DisplayRouteID
		incident.AlertResurfacedPreviousChannelID = previous.ChannelID
		incident.AlertResurfacedPreviousGeneration = previous.Generation
		incident.AlertResurfacedPreviousUntil = &previous.ExpiresAt
	}
	return items
}
// fabricServiceChannelAccessDecisionSilenceDiffers reports whether the silence
// was recorded for a different channel, display route or generation than the
// incident. All comparisons ignore surrounding whitespace.
func fabricServiceChannelAccessDecisionSilenceDiffers(item FabricServiceChannelRouteRebuildIncident, silence FabricServiceChannelRouteRebuildAlertSilence) bool {
	same := func(left, right string) bool {
		return strings.TrimSpace(left) == strings.TrimSpace(right)
	}
	identical := same(silence.ChannelID, item.ChannelID) &&
		same(silence.DisplayRouteID, item.RouteID) &&
		same(silence.Generation, item.Generation)
	return !identical
}
// fabricServiceChannelAccessDecisionResurfaceCause names why an incident
// resurfaced relative to a silence: "channel_changed", "route_changed" or
// "generation_changed", checked in that order, else "resurfaced". Channel and
// route are only compared when the silence recorded a non-empty value.
func fabricServiceChannelAccessDecisionResurfaceCause(item FabricServiceChannelRouteRebuildIncident, silence FabricServiceChannelRouteRebuildAlertSilence) string {
	silencedChannel := strings.TrimSpace(silence.ChannelID)
	silencedRoute := strings.TrimSpace(silence.DisplayRouteID)
	switch {
	case silencedChannel != "" && silencedChannel != strings.TrimSpace(item.ChannelID):
		return "channel_changed"
	case silencedRoute != "" && silencedRoute != strings.TrimSpace(item.RouteID):
		return "route_changed"
	case strings.TrimSpace(silence.Generation) != strings.TrimSpace(item.Generation):
		return "generation_changed"
	default:
		return "resurfaced"
	}
}
// fabricServiceChannelAccessDecisionSilenceRouteID builds the channel-scoped
// route ID used for access-decision silences: "access:<channel>:<route>", with
// both parts whitespace-trimmed.
func fabricServiceChannelAccessDecisionSilenceRouteID(channelID string, routeID string) string {
	segments := []string{"access", strings.TrimSpace(channelID), strings.TrimSpace(routeID)}
	return strings.Join(segments, ":")
}
// fabricServiceChannelParseAccessDecisionSilenceRouteID splits a channel-scoped
// silence route ID of the form "access:<channel>:<route>" into its trimmed
// channel and route parts. The split happens at the first ":" after the
// prefix, so the route part may itself contain colons. The boolean is false
// when the prefix is missing or either part is empty.
func fabricServiceChannelParseAccessDecisionSilenceRouteID(value string) (string, string, bool) {
	const prefix = "access:"
	trimmed := strings.TrimSpace(value)
	if !strings.HasPrefix(trimmed, prefix) {
		return "", "", false
	}
	channelPart, routePart, hasSeparator := strings.Cut(trimmed[len(prefix):], ":")
	if !hasSeparator {
		return "", "", false
	}
	channelID := strings.TrimSpace(channelPart)
	routeID := strings.TrimSpace(routePart)
	if channelID == "" || routeID == "" {
		return "", "", false
	}
	return channelID, routeID, true
}
// fabricServiceChannelAccessDecisionIncidentState classifies a channel's route
// decision and returns (guard status, severity, reason). The first matching
// class wins: no safe recovery, recovery selected, rebuild applied, replacement
// selected. The reason is the channel's rebuild reason when set, otherwise a
// class-specific default. Three empty strings mean "no incident".
func fabricServiceChannelAccessDecisionIncidentState(channel FabricServiceChannelAccessTelemetryChannel) (string, string, string) {
	reasonOr := func(fallback string) string {
		return firstNonEmptyString(channel.RouteDecisionRebuildReason, fallback)
	}
	if fabricServiceChannelRouteDecisionIsNoSafeRecovery(channel) {
		return "access_no_safe_recovery", "bad", reasonOr("no_unfenced_alternate_route")
	}
	if fabricServiceChannelRouteDecisionIsRecovery(channel) {
		return "access_recovery_selected", "warn", reasonOr("recovery_route_selected")
	}
	if channel.RouteDecisionRebuildStatus == "applied" || containsString(channel.RouteDecisionScoreReasons, "service_channel_rebuild_applied") {
		return "access_rebuild_applied", "good", reasonOr("planner_applied_rebuild")
	}
	if fabricServiceChannelRouteDecisionIsReplacement(channel) {
		return "access_replacement_selected", "warn", reasonOr("replacement_route_selected")
	}
	return "", "", ""
}
// fabricServiceChannelDataPlaneContractIncidentState checks a channel against
// the data-plane contract and returns (guard status, severity, reason) for the
// first violation found, or three empty strings when there is none.
//
// For each contract field, the value last reported by the entry node takes
// precedence over the channel's DataPlane value. Checks run in this order:
// contract not reported despite accepted traffic, working transport, steady
// state transport, logical flow mode, blocked backend fallback, fabric route
// send failure, backend relay observed while disabled, backend relay used as
// degraded fallback.
func fabricServiceChannelDataPlaneContractIncidentState(channel FabricServiceChannelAccessTelemetryChannel) (string, string, string) {
	fallbackObserved := channel.EntryNodeBackendFallbackCount > 0
	entryAccepted := channel.EntryNodeTotalAccepted > 0 || channel.EntryNodeIntrospectionAccepted > 0 || fallbackObserved

	workingTransport := firstNonEmptyString(channel.EntryNodeLastWorkingDataTransport, channel.DataPlane.WorkingDataTransport)
	steadyTransport := firstNonEmptyString(channel.EntryNodeLastSteadyStateTransport, channel.DataPlane.SteadyStateTransport)
	flowMode := firstNonEmptyString(channel.EntryNodeLastLogicalFlowMode, channel.DataPlane.LogicalFlowMode)
	relayPolicy := firstNonEmptyString(channel.EntryNodeLastBackendRelayPolicy, channel.DataPlane.BackendRelayPolicy)

	switch {
	case entryAccepted && channel.EntryNodeDataPlaneContractCount == 0:
		return "data_plane_contract_not_reported", "bad", "entry_node_accepted_service_channel_without_reporting_data_plane_contract"
	case workingTransport != "" && workingTransport != "fabric_service_channel":
		return "data_plane_working_transport_violation", "bad", "working_data_transport_must_be_fabric_service_channel"
	case steadyTransport != "" && steadyTransport != "fabric_route":
		return "data_plane_steady_state_transport_violation", "bad", "steady_state_transport_must_be_fabric_route"
	case flowMode != "" && flowMode != "multi_flow_isolated":
		return "data_plane_logical_flow_violation", "bad", "logical_flow_mode_must_be_multi_flow_isolated"
	case channel.EntryNodeBackendFallbackBlockedCount > 0:
		// The node's own violation status and reason win over the defaults.
		status := firstNonEmptyString(channel.EntryNodeLastDataPlaneViolationStatus, "data_plane_backend_fallback_blocked")
		reason := firstNonEmptyString(channel.EntryNodeLastDataPlaneViolationReason, "backend_fallback_blocked_by_data_plane_policy")
		return status, "bad", reason
	case channel.EntryNodeFabricRouteSendFailureCount > 0:
		status := firstNonEmptyString(channel.EntryNodeLastDataPlaneViolationStatus, "data_plane_fabric_route_send_failed")
		reason := firstNonEmptyString(channel.EntryNodeLastDataPlaneViolationReason, "fabric_route_send_failed")
		return status, "bad", reason
	case relayPolicy == "disabled" && (fallbackObserved || channel.ForceBackendFallback):
		return "data_plane_disabled_backend_relay_observed", "bad", "backend_relay_policy_disabled_but_backend_fallback_was_observed"
	case relayPolicy == "degraded_fallback_only" && fallbackObserved:
		return "data_plane_degraded_backend_relay_observed", "warn", "backend_relay_used_as_degraded_fallback_for_working_data"
	default:
		return "", "", ""
	}
}
// hashStringHex returns the SHA-256 digest of value as a lowercase hexadecimal
// string.
func hashStringHex(value string) string {
	hasher := sha256.New()
	hasher.Write([]byte(value)) // hash.Hash.Write never returns an error
	return hex.EncodeToString(hasher.Sum(nil))
}
// fabricServiceChannelRebuildIncidentSeverityRank returns the sort rank of an
// incident; higher means more urgent. Resurfaced incidents and access-decision
// "access_no_safe_recovery" incidents rank 4, then "bad" 3, "warn" 2, "good" 1
// and anything else 0.
func fabricServiceChannelRebuildIncidentSeverityRank(item FabricServiceChannelRouteRebuildIncident) int {
	noSafeRecovery := item.IncidentSource == "access_decision" && item.GuardStatus == "access_no_safe_recovery"
	switch {
	case item.AlertResurfaced, noSafeRecovery:
		return 4
	case item.GuardSeverity == "bad":
		return 3
	case item.GuardSeverity == "warn":
		return 2
	case item.GuardSeverity == "good":
		return 1
	}
	return 0
}
// fabricServiceChannelRebuildIncidentRecommendedAction maps an incident to an
// operator action code.
//
// Resurfaced incidents and silenced incidents get fixed codes. Otherwise the
// guard status is looked up among the codes specific to the incident source
// ("access_decision" or "data_plane_contract"); statuses unknown to the source
// fall through to the generic rebuild guard statuses. Anything still
// unmatched yields a generic "open deep ledger" code for "bad"/"warn" severity
// and "no action required" otherwise.
func fabricServiceChannelRebuildIncidentRecommendedAction(item FabricServiceChannelRouteRebuildIncident) string {
	switch {
	case item.AlertResurfaced:
		return "open_deep_ledger_for_resurfaced_generation"
	case item.AlertSilenced:
		return "silenced_rebuild_incident_under_observation"
	}

	status := item.GuardStatus
	sourceAction := ""
	switch item.IncidentSource {
	case "access_decision":
		switch status {
		case "access_no_safe_recovery":
			sourceAction = "inspect_access_no_safe_recovery_route_pool_and_signed_policy"
		case "access_recovery_selected":
			sourceAction = "watch_recovery_route_quality_and_confirm_post_recovery_traffic"
		case "access_rebuild_applied":
			sourceAction = "confirm_applied_rebuild_runtime_traffic_stays_on_replacement"
		case "access_replacement_selected":
			sourceAction = "watch_replacement_route_quality_until_applied_or_recovered"
		}
	case "data_plane_contract":
		switch status {
		case "data_plane_contract_not_reported":
			sourceAction = "upgrade_or_restart_entry_node_until_data_plane_contract_is_reported"
		case "data_plane_working_transport_violation", "data_plane_steady_state_transport_violation", "data_plane_logical_flow_violation":
			sourceAction = "inspect_signed_data_plane_contract_and_node_agent_runtime_path"
		case "data_plane_disabled_backend_relay_observed":
			sourceAction = "stop_backend_relay_usage_and_restore_fabric_route_before_service_traffic"
		case "data_plane_degraded_backend_relay_observed":
			sourceAction = "restore_fabric_route_and_treat_backend_relay_as_degraded_only"
		case "backend_fallback_blocked_by_policy", "fabric_route_send_failed_backend_fallback_blocked", "data_plane_backend_fallback_blocked":
			sourceAction = "restore_fabric_route_or_change_signed_backend_relay_policy_before_retry"
		case "data_plane_fabric_route_send_failed":
			sourceAction = "inspect_entry_route_runtime_and_restore_fabric_route_delivery"
		}
	}
	if sourceAction != "" {
		return sourceAction
	}

	// Generic rebuild guard statuses, shared by every incident source.
	switch status {
	case "missing_node_transition":
		return "open_deep_ledger_check_reporter_heartbeats_and_route_manager_transition"
	case "missing_route_generation":
		return "open_deep_ledger_check_route_generation_apply_or_withdraw"
	case "missing_post_rebuild_traffic":
		return "open_deep_ledger_check_post_rebuild_traffic_and_selected_route"
	case "unexpected_post_rebuild_route":
		return "open_deep_ledger_check_selected_route_vs_replacement"
	case "post_rebuild_degraded":
		return "inspect_post_rebuild_drops_failures_and_route_quality"
	case "ok":
		return "no_operator_action_required"
	}
	if item.GuardSeverity == "bad" || item.GuardSeverity == "warn" {
		return "open_deep_ledger_for_rebuild_incident"
	}
	return "no_operator_action_required"
}
// transitionMatchesRebuildAttempt reports whether a node's route-manager
// transition report belongs to the given rebuild attempt. An empty report
// never matches. When the attempt has a generation, the report's generation
// must equal it; otherwise the report's status must correspond to the
// attempt's rebuild status ("applied_rebuild" for "applied",
// "pending_degraded_fallback" for itself).
func transitionMatchesRebuildAttempt(transition map[string]any, item FabricServiceChannelRouteRebuildAttempt) bool {
	if len(transition) == 0 {
		return false
	}
	if wanted := item.Generation; wanted != "" {
		// wanted is non-empty, so equality also rules out an empty reported generation.
		return jsonString(transition, "generation") == wanted
	}
	switch jsonString(transition, "status") {
	case "applied_rebuild":
		return item.RebuildStatus == "applied"
	case "pending_degraded_fallback":
		return item.RebuildStatus == "pending_degraded_fallback"
	}
	return false
}
// routeGenerationDecisionForAttempt returns the first decision in the report's
// "active_decisions" and then "withdrawn_decisions" arrays that belongs to the
// attempt's route.
//
// A decision matches when its "route_id" equals item.RouteID and either side
// has no generation or both generations are equal. The boolean result is false
// when nothing matches.
func routeGenerationDecisionForAttempt(report map[string]any, item FabricServiceChannelRouteRebuildAttempt) (map[string]any, bool) {
	matches := func(decision map[string]any) bool {
		if jsonString(decision, "route_id") != item.RouteID {
			return false
		}
		generation := jsonString(decision, "generation")
		return item.Generation == "" || generation == "" || generation == item.Generation
	}
	for _, listKey := range [...]string{"active_decisions", "withdrawn_decisions"} {
		for _, entry := range jsonArray(report, listKey) {
			// Entries that are not JSON objects are ignored.
			if decision, isObject := entry.(map[string]any); isObject && matches(decision) {
				return decision, true
			}
		}
	}
	return nil, false
}
// jsonObject decodes raw as a JSON object and always returns a non-nil map.
//
// Empty input, malformed JSON, JSON that is not an object (arrays, scalars)
// and the JSON literal null all yield an empty map. The null case needs the
// explicit nil check: json.Unmarshal accepts null without error and leaves the
// destination map nil, which would otherwise leak a nil map to callers even
// though every other failure path returns an empty one.
func jsonObject(raw json.RawMessage) map[string]any {
	if len(raw) == 0 {
		return map[string]any{}
	}
	var out map[string]any
	// json.Unmarshal validates the input itself, so no separate json.Valid
	// pass is needed.
	if err := json.Unmarshal(raw, &out); err != nil || out == nil {
		return map[string]any{}
	}
	return out
}
// jsonMapPath walks nested JSON objects along path and returns the object at
// the end of it.
//
// If any step is missing or is not a JSON object, an empty non-nil map is
// returned. With an empty path the input map is returned as-is.
func jsonMapPath(raw map[string]any, path ...string) map[string]any {
	node := raw
	for i := 0; i < len(path); i++ {
		child, isObject := node[path[i]].(map[string]any)
		if !isObject {
			return map[string]any{}
		}
		node = child
	}
	return node
}
// jsonArray returns raw[key] when it holds a JSON array ([]any); it returns
// nil for a nil map, a missing key, or a value of any other type.
func jsonArray(raw map[string]any, key string) []any {
	if raw != nil {
		if list, isArray := raw[key].([]any); isArray {
			return list
		}
	}
	return nil
}
// jsonString returns raw[key] as a whitespace-trimmed string. It returns ""
// for a nil map, a missing key, or a value that is not a string.
func jsonString(raw map[string]any, key string) string {
	if raw == nil {
		return ""
	}
	if text, isString := raw[key].(string); isString {
		return strings.TrimSpace(text)
	}
	return ""
}
// jsonStringArray returns the trimmed, non-empty string elements of the JSON
// array stored under key.
//
// It returns nil when the array is absent or empty. Elements that are not
// strings, or that are blank after trimming, are dropped; if every element is
// dropped the result is an empty non-nil slice.
func jsonStringArray(raw map[string]any, key string) []string {
	entries := jsonArray(raw, key)
	if len(entries) == 0 {
		return nil
	}
	kept := make([]string, 0, len(entries))
	for i := range entries {
		text, isString := entries[i].(string)
		if !isString {
			continue
		}
		if text = strings.TrimSpace(text); text != "" {
			kept = append(kept, text)
		}
	}
	return kept
}
// jsonInt returns raw[key] converted to int.
//
// float64, int, int64 and json.Number values are accepted; a float64 is
// truncated by the int conversion and a json.Number that does not parse as an
// integer yields 0. A nil map, a missing key, or any other value type also
// yields 0.
func jsonInt(raw map[string]any, key string) int {
	if raw == nil {
		return 0
	}
	entry := raw[key]
	if number, ok := entry.(float64); ok {
		return int(number)
	}
	if number, ok := entry.(int); ok {
		return number
	}
	if number, ok := entry.(int64); ok {
		return int(number)
	}
	if number, ok := entry.(json.Number); ok {
		// The parse error is deliberately ignored: Int64 returns 0 on failure.
		parsed, _ := number.Int64()
		return int(parsed)
	}
	return 0
}
// jsonBool returns raw[key] when it is a bool; a nil map, a missing key, or a
// value of any other type yields false.
func jsonBool(raw map[string]any, key string) bool {
	if raw == nil {
		return false
	}
	if flag, isBool := raw[key].(bool); isBool {
		return flag
	}
	return false
}
// jsonStringIntMap reads the JSON object stored under key as a map of integer
// counters.
//
// Names are whitespace-trimmed and blank names are skipped. Values of type
// float64, int, int64 and json.Number are converted to int; values of any
// other type are skipped. The result is nil when the object is absent, empty,
// or contributes no usable entry.
func jsonStringIntMap(raw map[string]any, key string) map[string]int {
	if raw == nil {
		return nil
	}
	entries, isObject := raw[key].(map[string]any)
	if !isObject || len(entries) == 0 {
		return nil
	}
	counters := make(map[string]int, len(entries))
	for label, rawCount := range entries {
		label = strings.TrimSpace(label)
		if label == "" {
			continue
		}
		var count int
		switch number := rawCount.(type) {
		case float64:
			count = int(number)
		case int:
			count = number
		case int64:
			count = int(number)
		case json.Number:
			// A json.Number that is not an integer parses as 0.
			parsed, _ := number.Int64()
			count = int(parsed)
		default:
			continue
		}
		counters[label] = count
	}
	if len(counters) == 0 {
		return nil
	}
	return counters
}
// copyStringIntMap returns an independent copy of values, or nil when values
// is nil or empty.
func copyStringIntMap(values map[string]int) map[string]int {
	size := len(values)
	if size == 0 {
		return nil
	}
	clone := make(map[string]int, size)
	for name := range values {
		clone[name] = values[name]
	}
	return clone
}
// mergeStringIntMap adds every counter in source onto the counter of the same
// name in target. A nil target is left untouched (nothing can be written to
// it); an empty source is a no-op.
func mergeStringIntMap(target map[string]int, source map[string]int) {
	if target == nil {
		return
	}
	for name, delta := range source {
		target[name] = target[name] + delta
	}
}
// mergeMinStringIntMap folds source into target keeping, per name, the
// smallest positive value seen.
//
// Entries with a blank name or a value <= 0 are ignored. A name not yet in
// target is inserted; an existing value is replaced only by a strictly smaller
// one. A nil target is left untouched.
func mergeMinStringIntMap(target map[string]int, source map[string]int) {
	if target == nil {
		return
	}
	for name, candidate := range source {
		if candidate <= 0 || strings.TrimSpace(name) == "" {
			continue
		}
		if existing, seen := target[name]; seen && existing <= candidate {
			continue
		}
		target[name] = candidate
	}
}
// jsonUint64 returns raw[key] converted to uint64.
//
// uint64 values are returned unchanged; float64, int and int64 values are
// converted only when strictly positive. A nil map, a missing key, a
// non-positive number, or any other value type yields 0.
func jsonUint64(raw map[string]any, key string) uint64 {
	if raw == nil {
		return 0
	}
	entry := raw[key]
	if number, ok := entry.(uint64); ok {
		return number
	}
	if number, ok := entry.(float64); ok && number > 0 {
		return uint64(number)
	}
	if number, ok := entry.(int); ok && number > 0 {
		return uint64(number)
	}
	if number, ok := entry.(int64); ok && number > 0 {
		return uint64(number)
	}
	return 0
}
// ExpireFabricServiceChannelRouteFeedback expires stored route feedback for a
// service-channel route on behalf of an operator.
//
// The actor must pass the platform-admin check and the cluster must pass the
// mutability check. String inputs are whitespace-trimmed after those checks;
// ErrInvalidPayload is returned when the cluster ID or route ID is empty.
// input.Now defaults to the service clock. After the store call succeeds an
// audit event is recorded on a best-effort basis (its error is ignored).
func (s *Service) ExpireFabricServiceChannelRouteFeedback(ctx context.Context, input ExpireFabricServiceChannelRouteFeedbackInput) (ExpireFabricServiceChannelRouteFeedbackResult, error) {
	var none ExpireFabricServiceChannelRouteFeedbackResult
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return none, err
	}

	for _, field := range []*string{
		&input.ClusterID,
		&input.ReporterNodeID,
		&input.RouteID,
		&input.ServiceClass,
		&input.Reason,
	} {
		*field = strings.TrimSpace(*field)
	}
	if input.RouteID == "" || input.ClusterID == "" {
		return none, ErrInvalidPayload
	}
	if input.Now.IsZero() {
		input.Now = s.now()
	}

	result, err := s.store.ExpireFabricServiceChannelRouteFeedback(ctx, input)
	if err != nil {
		return none, err
	}

	auditFields := map[string]any{
		"reporter_node_id": input.ReporterNodeID,
		"route_id":         input.RouteID,
		"service_class":    input.ServiceClass,
		"reason":           input.Reason,
		"expired_count":    result.ExpiredCount,
		"expired_at":       result.ExpiredAt,
		"cooldown_until":   result.CooldownUntil,
	}
	// Marshal and audit failures are intentionally not surfaced: the expiry
	// itself has already succeeded.
	payload, _ := json.Marshal(auditFields)
	event := ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "fabric.service_channel_route_feedback.expired",
		TargetType:  "fabric_service_channel_route",
		TargetID:    &input.RouteID,
		Payload:     payload,
		CreatedAt:   input.Now.UTC(),
	}
	_ = s.store.RecordAudit(ctx, event)
	return result, nil
}
// CreateReleaseVersion validates, persists and signs a new release version,
// then records a best-effort audit event.
//
// The actor must pass the platform-admin check and the cluster must pass the
// mutability check. Product, channel, status and each artifact's OS, arch,
// install type and kind go through normalizeUpdateToken; channel defaults to
// "dev" and status to "active". ErrInvalidPayload is returned when a required
// field is empty, when the status is not one of active/draft/revoked, or when
// the compatibility document or any artifact metadata is not valid JSON.
func (s *Service) CreateReleaseVersion(ctx context.Context, input CreateReleaseVersionInput) (ReleaseVersion, error) {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return ReleaseVersion{}, err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return ReleaseVersion{}, err
	}

	input.Product = normalizeUpdateToken(input.Product)
	input.Version = strings.TrimSpace(input.Version)
	input.Channel = normalizeUpdateToken(firstNonEmptyString(input.Channel, "dev"))
	input.Status = normalizeUpdateToken(firstNonEmptyString(input.Status, "active"))
	if len(input.Artifacts) == 0 || input.ClusterID == "" || input.Product == "" || input.Version == "" {
		return ReleaseVersion{}, ErrInvalidPayload
	}
	switch input.Status {
	case "active", "draft", "revoked":
		// accepted lifecycle states
	default:
		return ReleaseVersion{}, ErrInvalidPayload
	}

	input.Compatibility = defaultJSON(input.Compatibility, `{}`)
	if !json.Valid(input.Compatibility) {
		return ReleaseVersion{}, ErrInvalidPayload
	}

	// Artifacts are normalized in place so the store receives the cleaned
	// values.
	for i := range input.Artifacts {
		artifact := &input.Artifacts[i]
		artifact.OS = normalizeUpdateToken(artifact.OS)
		artifact.Arch = normalizeUpdateToken(artifact.Arch)
		artifact.InstallType = normalizeUpdateToken(artifact.InstallType)
		artifact.Kind = normalizeUpdateToken(artifact.Kind)
		artifact.URL = strings.TrimSpace(artifact.URL)
		artifact.SHA256 = strings.TrimSpace(artifact.SHA256)
		artifact.Metadata = defaultJSON(artifact.Metadata, `{}`)

		incomplete := artifact.OS == "" || artifact.Arch == "" || artifact.InstallType == "" ||
			artifact.Kind == "" || artifact.URL == "" || artifact.SHA256 == ""
		if incomplete || !json.Valid(artifact.Metadata) {
			return ReleaseVersion{}, ErrInvalidPayload
		}
	}

	created, err := s.store.CreateReleaseVersion(ctx, input)
	if err != nil {
		return ReleaseVersion{}, err
	}
	signed, err := s.signReleaseVersion(ctx, created, &input.ActorUserID)
	if err != nil {
		return ReleaseVersion{}, err
	}

	// Audit is best-effort; a failure does not undo the created release.
	event := ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "release_version.created",
		TargetType:  "release_version",
		TargetID:    &signed.ID,
		Payload:     json.RawMessage(`{"production_forwarding":false}`),
		CreatedAt:   s.now(),
	}
	_ = s.store.RecordAudit(ctx, event)
	return signed, nil
}
// ListReleaseVersions returns the release versions of a cluster filtered by
// the normalized product and channel tokens. The actor must pass the
// platform-admin check.
func (s *Service) ListReleaseVersions(ctx context.Context, actorUserID, clusterID, product, channel string) ([]ReleaseVersion, error) {
	err := s.ensurePlatformAdmin(ctx, actorUserID)
	if err != nil {
		return nil, err
	}
	productToken := normalizeUpdateToken(product)
	channelToken := normalizeUpdateToken(channel)
	return s.store.ListReleaseVersions(ctx, clusterID, productToken, channelToken)
}
// UpsertNodeUpdatePolicy creates or updates the update policy of one node for
// one product and records a best-effort audit event.
//
// The actor must pass the platform-admin check and the cluster must pass the
// mutability check. Channel defaults to "dev" and strategy to "manual"; the
// strategy must be one of manual, canary, rolling or pinned. A non-positive
// health window is replaced by 180 seconds and a supplied target version is
// whitespace-trimmed. ErrInvalidPayload is returned for missing identifiers or
// an unknown strategy.
func (s *Service) UpsertNodeUpdatePolicy(ctx context.Context, input UpsertNodeUpdatePolicyInput) (NodeUpdatePolicy, error) {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return NodeUpdatePolicy{}, err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return NodeUpdatePolicy{}, err
	}

	input.Product = normalizeUpdateToken(input.Product)
	input.Channel = normalizeUpdateToken(firstNonEmptyString(input.Channel, "dev"))
	input.Strategy = normalizeUpdateToken(firstNonEmptyString(input.Strategy, "manual"))
	if input.ClusterID == "" || input.NodeID == "" || input.Product == "" {
		return NodeUpdatePolicy{}, ErrInvalidPayload
	}

	knownStrategy := input.Strategy == "manual" || input.Strategy == "canary" ||
		input.Strategy == "rolling" || input.Strategy == "pinned"
	if !knownStrategy {
		return NodeUpdatePolicy{}, ErrInvalidPayload
	}

	if input.HealthWindowSec <= 0 {
		input.HealthWindowSec = 180
	}
	if target := input.TargetVersion; target != nil {
		// Store a trimmed copy rather than mutating the caller's string.
		cleaned := strings.TrimSpace(*target)
		input.TargetVersion = &cleaned
	}

	policy, err := s.store.UpsertNodeUpdatePolicy(ctx, input)
	if err != nil {
		return NodeUpdatePolicy{}, err
	}

	// Audit is best-effort; its error is intentionally ignored.
	event := ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "node_update_policy.updated",
		TargetType:  "node",
		TargetID:    &input.NodeID,
		Payload:     json.RawMessage(`{"production_forwarding":false}`),
		CreatedAt:   s.now(),
	}
	_ = s.store.RecordAudit(ctx, event)
	return policy, nil
}
// GetNodeUpdatePlan computes and signs the update plan for one node and
// product.
//
// The product defaults to "rap-node-agent". ErrInvalidPayload is returned when
// cluster, node, product, OS, arch or install type is empty. Every successful
// outcome is passed through signNodeUpdatePlan. The possible outcomes are:
//   - no stored policy (pgx.ErrNoRows): action "none", reason "no_update_policy"
//   - policy disabled: action "none", reason "policy_disabled"
//   - host-agent platform mismatch: action "none", reason
//     "host_agent_artifact_platform_mismatch"
//   - no artifact selected: action "none", reason "no_matching_artifact"
//   - selected release equals the current version: action "none", reason
//     "already_current"
//   - otherwise: action "update", reason "matching_release_available"
//
// When the request has no channel the policy's channel is used.
func (s *Service) GetNodeUpdatePlan(ctx context.Context, input GetNodeUpdatePlanInput) (NodeUpdatePlan, error) {
	input.Product = normalizeUpdateToken(firstNonEmptyString(input.Product, "rap-node-agent"))
	input.Channel = normalizeUpdateToken(input.Channel)
	input.OS = normalizeUpdateToken(input.OS)
	input.Arch = normalizeUpdateToken(input.Arch)
	input.InstallType = normalizeUpdateToken(input.InstallType)
	input.CurrentVersion = strings.TrimSpace(input.CurrentVersion)
	input.ArtifactOrigin = normalizeArtifactOrigin(input.ArtifactOrigin)
	if input.ClusterID == "" || input.NodeID == "" || input.Product == "" || input.OS == "" || input.Arch == "" || input.InstallType == "" {
		return NodeUpdatePlan{}, ErrInvalidPayload
	}

	policy, err := s.store.GetNodeUpdatePolicy(ctx, input.ClusterID, input.NodeID, input.Product)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		// No policy row: answer with a signed no-op plan instead of an error.
		return s.signNodeUpdatePlan(ctx, NodeUpdatePlan{
			SchemaVersion:        "rap.node_update_plan.v1",
			ClusterID:            input.ClusterID,
			NodeID:               input.NodeID,
			Product:              input.Product,
			CurrentVersion:       input.CurrentVersion,
			Action:               "none",
			Reason:               "no_update_policy",
			ProductionForwarding: false,
		})
	case err != nil:
		return NodeUpdatePlan{}, err
	}
	if input.Channel == "" {
		input.Channel = policy.Channel
	}

	plan := NodeUpdatePlan{
		SchemaVersion:        "rap.node_update_plan.v1",
		ClusterID:            input.ClusterID,
		NodeID:               input.NodeID,
		Product:              input.Product,
		CurrentVersion:       input.CurrentVersion,
		Channel:              input.Channel,
		Strategy:             policy.Strategy,
		RollbackAllowed:      policy.RollbackAllowed,
		HealthWindowSec:      policy.HealthWindowSec,
		ProductionForwarding: false,
	}
	if !policy.Enabled {
		plan.Action, plan.Reason = "none", "policy_disabled"
		return s.signNodeUpdatePlan(ctx, plan)
	}

	mismatch, err := s.hostAgentPlatformMismatch(ctx, input)
	if err != nil {
		return NodeUpdatePlan{}, err
	}
	if mismatch {
		plan.Action, plan.Reason = "none", "host_agent_artifact_platform_mismatch"
		return s.signNodeUpdatePlan(ctx, plan)
	}

	releases, err := s.store.ListReleaseVersions(ctx, input.ClusterID, input.Product, input.Channel)
	if err != nil {
		return NodeUpdatePlan{}, err
	}
	release, artifact, found := selectReleaseArtifact(releases, input, policy)
	if !found {
		plan.Action, plan.Reason = "none", "no_matching_artifact"
		return s.signNodeUpdatePlan(ctx, plan)
	}

	plan.TargetVersion = release.Version
	resolved := absolutizeReleaseArtifact(artifact, input.ArtifactOrigin)
	plan.Artifact = &resolved

	plan.Action, plan.Reason = "update", "matching_release_available"
	if strings.TrimSpace(input.CurrentVersion) == release.Version {
		plan.Action, plan.Reason = "none", "already_current"
	}
	return s.signNodeUpdatePlan(ctx, plan)
}
// ReportNodeUpdateStatus stores an update status report for one node.
//
// The product defaults to "rap-node-agent"; product, phase and status are
// normalized with normalizeUpdateToken. ErrInvalidPayload is returned when a
// required field is empty or the payload is not valid JSON. An empty payload
// becomes `{}` and a zero ObservedAt is replaced by the service clock.
func (s *Service) ReportNodeUpdateStatus(ctx context.Context, input ReportNodeUpdateStatusInput) (NodeUpdateStatus, error) {
	input.Product = normalizeUpdateToken(firstNonEmptyString(input.Product, "rap-node-agent"))
	input.Phase = normalizeUpdateToken(input.Phase)
	input.Status = normalizeUpdateToken(input.Status)

	missingField := input.ClusterID == "" || input.NodeID == "" || input.Product == "" || input.Phase == "" || input.Status == ""
	if missingField {
		return NodeUpdateStatus{}, ErrInvalidPayload
	}

	input.Payload = defaultJSON(input.Payload, `{}`)
	if payloadOK := json.Valid(input.Payload); !payloadOK {
		return NodeUpdateStatus{}, ErrInvalidPayload
	}

	if input.ObservedAt.IsZero() {
		input.ObservedAt = s.now()
	}
	return s.store.ReportNodeUpdateStatus(ctx, input)
}
// ListNodeUpdateStatuses returns stored update status reports for one node.
// The actor must pass the platform-admin check; ErrInvalidPayload is returned
// when the cluster ID or node ID is empty. limit is forwarded to the store
// unchanged.
func (s *Service) ListNodeUpdateStatuses(ctx context.Context, actorUserID, clusterID, nodeID string, limit int) ([]NodeUpdateStatus, error) {
	err := s.ensurePlatformAdmin(ctx, actorUserID)
	if err != nil {
		return nil, err
	}
	if nodeID == "" || clusterID == "" {
		return nil, ErrInvalidPayload
	}
	return s.store.ListNodeUpdateStatuses(ctx, clusterID, nodeID, limit)
}
// GetNodeUpdateHint builds the update hint returned to a node.
//
// For each of rap-node-agent and rap-host-agent, a product counts as active
// when its policy loads without error, is enabled, and resolves to a
// non-empty target version. With no active product the hint has CheckNow
// false and reason "no_enabled_update_policy". Otherwise CheckNow is true,
// Products lists the active products in sorted order, and Generation is the
// first 16 hex characters of the SHA-256 over the sorted
// "product:version:policy-updated-at" entries joined with "|".
func (s *Service) GetNodeUpdateHint(ctx context.Context, clusterID, nodeID string) NodeUpdateHint {
	hint := NodeUpdateHint{
		SchemaVersion:       "rap.node_update_hint.v1",
		DeliveryMode:        "update_service_subscription",
		SubscriptionStatus:  "subscribed",
		UpdateService:       s.selectNodeUpdateService(ctx, clusterID, nodeID),
		FallbackPollSeconds: 21600,
	}

	var fingerprints, enabled []string
	for _, product := range [...]string{"rap-node-agent", "rap-host-agent"} {
		policy, err := s.store.GetNodeUpdatePolicy(ctx, clusterID, nodeID, product)
		if err != nil || !policy.Enabled {
			// Lookup failures are treated the same as "no policy".
			continue
		}
		version := strings.TrimSpace(updateHintTargetVersion(ctx, s, clusterID, product, policy))
		if version == "" {
			continue
		}
		stamp := policy.UpdatedAt.UTC().Format(time.RFC3339Nano)
		enabled = append(enabled, product)
		fingerprints = append(fingerprints, strings.Join([]string{product, version, stamp}, ":"))
	}

	if len(fingerprints) == 0 {
		hint.CheckNow = false
		hint.Reason = "no_enabled_update_policy"
		return hint
	}

	sort.Strings(fingerprints)
	sort.Strings(enabled)
	digest := sha256.Sum256([]byte(strings.Join(fingerprints, "|")))
	// Eight digest bytes encode to the same 16 hex characters as taking the
	// first 16 characters of the full hex string.
	hint.Generation = hex.EncodeToString(digest[:8])
	hint.CheckNow = true
	hint.Products = enabled
	hint.Reason = "enabled_update_policy"
	return hint
}
// selectNodeUpdateService picks the update service a node should use.
//
// When the candidate lookup fails or returns nothing, the result is a
// "control_plane_fallback" assignment that expires after two minutes.
// Otherwise the candidate whose NodeID equals nodeID is chosen if present,
// else the first candidate, and the assignment is marked "assigned" with a
// five-minute expiry.
func (s *Service) selectNodeUpdateService(ctx context.Context, clusterID, nodeID string) *NodeUpdateServiceAssignment {
	issuedAt := s.now()
	result := &NodeUpdateServiceAssignment{
		SchemaVersion: "rap.node_update_service_assignment.v1",
		Status:        "control_plane_fallback",
		Reason:        "no_healthy_update_cache_service",
		AssignedAt:    issuedAt,
		ExpiresAt:     issuedAt.Add(2 * time.Minute),
	}

	candidates, err := s.store.ListNodeUpdateServiceCandidates(ctx, clusterID)
	if err != nil || len(candidates) == 0 {
		return result
	}

	// Prefer the node's own entry; index 0 is the default choice.
	pick := 0
	for i := range candidates {
		if candidates[i].NodeID == nodeID {
			pick = i
			break
		}
	}
	chosen := candidates[pick]

	result.NodeID = chosen.NodeID
	result.NodeName = chosen.NodeName
	result.Endpoint = chosen.Endpoint
	result.Region = chosen.Region
	result.Status = "assigned"
	result.Reason = "healthy_update_cache_service"
	result.ExpiresAt = issuedAt.Add(5 * time.Minute)
	return result
}
// updateHintTargetVersion resolves the version an update hint should point at.
//
// A target version set on the policy is returned trimmed, even if it is
// blank. Otherwise the releases of the policy's channel are listed and the
// first one with status "active" and a non-blank version is returned. A
// listing error or no such release yields "".
func updateHintTargetVersion(ctx context.Context, s *Service, clusterID, product string, policy NodeUpdatePolicy) string {
	if pinned := policy.TargetVersion; pinned != nil {
		return strings.TrimSpace(*pinned)
	}
	releases, err := s.store.ListReleaseVersions(ctx, clusterID, product, policy.Channel)
	if err != nil {
		return ""
	}
	for i := range releases {
		if releases[i].Status != "active" {
			continue
		}
		if version := strings.TrimSpace(releases[i].Version); version != "" {
			return version
		}
	}
	return ""
}
// signReleaseVersion attaches an authority payload and signature to a release
// version.
//
// The cluster authority key is obtained via ensureClusterAuthority and the
// signed payload covers the cluster, release ID, product, version, channel and
// artifact count. On any error a zero ReleaseVersion is returned.
func (s *Service) signReleaseVersion(ctx context.Context, item ReleaseVersion, actorUserID *string) (ReleaseVersion, error) {
	key, err := s.ensureClusterAuthority(ctx, item.ClusterID, actorUserID)
	if err != nil {
		return ReleaseVersion{}, err
	}

	claims := map[string]any{
		"schema_version":        "rap.release_version_authority.v1",
		"cluster_id":            item.ClusterID,
		"release_id":            item.ID,
		"product":               item.Product,
		"version":               item.Version,
		"channel":               item.Channel,
		"artifact_count":        len(item.Artifacts),
		"control_plane_only":    true,
		"production_forwarding": false,
	}
	encoded, sig, err := clusterauth.SignPayload(key.PrivateKey, claims, s.now())
	if err != nil {
		return ReleaseVersion{}, err
	}

	item.AuthorityPayload = encoded
	item.AuthoritySignature = &sig
	return item, nil
}
// signNodeUpdatePlan attaches an authority payload and signature to a node
// update plan.
//
// The signed payload covers the cluster, node, product, current version,
// action and target version. "artifact_sha256" is always present (empty when
// the plan has no artifact); "artifact_url" is added only when an artifact is
// set. On any error a zero NodeUpdatePlan is returned.
func (s *Service) signNodeUpdatePlan(ctx context.Context, plan NodeUpdatePlan) (NodeUpdatePlan, error) {
	key, err := s.ensureClusterAuthority(ctx, plan.ClusterID, nil)
	if err != nil {
		return NodeUpdatePlan{}, err
	}

	claims := map[string]any{
		"schema_version":        "rap.node_update_plan_authority.v1",
		"cluster_id":            plan.ClusterID,
		"node_id":               plan.NodeID,
		"product":               plan.Product,
		"current_version":       plan.CurrentVersion,
		"action":                plan.Action,
		"target_version":        plan.TargetVersion,
		"artifact_sha256":       "",
		"control_plane_only":    true,
		"production_forwarding": false,
	}
	if artifact := plan.Artifact; artifact != nil {
		claims["artifact_sha256"] = artifact.SHA256
		claims["artifact_url"] = artifact.URL
	}

	encoded, sig, err := clusterauth.SignPayload(key.PrivateKey, claims, s.now())
	if err != nil {
		return NodeUpdatePlan{}, err
	}
	plan.AuthorityPayload = encoded
	plan.AuthoritySignature = &sig
	return plan, nil
}
// UpsertFabricTestingFlag creates or updates a fabric testing flag and records
// a best-effort audit event.
//
// The actor must pass the platform-admin check. The trimmed scope type must be
// "platform" (scope ID is cleared), "organization" or "node" (a non-blank
// scope ID is required); anything else, including an empty scope type, yields
// ErrInvalidPayload. A non-positive history retention becomes 24 hours, empty
// metadata becomes `{}`, and metadata that is not valid JSON is rejected with
// a descriptive error.
func (s *Service) UpsertFabricTestingFlag(ctx context.Context, input UpsertFabricTestingFlagInput) (FabricTestingFlag, error) {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return FabricTestingFlag{}, err
	}

	input.ScopeType = strings.TrimSpace(input.ScopeType)
	switch input.ScopeType {
	case "platform":
		// Platform-wide flags never carry a scope ID.
		input.ScopeID = nil
	case "organization", "node":
		if input.ScopeID == nil || strings.TrimSpace(*input.ScopeID) == "" {
			return FabricTestingFlag{}, ErrInvalidPayload
		}
	default:
		// Also covers the empty scope type.
		return FabricTestingFlag{}, ErrInvalidPayload
	}

	if input.HistoryRetentionHours <= 0 {
		input.HistoryRetentionHours = 24
	}
	input.Metadata = defaultJSON(input.Metadata, `{}`)
	if !json.Valid(input.Metadata) {
		return FabricTestingFlag{}, errors.New("testing flag metadata must be valid json")
	}

	flag, err := s.store.UpsertFabricTestingFlag(ctx, input)
	if err != nil {
		return FabricTestingFlag{}, err
	}

	// Audit is best-effort; its error is intentionally ignored.
	event := ClusterAuditEvent{
		ClusterID:   input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "fabric.testing_flag.updated",
		TargetType:  input.ScopeType,
		TargetID:    input.ScopeID,
		Payload:     json.RawMessage(`{"runtime_mesh_enabled":false}`),
		CreatedAt:   s.now(),
	}
	_ = s.store.RecordAudit(ctx, event)
	return flag, nil
}
// ListFabricTestingFlags returns all stored fabric testing flags. The actor
// must pass the platform-admin check.
func (s *Service) ListFabricTestingFlags(ctx context.Context, actorUserID string) ([]FabricTestingFlag, error) {
	err := s.ensurePlatformAdmin(ctx, actorUserID)
	if err != nil {
		return nil, err
	}
	return s.store.ListFabricTestingFlags(ctx)
}
// GetEffectiveNodeTestingFlags returns the effective testing flags for one
// node as resolved by the store. ErrInvalidPayload is returned when the
// cluster ID or node ID is empty. No actor check is performed here.
func (s *Service) GetEffectiveNodeTestingFlags(ctx context.Context, clusterID, nodeID string) (EffectiveNodeTestingFlags, error) {
	if nodeID == "" || clusterID == "" {
		return EffectiveNodeTestingFlags{}, ErrInvalidPayload
	}
	return s.store.GetEffectiveNodeTestingFlags(ctx, clusterID, nodeID)
}
// IssueFabricServiceChannelLease builds, signs (best-effort), caches and
// persists a short-lived service-channel lease.
//
// Steps, in order:
//  1. Trim/normalize the input and reject it with ErrInvalidPayload when the
//     cluster, organization, user or service class is empty, when either node
//     list is empty, or when the service class is not allowed.
//  2. Resolve the TTL: non-positive values become one minute and values above
//     five minutes are capped at five minutes.
//  3. Load the cluster (pgx.ErrNoRows maps to ErrInvalidCluster), derive the
//     pool policy, and compute the effective entry/exit pools and the
//     preferred entry/exit nodes.
//  4. Load route intents and route feedback, build candidate routes and pick a
//     primary route plus alternates.
//  5. When no primary route exists, synthesize a placeholder route with status
//     "missing_route_intent" and mark the lease as degraded fallback or
//     "blocked_no_fabric_route", depending on whether the pool policy allows
//     backend fallback.
//  6. Generate the channel ID and bearer token, assemble the lease, sign it,
//     remember it in the in-memory cache and store it.
//
// Store errors are returned unchanged.
func (s *Service) IssueFabricServiceChannelLease(ctx context.Context, input IssueFabricServiceChannelLeaseInput) (FabricServiceChannelLease, error) {
input.ClusterID = strings.TrimSpace(input.ClusterID)
input.OrganizationID = strings.TrimSpace(input.OrganizationID)
input.UserID = strings.TrimSpace(input.UserID)
input.ResourceID = strings.TrimSpace(input.ResourceID)
input.ServiceClass = normalizeFabricServiceClass(input.ServiceClass)
input.EntryNodeIDs = dedupeStrings(input.EntryNodeIDs)
input.ExitNodeIDs = dedupeStrings(input.ExitNodeIDs)
input.PreferredEntryNodeID = strings.TrimSpace(input.PreferredEntryNodeID)
input.PreferredExitNodeID = strings.TrimSpace(input.PreferredExitNodeID)
// ResourceID is not part of the required-field check below.
if input.ClusterID == "" || input.OrganizationID == "" || input.UserID == "" || input.ServiceClass == "" || len(input.EntryNodeIDs) == 0 || len(input.ExitNodeIDs) == 0 {
return FabricServiceChannelLease{}, ErrInvalidPayload
}
if !isAllowedFabricServiceClass(input.ServiceClass) {
return FabricServiceChannelLease{}, ErrInvalidPayload
}
// TTL defaults to one minute and is capped at five minutes.
ttl := input.TTL
if ttl <= 0 {
ttl = time.Minute
}
if ttl > 5*time.Minute {
ttl = 5 * time.Minute
}
now := s.now().UTC()
expiresAt := now.Add(ttl)
// The route generation label is derived from the issue timestamp.
routeGeneration := "fsc-" + now.Format("20060102T150405.000000000Z")
allowedChannels := normalizeFabricServiceChannels(input.AllowedChannels, input.ServiceClass)
requiredRoles := normalizeFabricRequiredRoles(input.RequiredRoles, input.ServiceClass)
cluster, err := s.store.GetCluster(ctx, input.ClusterID)
if errors.Is(err, pgx.ErrNoRows) {
return FabricServiceChannelLease{}, ErrInvalidCluster
}
if err != nil {
return FabricServiceChannelLease{}, err
}
poolPolicy := fabricServiceChannelPoolPolicyFromCluster(cluster)
entryNodeIDs := fabricServiceChannelEffectivePool(input.EntryNodeIDs, poolPolicy.EntryPoolNodeIDs)
exitNodeIDs := fabricServiceChannelEffectivePool(input.ExitNodeIDs, poolPolicy.ExitPoolNodeIDs)
if len(entryNodeIDs) == 0 || len(exitNodeIDs) == 0 {
return FabricServiceChannelLease{}, ErrInvalidPayload
}
// The pool policy's preferred node is listed before the caller's preference;
// presumably firstNonEmptyString gives the policy precedence (helper is
// defined elsewhere, confirm).
selectedEntry := selectFabricServiceChannelPreferredNode(entryNodeIDs, firstNonEmptyString(poolPolicy.PreferredEntryNodeID, input.PreferredEntryNodeID))
selectedExit := selectFabricServiceChannelPreferredNode(exitNodeIDs, firstNonEmptyString(poolPolicy.PreferredExitNodeID, input.PreferredExitNodeID))
if selectedEntry == "" || selectedExit == "" {
return FabricServiceChannelLease{}, ErrInvalidPayload
}
intents, err := s.store.ListRouteIntents(ctx, input.ClusterID)
if err != nil {
return FabricServiceChannelLease{}, err
}
recoveryPolicy := s.fabricServiceChannelRecoveryPolicy(ctx, input.ClusterID)
routeProvenance := fabricServiceChannelRouteProvenanceFromIntents(intents)
feedback, err := s.fabricServiceChannelRouteFeedback(ctx, input.ClusterID, entryNodeIDs, now, recoveryPolicy, routeProvenance)
if err != nil {
return FabricServiceChannelLease{}, err
}
routes := fabricServiceChannelRoutesFromIntents(intents, input.ServiceClass, entryNodeIDs, exitNodeIDs, allowedChannels, routeGeneration, now, expiresAt, feedback, recoveryPolicy)
primary, alternates := selectFabricServicePrimaryRoute(routes, selectedEntry, selectedExit)
// When a primary route was found, align the selected entry/exit with the
// route's endpoints, provided those endpoints belong to the effective pools.
if primary.RouteID != "" && containsString(entryNodeIDs, primary.SourceNodeID) {
selectedEntry = primary.SourceNodeID
}
if primary.RouteID != "" && containsString(exitNodeIDs, primary.DestinationNodeID) {
selectedExit = primary.DestinationNodeID
}
fallback := FabricServiceChannelFallback{
Allowed: true,
Transport: "backend_relay",
BackendRelay: true,
Compatibility: true,
Reason: "compatibility_fallback_available",
}
// Allowed and BackendRelay from the literal above are overridden by the pool
// policy here.
fallback.Allowed = poolPolicy.BackendFallbackAllowed
fallback.BackendRelay = poolPolicy.BackendFallbackAllowed
status := FabricServiceChannelStatusReady
if primary.RouteID == "" {
// No usable fabric route: either fall back to the backend relay or block,
// depending on the pool policy.
if poolPolicy.BackendFallbackAllowed {
status = FabricServiceChannelStatusDegradedFallback
fallback.Active = true
fallback.Degraded = true
fallback.Reason = "no_authorized_fabric_route_for_selected_entry_exit"
} else {
status = "blocked_no_fabric_route"
fallback.Active = false
fallback.Degraded = true
fallback.Reason = "backend_fallback_disabled_by_pool_policy"
}
// A fenced route for the selected pair, or for the pool, overrides the
// reason chosen above; this applies in the blocked branch as well.
if fabricServiceRoutesFencedForSelectedPair(routes, selectedEntry, selectedExit) {
fallback.Reason = "fabric_route_rebuild_pending_backend_relay"
} else if fabricServiceRoutesFencedForPool(routes) {
fallback.Reason = "fabric_entry_exit_pool_rebuild_pending_backend_relay"
}
// Placeholder route describing the selected entry/exit pair; it carries no
// RouteID and is marked "missing_route_intent".
primary = FabricServiceChannelRoute{
ClusterID: input.ClusterID,
ServiceClass: input.ServiceClass,
SourceNodeID: selectedEntry,
DestinationNodeID: selectedExit,
Hops: []string{selectedEntry, selectedExit},
AllowedChannels: allowedChannels,
Generation: routeGeneration,
Status: "missing_route_intent",
RecoveryPolicy: fabricServiceChannelRecoveryPolicyRef(recoveryPolicy),
PathScore: 1,
ScoreReasons: []string{"fallback_until_fabric_route_exists"},
ExpiresAt: expiresAt,
}
} else {
fallback.Active = false
fallback.Degraded = false
}
// Fall back to a timestamp-based channel ID when uuidLikeRandom returns "".
channelID := uuidLikeRandom()
if channelID == "" {
channelID = "fabric-channel-" + now.Format("20060102T150405.000000000Z")
}
// NOTE(review): when uuidLikeRandom returns "" for the token, the bearer
// token is derived from channelID, which is also published as
// lease.ChannelID. Confirm this fallback is acceptable for a bearer secret.
token := uuidLikeRandom()
if token == "" {
token = channelID
}
lease := FabricServiceChannelLease{
SchemaVersion: "rap.fabric_service_channel_lease.v1",
ChannelID: channelID,
ClusterID: input.ClusterID,
OrganizationID: input.OrganizationID,
UserID: input.UserID,
ResourceID: input.ResourceID,
ServiceClass: input.ServiceClass,
Status: status,
SelectedEntryNodeID: selectedEntry,
SelectedExitNodeID: selectedExit,
EntryPool: fabricServiceChannelNodePool(entryNodeIDs, "entry", selectedEntry),
ExitPool: fabricServiceChannelNodePool(exitNodeIDs, "exit", selectedExit),
RequiredRoles: requiredRoles,
AllowedChannels: allowedChannels,
PrimaryRoute: primary,
AlternateRoutes: alternates,
RecoveryPolicy: fabricServiceChannelRecoveryPolicyRef(recoveryPolicy),
PoolPolicy: fabricServiceChannelPoolPolicyRef(poolPolicy),
DataPlane: fabricServiceChannelDataPlaneContract(input.ServiceClass, poolPolicy, fallback),
QoS: defaultJSON(input.QoS, defaultFabricServiceQoS(input.ServiceClass)),
Failover: defaultJSON(input.Failover, fabricServiceFailoverFromPoolPolicy(poolPolicy)),
Fallback: fallback,
Token: FabricServiceChannelToken{
Type: "control_plane_issued_bearer",
Token: "rap_fsc_" + strings.ReplaceAll(token, "-", ""),
TTLSeconds: int(ttl.Seconds()),
IntrospectionPath: "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/introspect",
},
EntryHTTP: fabricServiceChannelHTTPIngress(input.ServiceClass),
RouteGeneration: routeGeneration,
FencingEpoch: now.UnixNano(),
IssuedAt: now,
ExpiresAt: expiresAt,
Metadata: defaultJSON(input.Metadata, `{}`),
}
// Signing is best-effort: on error the unsigned lease is used and the error
// is dropped.
if signed, err := s.signFabricServiceChannelLease(ctx, lease); err == nil {
lease = signed
}
// The lease is cached in memory before it is persisted; if the store call
// below fails, an error is returned but the cache entry stays in place.
s.rememberFabricServiceChannelLease(lease)
if _, err := s.store.StoreFabricServiceChannelLease(ctx, StoreFabricServiceChannelLeaseInput{
Lease: lease,
TokenHash: fabricServiceChannelTokenHash(lease.Token.Token),
}); err != nil {
return FabricServiceChannelLease{}, err
}
return lease, nil
}
// rememberFabricServiceChannelLease stores a lease in the in-memory lease
// cache under its cluster/channel key.
//
// Leases with a blank cluster ID, channel ID or token are ignored. While
// holding the cache mutex, every cached lease whose non-zero expiry is not
// after the current time is evicted before the new lease is inserted.
func (s *Service) rememberFabricServiceChannelLease(lease FabricServiceChannelLease) {
	for _, required := range [...]string{lease.ClusterID, lease.ChannelID, lease.Token.Token} {
		if strings.TrimSpace(required) == "" {
			return
		}
	}

	current := s.now()
	if current.IsZero() {
		current = time.Now().UTC()
	}

	s.fabricServiceChannelLeaseMu.Lock()
	defer s.fabricServiceChannelLeaseMu.Unlock()

	if s.fabricServiceChannelLeaseCache == nil {
		s.fabricServiceChannelLeaseCache = map[string]FabricServiceChannelLease{}
	}
	for cacheKey, cached := range s.fabricServiceChannelLeaseCache {
		// Entries without an expiry, or still valid, are kept.
		if cached.ExpiresAt.IsZero() || cached.ExpiresAt.After(current) {
			continue
		}
		delete(s.fabricServiceChannelLeaseCache, cacheKey)
	}
	s.fabricServiceChannelLeaseCache[fabricServiceChannelLeaseCacheKey(lease.ClusterID, lease.ChannelID)] = lease
}
// IntrospectFabricServiceChannelLease checks a presented lease token against
// the issued lease and reports whether the channel may be used.
//
// The lease is looked up in the in-memory cache first and in the store
// second; expired leases count as not found. The result is returned with a
// nil error for every denial. The denial reasons, in the order they are
// checked, are: "lease_not_found", "lease_token_mismatch",
// "resource_mismatch", "service_class_mismatch", "channel_class_not_allowed"
// and "entry_node_mismatch". On success Allowed is true and either
// ForceBackendFallback or PreferredRouteID is set.
//
// ErrInvalidPayload is returned when the cluster ID, channel ID or token is
// empty; store errors other than pgx.ErrNoRows are returned unchanged.
//
// NOTE(review): lease details (selected nodes, primary route, data plane,
// fencing epoch, ...) are copied into the result before the token is checked,
// so a "lease_token_mismatch" denial still carries them. Confirm this is
// intended for callers that present a wrong token.
func (s *Service) IntrospectFabricServiceChannelLease(ctx context.Context, input IntrospectFabricServiceChannelLeaseInput) (FabricServiceChannelLeaseIntrospection, error) {
input.ClusterID = strings.TrimSpace(input.ClusterID)
input.ChannelID = strings.TrimSpace(input.ChannelID)
input.ResourceID = strings.TrimSpace(input.ResourceID)
input.ServiceClass = normalizeFabricServiceClass(input.ServiceClass)
input.ChannelClass = strings.TrimSpace(strings.ToLower(input.ChannelClass))
input.Token = strings.TrimSpace(input.Token)
input.EntryNodeID = strings.TrimSpace(input.EntryNodeID)
if input.ClusterID == "" || input.ChannelID == "" || input.Token == "" {
return FabricServiceChannelLeaseIntrospection{}, ErrInvalidPayload
}
now := s.now()
if now.IsZero() {
now = time.Now().UTC()
}
// Cache lookup under the mutex; an expired cache entry is evicted and treated
// as a miss.
s.fabricServiceChannelLeaseMu.Lock()
lease, ok := s.fabricServiceChannelLeaseCache[fabricServiceChannelLeaseCacheKey(input.ClusterID, input.ChannelID)]
tokenHash := ""
if ok && !lease.ExpiresAt.IsZero() && !lease.ExpiresAt.After(now) {
delete(s.fabricServiceChannelLeaseCache, fabricServiceChannelLeaseCacheKey(input.ClusterID, input.ChannelID))
ok = false
}
if ok {
tokenHash = fabricServiceChannelTokenHash(lease.Token.Token)
}
// The mutex is released before the store fallback; rememberFabricServiceChannelLease
// below acquires the same mutex itself.
s.fabricServiceChannelLeaseMu.Unlock()
if !ok {
// Cache miss: fall back to the persisted record. pgx.ErrNoRows is treated as
// "not found" rather than as an error.
record, err := s.store.GetFabricServiceChannelLease(ctx, input.ClusterID, input.ChannelID)
if err != nil && !errors.Is(err, pgx.ErrNoRows) {
return FabricServiceChannelLeaseIntrospection{}, err
}
if err == nil {
lease = record.Lease
// For stored leases the persisted token hash is used instead of hashing
// lease.Token.Token.
tokenHash = strings.TrimSpace(record.TokenHash)
if !lease.ExpiresAt.IsZero() && !lease.ExpiresAt.After(now) {
ok = false
} else {
ok = true
s.rememberFabricServiceChannelLease(lease)
}
}
}
// Start from a denied result that echoes the request; it is returned as-is
// when no live lease was found.
out := FabricServiceChannelLeaseIntrospection{
SchemaVersion: "rap.fabric_service_channel_introspection.v1",
ClusterID: input.ClusterID,
ChannelID: input.ChannelID,
ResourceID: input.ResourceID,
ServiceClass: input.ServiceClass,
AcceptedBy: "introspection",
Status: "denied",
Reason: "lease_not_found",
}
if !ok {
return out, nil
}
// From here on the result reflects the lease's own values, not the request's.
out.ResourceID = lease.ResourceID
out.ServiceClass = lease.ServiceClass
out.SelectedEntryNodeID = lease.SelectedEntryNodeID
out.SelectedExitNodeID = lease.SelectedExitNodeID
// Copy the slice so the result does not alias the cached lease's slice.
out.AllowedChannels = append([]string{}, lease.AllowedChannels...)
out.LeaseStatus = lease.Status
out.PrimaryRoute = lease.PrimaryRoute
out.DataPlane = lease.DataPlane
out.RouteGeneration = lease.RouteGeneration
out.FencingEpoch = lease.FencingEpoch
out.ExpiresAt = lease.ExpiresAt
// Token check: an empty stored hash never matches.
if lease.ClusterID != input.ClusterID ||
lease.ChannelID != input.ChannelID ||
tokenHash == "" ||
tokenHash != fabricServiceChannelTokenHash(input.Token) {
out.Reason = "lease_token_mismatch"
return out, nil
}
// The resource check applies only when both the lease and the request name a
// resource.
if lease.ResourceID != "" && input.ResourceID != "" && lease.ResourceID != input.ResourceID {
out.Reason = "resource_mismatch"
return out, nil
}
if input.ServiceClass != "" && lease.ServiceClass != input.ServiceClass {
out.Reason = "service_class_mismatch"
return out, nil
}
if input.ChannelClass != "" && !containsString(lease.AllowedChannels, input.ChannelClass) {
out.Reason = "channel_class_not_allowed"
return out, nil
}
// The entry-node check applies only when both sides name an entry node.
if input.EntryNodeID != "" && lease.SelectedEntryNodeID != "" && lease.SelectedEntryNodeID != input.EntryNodeID {
out.Reason = "entry_node_mismatch"
return out, nil
}
out.Allowed = true
out.Status = "allowed"
out.Reason = "lease_introspection_allowed"
// Leases without a real fabric route force the backend fallback; otherwise
// the primary route is advertised as the preferred route.
if lease.Status == FabricServiceChannelStatusDegradedFallback || lease.PrimaryRoute.Status == "missing_route_intent" {
out.ForceBackendFallback = true
} else {
out.PreferredRouteID = strings.TrimSpace(lease.PrimaryRoute.RouteID)
}
return out, nil
}
// ListFabricServiceChannelLeases returns a maintenance view of the stored
// service-channel leases of a cluster.
//
// The actor must pass the platform-admin check and the cluster ID must be
// non-empty (ErrInvalidPayload otherwise). A limit outside 1..500 is replaced
// by 100. Each record is summarized relative to the resolved "now" (input.Now,
// else the service clock, else the wall clock) and counted as expired or
// active. The view is reported as "degraded" when at least one expired lease
// was seen.
func (s *Service) ListFabricServiceChannelLeases(ctx context.Context, actorUserID string, input ListFabricServiceChannelLeasesInput) (FabricServiceChannelLeaseMaintenance, error) {
	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return FabricServiceChannelLeaseMaintenance{}, err
	}

	input.ClusterID = strings.TrimSpace(input.ClusterID)
	input.ServiceClass = normalizeFabricServiceClass(input.ServiceClass)
	input.EntryNodeID = strings.TrimSpace(input.EntryNodeID)
	input.ResourceID = strings.TrimSpace(input.ResourceID)
	if input.ClusterID == "" {
		return FabricServiceChannelLeaseMaintenance{}, ErrInvalidPayload
	}
	if limit := input.Limit; limit <= 0 || limit > 500 {
		input.Limit = 100
	}

	observedAt := input.Now
	if observedAt.IsZero() {
		observedAt = s.now()
	}
	if observedAt.IsZero() {
		observedAt = time.Now().UTC()
	}

	// input.Now is forwarded to the store unchanged; only the local clock
	// value above receives the fallbacks.
	records, err := s.store.ListFabricServiceChannelLeases(ctx, input)
	if err != nil {
		return FabricServiceChannelLeaseMaintenance{}, err
	}

	view := FabricServiceChannelLeaseMaintenance{
		SchemaVersion: "rap.fabric_service_channel_lease_maintenance.v1",
		ClusterID:     input.ClusterID,
		Status:        "ready",
		Reason:        "lease_maintenance_ready",
		ObservedAt:    observedAt.UTC(),
		WindowLimit:   input.Limit,
	}
	for i := range records {
		summary := fabricServiceChannelLeaseSummaryFromRecord(records[i], observedAt)
		view.Leases = append(view.Leases, summary)
		if summary.Expired {
			view.ExpiredCount++
			continue
		}
		view.ActiveCount++
	}
	view.ScannedCount = len(view.Leases)

	if view.ExpiredCount > 0 {
		view.Status = "degraded"
		view.Reason = "expired_leases_pending_cleanup"
		view.RecommendedOperatorAction = "Run service-channel lease cleanup to remove expired compatibility lease records."
	}
	return view, nil
}
// CleanupFabricServiceChannelLeases deletes expired service-channel leases for
// a cluster through the repository and then reports the remaining lease state.
//
// The call is rejected unless ensurePlatformAdmin accepts input.ActorUserID.
// An empty (trimmed) ClusterID yields ErrInvalidPayload. A Limit that is
// non-positive or above 1000 falls back to 100. The reference instant is
// input.Now when set, then the service clock, then the wall clock.
//
// After the delete, the leases are listed again (expired ones included) with
// the same limit and instant. The returned report carries the number of
// deleted records and is "ready" when no expired lease is left in the listed
// window, otherwise "degraded" with a hint to run cleanup again.
func (s *Service) CleanupFabricServiceChannelLeases(ctx context.Context, input CleanupFabricServiceChannelLeasesInput) (FabricServiceChannelLeaseMaintenance, error) {
	var none FabricServiceChannelLeaseMaintenance

	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}
	clusterID := strings.TrimSpace(input.ClusterID)
	if clusterID == "" {
		return none, ErrInvalidPayload
	}
	limit := input.Limit
	if limit <= 0 || limit > 1000 {
		limit = 100
	}
	reference := input.Now
	if reference.IsZero() {
		if reference = s.now(); reference.IsZero() {
			reference = time.Now().UTC()
		}
	}

	removed, err := s.store.CleanupExpiredFabricServiceChannelLeases(ctx, clusterID, reference.UTC(), limit)
	if err != nil {
		return none, err
	}

	// Re-list with expired leases included so leftovers are visible to the caller.
	listing := ListFabricServiceChannelLeasesInput{
		ClusterID:      clusterID,
		IncludeExpired: true,
		Limit:          limit,
		Now:            reference,
	}
	report, err := s.ListFabricServiceChannelLeases(ctx, input.ActorUserID, listing)
	if err != nil {
		return none, err
	}

	report.DeletedExpiredCount = removed
	if report.ExpiredCount > 0 {
		report.Status = "degraded"
		report.Reason = "expired_leases_remaining"
		report.RecommendedOperatorAction = "Run cleanup again; expired leases remain beyond the bounded cleanup window."
	} else {
		report.Status = "ready"
		report.Reason = "expired_leases_cleaned"
		report.RecommendedOperatorAction = ""
	}
	return report, nil
}
// GetFabricServiceChannelAccessTelemetry assembles a cluster-wide view of
// service-channel access telemetry.
//
// The call is rejected unless ensurePlatformAdmin accepts actorUserID. An
// empty (trimmed) ClusterID yields ErrInvalidPayload. A Limit that is
// non-positive or above 200 falls back to 100. The observation instant is
// input.Now when set, then the service clock, then the wall clock.
//
// The result combines three sources read from the store:
//   - per-node "fabric_service_channel_access_report" objects, looked up first
//     in node telemetry payloads and, when none is found, in heartbeat metadata;
//   - non-expired route feedback for FabricServiceClassVPNPackets;
//   - non-expired leases, each turned into an active-channel entry that is
//     correlated with its entry node report and primary-route feedback and
//     then passed through the remediation helpers.
//
// Errors from listing node telemetry or heartbeats are tolerated per node.
// Errors from listing nodes, route feedback or leases are returned.
func (s *Service) GetFabricServiceChannelAccessTelemetry(ctx context.Context, actorUserID string, input GetFabricServiceChannelAccessTelemetryInput) (FabricServiceChannelAccessTelemetry, error) {
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
return FabricServiceChannelAccessTelemetry{}, err
}
input.ClusterID = strings.TrimSpace(input.ClusterID)
if input.ClusterID == "" {
return FabricServiceChannelAccessTelemetry{}, ErrInvalidPayload
}
if input.Limit <= 0 || input.Limit > 200 {
input.Limit = 100
}
now := input.Now
if now.IsZero() {
now = s.now()
}
if now.IsZero() {
now = time.Now().UTC()
}
nodes, err := s.store.ListClusterNodes(ctx, input.ClusterID)
if err != nil {
return FabricServiceChannelAccessTelemetry{}, err
}
out := FabricServiceChannelAccessTelemetry{
SchemaVersion: "rap.fabric_service_channel_access_telemetry.v1",
ClusterID: input.ClusterID,
Status: "ready",
Reason: "access_telemetry_ready",
ObservedAt: now.UTC(),
NodeCount: len(nodes),
TrafficClassCounts: map[string]int{},
RecommendedParallelWindows: map[string]int{},
}
// Pass 1: build one report per node that published an access report and fold
// it into the cluster totals. The limit caps the number of reporting nodes.
for _, node := range nodes {
if len(out.Nodes) >= input.Limit {
break
}
items, err := s.store.ListNodeTelemetry(ctx, input.ClusterID, node.ID, 5)
if err != nil {
// Best effort: a node whose telemetry cannot be read is skipped.
continue
}
report := map[string]any{}
var observedAt time.Time
// Use the first telemetry item (in store order) that carries the report.
for _, item := range items {
payload := jsonObject(item.Payload)
report = jsonMapPath(payload, "fabric_service_channel_access_report")
if len(report) > 0 {
observedAt = item.ObservedAt
break
}
}
if len(report) == 0 {
// Fall back to heartbeat metadata when telemetry carried no report.
heartbeats, err := s.store.ListNodeHeartbeats(ctx, input.ClusterID, node.ID, 5)
if err == nil {
for _, heartbeat := range heartbeats {
payload := jsonObject(heartbeat.Metadata)
report = jsonMapPath(payload, "fabric_service_channel_access_report")
if len(report) > 0 {
observedAt = heartbeat.ObservedAt
break
}
}
}
}
if len(report) == 0 {
continue
}
nodeReport := FabricServiceChannelAccessTelemetryNode{
NodeID: node.ID,
NodeName: node.Name,
ObservedAt: observedAt,
TotalAccepted: jsonInt(report, "total"),
SignedAccepted: jsonInt(report, "signed"),
IntrospectionAccepted: jsonInt(report, "introspection"),
LegacyUnsignedAccepted: jsonInt(report, "legacy_unsigned"),
BackendFallbackCount: jsonInt(report, "backend_fallback"),
BackendFallbackBlockedCount: jsonInt(report, "backend_fallback_blocked"),
FabricRouteSendFailureCount: jsonInt(report, "fabric_route_send_failure"),
DataPlaneContractCount: jsonInt(report, "data_plane_contract"),
LastDataPlaneMode: jsonString(report, "last_data_plane_mode"),
LastWorkingDataTransport: jsonString(report, "last_working_data_transport"),
LastSteadyStateTransport: jsonString(report, "last_steady_state_transport"),
LastBackendRelayPolicy: jsonString(report, "last_backend_relay_policy"),
LastLogicalFlowMode: jsonString(report, "last_logical_flow_mode"),
LastDataPlaneViolationStatus: jsonString(report, "last_data_plane_violation_status"),
LastDataPlaneViolationReason: jsonString(report, "last_data_plane_violation_reason"),
}
// When a counter is zero under its short key, retry the "accepted_by_*" key.
if nodeReport.SignedAccepted == 0 {
nodeReport.SignedAccepted = jsonInt(report, "accepted_by_signed")
}
if nodeReport.IntrospectionAccepted == 0 {
nodeReport.IntrospectionAccepted = jsonInt(report, "accepted_by_introspection")
}
if nodeReport.LegacyUnsignedAccepted == 0 {
nodeReport.LegacyUnsignedAccepted = jsonInt(report, "accepted_by_legacy_unsigned")
}
// last_accepted_at is expected in RFC 3339 form; unparsable values are ignored.
if value := jsonString(report, "last_accepted_at"); value != "" {
if parsed, err := time.Parse(time.RFC3339Nano, value); err == nil {
nodeReport.LastAcceptedAt = &parsed
if out.LatestAcceptedAt == nil || parsed.After(*out.LatestAcceptedAt) {
latest := parsed
out.LatestAcceptedAt = &latest
}
}
}
// Flow-scheduler figures come from the first heartbeat of a single-item listing.
if heartbeats, err := s.store.ListNodeHeartbeats(ctx, input.ClusterID, node.ID, 1); err == nil && len(heartbeats) > 0 {
flowScheduler := fabricServiceChannelFlowSchedulerFromHeartbeat(heartbeats[0])
nodeReport.TrafficClassCounts = jsonStringIntMap(flowScheduler, "traffic_class_counts")
nodeReport.FlowChannelCount = jsonInt(flowScheduler, "channel_count")
nodeReport.FlowDropped = jsonInt(flowScheduler, "dropped")
nodeReport.FlowHighWatermark = jsonInt(flowScheduler, "high_watermark")
nodeReport.FlowMaxInFlight = jsonInt(flowScheduler, "max_in_flight")
nodeReport.RecommendedParallelWindows = jsonStringIntMap(flowScheduler, "recommended_parallel_windows")
nodeReport.AdaptiveBackpressureActive = jsonBool(flowScheduler, "adaptive_backpressure_active")
nodeReport.AdaptiveBackpressureReason = jsonString(flowScheduler, "adaptive_backpressure_reason")
nodeReport.AdaptivePolicyFingerprint = jsonString(flowScheduler, "adaptive_policy_fingerprint")
}
// Node-level flow health has no route-quality inputs, hence the zero arguments.
nodeReport.FlowHealthStatus, nodeReport.FlowHealthReason, _ = fabricServiceChannelFlowHealth(
nodeReport.TrafficClassCounts,
nodeReport.FlowDropped,
nodeReport.FlowHighWatermark,
nodeReport.FlowMaxInFlight,
nodeReport.BackendFallbackCount,
0,
0,
0,
0,
)
out.ReportingNodeCount++
out.TotalAccepted += nodeReport.TotalAccepted
out.SignedAccepted += nodeReport.SignedAccepted
out.IntrospectionAccepted += nodeReport.IntrospectionAccepted
out.LegacyUnsignedAccepted += nodeReport.LegacyUnsignedAccepted
out.BackendFallbackCount += nodeReport.BackendFallbackCount
out.BackendFallbackBlockedCount += nodeReport.BackendFallbackBlockedCount
out.FabricRouteSendFailureCount += nodeReport.FabricRouteSendFailureCount
out.DataPlaneContractCount += nodeReport.DataPlaneContractCount
// Cluster-level "Last*" fields keep the first non-empty value in node iteration order.
if out.LastDataPlaneMode == "" {
out.LastDataPlaneMode = nodeReport.LastDataPlaneMode
}
if out.LastWorkingDataTransport == "" {
out.LastWorkingDataTransport = nodeReport.LastWorkingDataTransport
}
if out.LastSteadyStateTransport == "" {
out.LastSteadyStateTransport = nodeReport.LastSteadyStateTransport
}
if out.LastBackendRelayPolicy == "" {
out.LastBackendRelayPolicy = nodeReport.LastBackendRelayPolicy
}
if out.LastLogicalFlowMode == "" {
out.LastLogicalFlowMode = nodeReport.LastLogicalFlowMode
}
if out.LastDataPlaneViolationStatus == "" {
out.LastDataPlaneViolationStatus = nodeReport.LastDataPlaneViolationStatus
}
if out.LastDataPlaneViolationReason == "" {
out.LastDataPlaneViolationReason = nodeReport.LastDataPlaneViolationReason
}
// NOTE(review): merge semantics live in the helpers; by name the windows merge
// presumably keeps the per-key minimum — confirm against mergeMinStringIntMap.
mergeStringIntMap(out.TrafficClassCounts, nodeReport.TrafficClassCounts)
mergeMinStringIntMap(out.RecommendedParallelWindows, nodeReport.RecommendedParallelWindows)
if nodeReport.AdaptiveBackpressureActive {
out.AdaptiveBackpressureActive = true
if out.AdaptiveBackpressureReason == "" {
out.AdaptiveBackpressureReason = nodeReport.AdaptiveBackpressureReason
}
}
if out.AdaptivePolicyFingerprint == "" {
out.AdaptivePolicyFingerprint = nodeReport.AdaptivePolicyFingerprint
}
// Channel count and drops are summed; watermark and in-flight keep the maximum.
out.FlowChannelCount += nodeReport.FlowChannelCount
out.FlowDropped += nodeReport.FlowDropped
if nodeReport.FlowHighWatermark > out.FlowHighWatermark {
out.FlowHighWatermark = nodeReport.FlowHighWatermark
}
if nodeReport.FlowMaxInFlight > out.FlowMaxInFlight {
out.FlowMaxInFlight = nodeReport.FlowMaxInFlight
}
out.Nodes = append(out.Nodes, nodeReport)
}
// Empty aggregate maps are replaced by nil.
if len(out.TrafficClassCounts) == 0 {
out.TrafficClassCounts = nil
}
if len(out.RecommendedParallelWindows) == 0 {
out.RecommendedParallelWindows = nil
}
nodeReportsByID := map[string]FabricServiceChannelAccessTelemetryNode{}
for _, node := range out.Nodes {
nodeReportsByID[node.NodeID] = node
}
// Pass 2: for every node (reporting or not), read route-manager state from
// fabric_service_channel_runtime_report.ingress in the first heartbeat returned.
routeManagerByNodeID := map[string]map[string]any{}
routeManagerTransitionByNodeID := map[string]map[string]any{}
for _, node := range nodes {
heartbeats, err := s.store.ListNodeHeartbeats(ctx, input.ClusterID, node.ID, 1)
if err != nil || len(heartbeats) == 0 {
continue
}
metadata := jsonObject(heartbeats[0].Metadata)
runtime := jsonMapPath(metadata, "fabric_service_channel_runtime_report")
ingress := jsonMapPath(runtime, "ingress")
routeManager := jsonMapPath(ingress, "route_manager")
if len(routeManager) > 0 {
routeManagerByNodeID[node.ID] = routeManager
}
transition := jsonMapPath(ingress, "route_manager_transition")
if len(transition) > 0 {
routeManagerTransitionByNodeID[node.ID] = transition
}
}
feedbackItems, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, ListFabricServiceChannelRouteFeedbackInput{
ClusterID: input.ClusterID,
ServiceClass: FabricServiceClassVPNPackets,
Now: now.UTC(),
IncludeExpired: false,
})
if err != nil {
return FabricServiceChannelAccessTelemetry{}, err
}
// Keep only the newest feedback observation per non-blank route ID.
feedbackByRouteID := map[string]FabricServiceChannelRouteFeedbackObservation{}
for _, item := range feedbackItems {
if strings.TrimSpace(item.RouteID) == "" {
continue
}
current, ok := feedbackByRouteID[item.RouteID]
if !ok || item.ObservedAt.After(current.ObservedAt) {
feedbackByRouteID[item.RouteID] = item
}
}
leaseRecords, err := s.store.ListFabricServiceChannelLeases(ctx, ListFabricServiceChannelLeasesInput{
ClusterID: input.ClusterID,
IncludeExpired: false,
Limit: input.Limit,
Now: now.UTC(),
})
if err != nil {
return FabricServiceChannelAccessTelemetry{}, err
}
// Pass 3: turn every non-expired lease into an active-channel entry.
for _, record := range leaseRecords {
summary := fabricServiceChannelLeaseSummaryFromRecord(record, now)
channel := FabricServiceChannelAccessTelemetryChannel{
ChannelID: summary.ChannelID,
ResourceID: summary.ResourceID,
ServiceClass: summary.ServiceClass,
Status: summary.Status,
SelectedEntryNodeID: summary.SelectedEntryNodeID,
SelectedExitNodeID: summary.SelectedExitNodeID,
PrimaryRouteID: summary.PrimaryRouteID,
PrimaryRouteStatus: summary.PrimaryRouteStatus,
ForceBackendFallback: summary.ForceBackendFallback,
DataPlane: summary.DataPlane,
ExpiresAt: summary.ExpiresAt,
}
if record.Lease.PoolPolicy != nil {
channel.PoolPolicyFingerprint = record.Lease.PoolPolicy.Fingerprint
}
// Copy the entry node's report onto the channel when that node reported.
if entryReport, ok := nodeReportsByID[channel.SelectedEntryNodeID]; ok {
channel.EntryNodeTotalAccepted = entryReport.TotalAccepted
channel.EntryNodeIntrospectionAccepted = entryReport.IntrospectionAccepted
channel.EntryNodeBackendFallbackCount = entryReport.BackendFallbackCount
channel.EntryNodeBackendFallbackBlockedCount = entryReport.BackendFallbackBlockedCount
channel.EntryNodeFabricRouteSendFailureCount = entryReport.FabricRouteSendFailureCount
channel.EntryNodeDataPlaneContractCount = entryReport.DataPlaneContractCount
channel.EntryNodeLastDataPlaneMode = entryReport.LastDataPlaneMode
channel.EntryNodeLastWorkingDataTransport = entryReport.LastWorkingDataTransport
channel.EntryNodeLastSteadyStateTransport = entryReport.LastSteadyStateTransport
channel.EntryNodeLastBackendRelayPolicy = entryReport.LastBackendRelayPolicy
channel.EntryNodeLastLogicalFlowMode = entryReport.LastLogicalFlowMode
channel.EntryNodeLastDataPlaneViolationStatus = entryReport.LastDataPlaneViolationStatus
channel.EntryNodeLastDataPlaneViolationReason = entryReport.LastDataPlaneViolationReason
channel.EntryNodeTrafficClassCounts = copyStringIntMap(entryReport.TrafficClassCounts)
channel.EntryNodeFlowChannelCount = entryReport.FlowChannelCount
channel.EntryNodeFlowDropped = entryReport.FlowDropped
channel.EntryNodeFlowHighWatermark = entryReport.FlowHighWatermark
channel.EntryNodeFlowMaxInFlight = entryReport.FlowMaxInFlight
channel.EntryNodeFlowHealthStatus = entryReport.FlowHealthStatus
channel.EntryNodeFlowHealthReason = entryReport.FlowHealthReason
channel.EntryNodeRecommendedParallelWindows = copyStringIntMap(entryReport.RecommendedParallelWindows)
channel.EntryNodeAdaptiveBackpressureActive = entryReport.AdaptiveBackpressureActive
channel.EntryNodeAdaptiveBackpressureReason = entryReport.AdaptiveBackpressureReason
channel.EntryNodeAdaptivePolicyFingerprint = entryReport.AdaptivePolicyFingerprint
}
// Correlate with the newest feedback for the channel's primary route.
if feedback, ok := feedbackByRouteID[channel.PrimaryRouteID]; ok {
observedAt := feedback.ObservedAt
channel.RouteFeedbackStatus = feedback.FeedbackStatus
channel.RouteFeedbackObservedAt = &observedAt
channel.RouteFeedbackScoreAdjustment = feedback.ScoreAdjustment
channel.RouteFeedbackEffectiveScoreAdjustment = feedback.EffectiveScoreAdjustment
channel.RouteFeedbackReasons = append([]string{}, feedback.Reasons...)
channel.RouteQualityWindowSampleCount = fabricServiceChannelFeedbackPayloadInt(feedback.Payload, "quality_window_sample_count")
channel.RouteQualityWindowFailureCount = fabricServiceChannelFeedbackPayloadInt(feedback.Payload, "quality_window_failure_count")
channel.RouteQualityWindowDropCount = fabricServiceChannelFeedbackPayloadInt(feedback.Payload, "quality_window_drop_count")
channel.RouteQualityWindowSlowCount = fabricServiceChannelFeedbackPayloadInt(feedback.Payload, "quality_window_slow_count")
channel.LastSendDurationMs = feedback.LastSendDurationMs
// Recompute the entry-node flow health with the route-quality inputs now
// available; this replaces the value copied from the node report above.
channel.EntryNodeFlowHealthStatus, channel.EntryNodeFlowHealthReason, _ = fabricServiceChannelFlowHealth(
channel.EntryNodeTrafficClassCounts,
channel.EntryNodeFlowDropped,
channel.EntryNodeFlowHighWatermark,
channel.EntryNodeFlowMaxInFlight,
channel.EntryNodeBackendFallbackCount,
channel.LastSendDurationMs,
channel.RouteQualityWindowFailureCount,
channel.RouteQualityWindowDropCount,
channel.RouteQualityWindowSlowCount,
)
out.CorrelatedRouteCount++
if feedback.FeedbackStatus == "degraded" || feedback.FeedbackStatus == "fenced" || feedback.EffectiveScoreAdjustment < 0 || feedback.ScoreAdjustment < 0 {
out.DegradedRouteCount++
}
}
// Remediation pipeline; each step receives the channel produced by the previous one.
channel = fabricServiceChannelAccessRemediation(channel, record.Lease, now)
channel = fabricServiceChannelAccessRouteDecisionTelemetry(channel, routeManagerByNodeID[channel.SelectedEntryNodeID], routeManagerTransitionByNodeID[channel.SelectedEntryNodeID])
channel = fabricServiceChannelAccessRemediationExecution(channel, routeManagerByNodeID[channel.SelectedEntryNodeID], routeManagerTransitionByNodeID[channel.SelectedEntryNodeID], now)
channel = s.fabricServiceChannelAccessRemediationLedgerExecution(ctx, input.ClusterID, channel)
fabricServiceChannelAccumulateRouteDecisionTelemetry(&out, channel)
if channel.ForceBackendFallback {
out.DegradedFallbackChannelCount++
}
out.ActiveChannels = append(out.ActiveChannels, channel)
}
out.ActiveChannelCount = len(out.ActiveChannels)
// Nodes: highest TotalAccepted first, ties broken by ascending NodeName.
sort.Slice(out.Nodes, func(i, j int) bool {
if out.Nodes[i].TotalAccepted != out.Nodes[j].TotalAccepted {
return out.Nodes[i].TotalAccepted > out.Nodes[j].TotalAccepted
}
return out.Nodes[i].NodeName < out.Nodes[j].NodeName
})
// Channels: forced backend fallback first, then descending RouteFeedbackStatus
// (string comparison), then earliest expiry.
sort.Slice(out.ActiveChannels, func(i, j int) bool {
if out.ActiveChannels[i].ForceBackendFallback != out.ActiveChannels[j].ForceBackendFallback {
return out.ActiveChannels[i].ForceBackendFallback
}
if out.ActiveChannels[i].RouteFeedbackStatus != out.ActiveChannels[j].RouteFeedbackStatus {
return out.ActiveChannels[i].RouteFeedbackStatus > out.ActiveChannels[j].RouteFeedbackStatus
}
return out.ActiveChannels[i].ExpiresAt.Before(out.ActiveChannels[j].ExpiresAt)
})
// Overall status precedence: no safe recovery, then no reporting node, then
// degraded fallback or degraded routes. Otherwise the initial "ready" stands.
if out.NoSafeRecoveryDecisionCount > 0 {
out.Status = "degraded"
out.Reason = "active_channels_no_safe_recovery"
out.RecommendedOperatorAction = "Inspect active service-channel route decisions; at least one channel has no safe recovery route."
} else if out.ReportingNodeCount == 0 {
out.Status = "degraded"
out.Reason = "no_access_telemetry_reported"
out.RecommendedOperatorAction = "Wait for node telemetry or verify fabric_service_channel_access_telemetry capability on node-agent."
} else if out.DegradedFallbackChannelCount > 0 || out.DegradedRouteCount > 0 {
out.Status = "degraded"
out.Reason = "active_channels_degraded"
out.RecommendedOperatorAction = "Inspect active service-channel routes with backend fallback or degraded route-quality feedback."
}
// Cluster flow health starts from the aggregated counters and is then raised
// to the worst entry-node flow health seen on any active channel.
out.FlowHealthStatus, out.FlowHealthReason, _ = fabricServiceChannelFlowHealth(
out.TrafficClassCounts,
out.FlowDropped,
out.FlowHighWatermark,
out.FlowMaxInFlight,
out.BackendFallbackCount,
0,
0,
0,
0,
)
for _, channel := range out.ActiveChannels {
out.FlowHealthStatus, out.FlowHealthReason = fabricServiceChannelWorseFlowHealth(out.FlowHealthStatus, out.FlowHealthReason, channel.EntryNodeFlowHealthStatus, channel.EntryNodeFlowHealthReason)
}
// Critical or degraded flow health forces a degraded status, but only replaces
// the reason and operator action when nothing more specific was set above.
if out.FlowHealthStatus == "critical" || out.FlowHealthStatus == "degraded" {
out.Status = "degraded"
if out.Reason == "access_telemetry_ready" {
out.Reason = "flow_health_degraded"
}
if out.RecommendedOperatorAction == "" {
out.RecommendedOperatorAction = fabricServiceChannelFlowHealthAction(out.FlowHealthStatus, out.FlowHealthReason)
}
} else if out.FlowHealthStatus == "watch" && out.RecommendedOperatorAction == "" {
out.RecommendedOperatorAction = fabricServiceChannelFlowHealthAction(out.FlowHealthStatus, out.FlowHealthReason)
}
return out, nil
}
// fabricServiceChannelFlowHealth classifies service-channel flow health and
// returns (status, reason, operator action).
//
// The signals are checked in a fixed priority order and the first one that
// trips decides the verdict: flow drops and route-window drops are "critical";
// backend fallback, route-window failures, slow samples, a last send duration
// of at least 1000 ms and high queue pressure are "degraded"; bulk pressure
// and moderate queue pressure are "watch". With no signal the verdict is
// "healthy". The action text comes from fabricServiceChannelFlowHealthAction
// for the chosen status and reason.
func fabricServiceChannelFlowHealth(trafficClassCounts map[string]int, flowDropped, flowHighWatermark, flowMaxInFlight, backendFallbackCount int, lastSendDurationMs int64, routeFailureCount, routeDropCount, routeSlowCount int) (string, string, string) {
	verdict := func(status, reason string) (string, string, string) {
		return status, reason, fabricServiceChannelFlowHealthAction(status, reason)
	}

	bulkHeavy := trafficClassCounts["bulk"] >= 16
	interactiveSeen := trafficClassCounts["interactive"]+trafficClassCounts["control"] > 0

	// Ordered from most to least severe; the first tripped rule wins.
	rules := [...]struct {
		tripped bool
		status  string
		reason  string
	}{
		{flowDropped > 0, "critical", "flow_drops_reported"},
		{routeDropCount > 0, "critical", "route_quality_window_drops_reported"},
		{backendFallbackCount > 0, "degraded", "backend_fallback_observed"},
		{routeFailureCount > 0, "degraded", "route_quality_window_failures_reported"},
		{routeSlowCount > 0, "degraded", "route_quality_window_slow_samples_reported"},
		{lastSendDurationMs >= 1000, "degraded", "route_send_latency_high"},
		{flowHighWatermark >= 64 || flowMaxInFlight >= 16, "degraded", "flow_queue_pressure_high"},
		{bulkHeavy && interactiveSeen, "watch", "bulk_pressure_with_interactive_qos_observed"},
		{bulkHeavy, "watch", "bulk_pressure_observed"},
		{flowHighWatermark >= 16 || flowMaxInFlight >= 4, "watch", "flow_queue_pressure_observed"},
	}
	for _, rule := range rules {
		if rule.tripped {
			return verdict(rule.status, rule.reason)
		}
	}
	return verdict("healthy", "flow_health_ready")
}
// fabricServiceChannelWorseFlowHealth returns the more severe of two flow
// health verdicts. The candidate replaces the current verdict only when its
// status is non-empty and ranks strictly higher according to
// fabricServiceChannelFlowHealthRank; on a tie the current verdict is kept.
func fabricServiceChannelWorseFlowHealth(currentStatus, currentReason, candidateStatus, candidateReason string) (string, string) {
	candidateIsWorse := candidateStatus != "" &&
		fabricServiceChannelFlowHealthRank(candidateStatus) > fabricServiceChannelFlowHealthRank(currentStatus)
	if candidateIsWorse {
		return candidateStatus, candidateReason
	}
	return currentStatus, currentReason
}
// fabricServiceChannelFlowHealthRank maps a flow health status to a severity
// rank: "healthy" is 1, "watch" is 2, "degraded" is 3 and "critical" is 4.
// Any other value, including the empty string, ranks 0.
func fabricServiceChannelFlowHealthRank(status string) int {
	// Listed from least to most severe; the rank is the 1-based position.
	ascending := [...]string{"healthy", "watch", "degraded", "critical"}
	for position, known := range ascending {
		if status == known {
			return position + 1
		}
	}
	return 0
}
// fabricServiceChannelFlowHealthAction returns the operator guidance text for
// a flow health verdict. The status selects the message; the reason only
// matters for "watch", where bulk pressure observed together with
// interactive/control traffic has its own wording. Statuses other than
// "critical", "degraded" and "watch" get the within-policy message.
func fabricServiceChannelFlowHealthAction(status, reason string) string {
	if status == "critical" {
		return "Reduce or reroute service-channel pressure immediately; inspect flow drops, route drops, and backend fallback before adding user traffic."
	}
	if status == "degraded" {
		return "Inspect service-channel route quality and active entry-node pressure; prefer alternate route or rebuild when degraded evidence persists."
	}
	if status != "watch" {
		return "Flow health is within the current service-channel guard policy."
	}
	if reason == "bulk_pressure_with_interactive_qos_observed" {
		return "Bulk pressure is active while interactive/control remains observable; keep watching latency and drops before increasing load."
	}
	return "Bulk or queue pressure is visible; monitor interactive/control traffic before increasing production load."
}
// fabricServiceChannelAccessRemediation decides which remediation, if any, an
// active channel needs and returns the channel with that verdict filled in.
//
// Decision order:
//  1. forced backend fallback on the channel -> "use_backend_fallback";
//  2. route feedback not degraded -> "none" (no command is attached);
//  3. feedback recommends degraded fallback -> "use_backend_fallback";
//  4. the lease offers an authorized alternate route -> "prefer_alternate_route"
//     when the lease pool guard reports "allowed", otherwise "rebuild_route";
//  5. feedback recommends a rebuild or the route is fenced -> "rebuild_route";
//  6. otherwise -> "inspect_route_quality".
//
// Every verdict except "none" also gets a remediation command built from the
// final channel state by fabricServiceChannelAccessRemediationCommand.
func fabricServiceChannelAccessRemediation(channel FabricServiceChannelAccessTelemetryChannel, lease FabricServiceChannelLease, now time.Time) FabricServiceChannelAccessTelemetryChannel {
	// conclude stamps the verdict and attaches the command derived from it.
	conclude := func(action, reason, operatorAction string) FabricServiceChannelAccessTelemetryChannel {
		channel.RemediationAction = action
		channel.RemediationReason = reason
		channel.RecommendedOperatorAction = operatorAction
		channel.RemediationCommand = fabricServiceChannelAccessRemediationCommand(channel, lease, now)
		return channel
	}

	if channel.ForceBackendFallback {
		return conclude(
			"use_backend_fallback",
			"explicit_backend_fallback_active",
			"Inspect missing/fenced fabric route and keep backend fallback visible until a normal route is available.",
		)
	}

	routeDegraded := false
	switch {
	case channel.RouteFeedbackStatus == "degraded",
		channel.RouteFeedbackStatus == "fenced",
		channel.RouteFeedbackScoreAdjustment < 0,
		channel.RouteFeedbackEffectiveScoreAdjustment < 0:
		routeDegraded = true
	}
	if !routeDegraded {
		channel.RemediationAction = "none"
		channel.RemediationReason = "active_route_quality_acceptable"
		channel.RecommendedOperatorAction = "No route remediation required."
		return channel
	}

	if containsString(channel.RouteFeedbackReasons, "service_channel_degraded_fallback_recommended") {
		return conclude(
			"use_backend_fallback",
			"route_feedback_recommends_degraded_fallback",
			"Use explicit degraded backend fallback while route rebuild catches up.",
		)
	}

	if alternate, found := fabricServiceChannelFirstAuthorizedAlternate(lease.AlternateRoutes, channel.PrimaryRouteID); found {
		guardStatus, guardReason := fabricServiceChannelRouteAllowedByLeasePool(lease, alternate)
		// The alternate and guard outcome are recorded for both verdicts.
		channel.RemediationRouteID = alternate.RouteID
		channel.RemediationRouteStatus = alternate.Status
		channel.RemediationGuardStatus = guardStatus
		channel.RemediationGuardReason = guardReason
		if guardStatus != "allowed" {
			return conclude(
				"rebuild_route",
				"alternate_route_rejected_by_pool_policy",
				"Reject the alternate route and rebuild within the signed entry/exit pool policy.",
			)
		}
		return conclude(
			"prefer_alternate_route",
			"authorized_alternate_route_available",
			"Prefer the authorized alternate route for this active service channel.",
		)
	}

	rebuildAdvised := containsString(channel.RouteFeedbackReasons, "service_channel_route_rebuild_recommended") ||
		channel.RouteFeedbackStatus == "fenced"
	if rebuildAdvised {
		return conclude(
			"rebuild_route",
			"route_feedback_recommends_rebuild",
			"Trigger or wait for route rebuild; keep this distinct from backend fallback.",
		)
	}

	return conclude(
		"inspect_route_quality",
		"degraded_route_quality_without_replacement",
		"Inspect rolling route quality counters and route feedback provenance.",
	)
}
// fabricServiceChannelAccessRemediationCommand builds the remediation command
// that mirrors the channel's current remediation verdict.
//
// It returns nil when the trimmed action is empty or "none". The command is
// issued at now (wall clock when now is zero), in UTC, and expires 60 seconds
// later, or at the channel's own expiry when that comes first. The command ID
// is derived from the channel ID, the action and the replacement route ID,
// falling back to the primary route ID and then the literal "no-route". Guard
// status and reason default to "allowed" and "lease_pool_policy_allows_route"
// when the channel carries none.
func fabricServiceChannelAccessRemediationCommand(channel FabricServiceChannelAccessTelemetryChannel, lease FabricServiceChannelLease, now time.Time) *FabricServiceChannelAccessRemediationCommand {
	action := strings.TrimSpace(channel.RemediationAction)
	switch action {
	case "", "none":
		return nil
	}

	issuedAt := now
	if issuedAt.IsZero() {
		issuedAt = time.Now()
	}
	issuedAt = issuedAt.UTC()

	// Never let the command outlive the channel it targets.
	expiresAt := issuedAt.Add(time.Minute)
	if channelExpiry := channel.ExpiresAt; !channelExpiry.IsZero() && channelExpiry.Before(expiresAt) {
		expiresAt = channelExpiry.UTC()
	}

	routeComponent := firstNonEmptyString(channel.RemediationRouteID, channel.PrimaryRouteID, "no-route")
	commandID := strings.Join([]string{"fsc-remediation", channel.ChannelID, action, routeComponent}, ":")

	command := new(FabricServiceChannelAccessRemediationCommand)
	command.SchemaVersion = "rap.fabric_service_channel_access_remediation_command.v1"
	command.CommandID = commandID
	command.Action = action
	command.ClusterID = lease.ClusterID
	command.ChannelID = channel.ChannelID
	command.ResourceID = channel.ResourceID
	command.ServiceClass = channel.ServiceClass
	command.EntryNodeID = channel.SelectedEntryNodeID
	command.ExitNodeID = channel.SelectedExitNodeID
	command.PrimaryRouteID = channel.PrimaryRouteID
	command.ReplacementRouteID = channel.RemediationRouteID
	command.ReplacementRouteStatus = channel.RemediationRouteStatus
	command.PoolPolicyFingerprint = channel.PoolPolicyFingerprint
	command.GuardStatus = firstNonEmptyString(channel.RemediationGuardStatus, "allowed")
	command.GuardReason = firstNonEmptyString(channel.RemediationGuardReason, "lease_pool_policy_allows_route")
	command.ExecutionStatus = channel.RemediationExecutionStatus
	command.ExecutionReason = channel.RemediationExecutionReason
	command.ExecutionGeneration = channel.RemediationExecutionGeneration
	command.ExecutionObservedAt = channel.RemediationExecutionObservedAt
	command.Reason = channel.RemediationReason
	command.OperatorAction = channel.RecommendedOperatorAction
	command.IssuedAt = issuedAt
	command.ExpiresAt = expiresAt
	return command
}
// fabricServiceChannelAccessRemediationExecution fills in the execution status
// of the channel's remediation command from the entry node's route-manager
// report and returns the updated channel.
//
// A channel without a command is returned untouched. Otherwise the first
// matching rule decides the execution status:
//   - the command's expiry is not after now -> "expired";
//   - the channel or command guard status is "rejected" -> "rejected_by_policy_guard";
//   - "prefer_alternate_route" / "rebuild_route" -> taken from the matching
//     route-manager decision when one exists, otherwise a waiting/pending status;
//   - "use_backend_fallback" -> "degraded_fallback_visible";
//   - any other action -> "visible".
//
// The result always goes through fabricServiceChannelSyncRemediationCommandExecution.
func fabricServiceChannelAccessRemediationExecution(channel FabricServiceChannelAccessTelemetryChannel, routeManager map[string]any, transition map[string]any, now time.Time) FabricServiceChannelAccessTelemetryChannel {
	command := channel.RemediationCommand
	if command == nil {
		return channel
	}

	record := func(status, reason string) {
		channel.RemediationExecutionStatus = status
		channel.RemediationExecutionReason = reason
	}

	ttlElapsed := !command.ExpiresAt.IsZero() && !now.IsZero() && !command.ExpiresAt.After(now.UTC())
	guardRejected := channel.RemediationGuardStatus == "rejected" || command.GuardStatus == "rejected"

	switch {
	case ttlElapsed:
		record("expired", "remediation_command_ttl_expired")

	case guardRejected:
		record("rejected_by_policy_guard",
			firstNonEmptyString(channel.RemediationGuardReason, command.GuardReason, "remediation_guard_rejected"))

	case command.Action == "prefer_alternate_route", command.Action == "rebuild_route":
		// Both route actions follow the same shape and differ only in their defaults.
		observedStatus, observedReason := "observed", "route_manager_decision_observed"
		pendingStatus, pendingReason := "waiting_node_apply", "route_manager_has_not_reported_command"
		if command.Action == "rebuild_route" {
			observedStatus, observedReason = "pending_rebuild_request", "route_manager_rebuild_decision_observed"
			pendingStatus, pendingReason = "pending_rebuild_request", "bounded_rebuild_route_command_visible"
		}
		if decision, reported := fabricServiceChannelRouteManagerDecisionForCommand(routeManager, *command); reported {
			record(
				firstNonEmptyString(jsonString(decision, "rebuild_status"), observedStatus),
				firstNonEmptyString(jsonString(decision, "rebuild_reason"), jsonString(decision, "decision_source"), observedReason),
			)
			channel.RemediationExecutionGeneration = jsonString(decision, "generation")
			channel.RemediationExecutionObservedAt = firstNonEmptyString(jsonString(routeManager, "last_applied_at"), jsonString(transition, "observed_at"))
		} else {
			record(pendingStatus, pendingReason)
			channel.RemediationExecutionObservedAt = jsonString(transition, "observed_at")
		}

	case command.Action == "use_backend_fallback":
		record("degraded_fallback_visible", "backend_fallback_command_visible")

	default:
		record("visible", "remediation_command_visible")
	}

	return fabricServiceChannelSyncRemediationCommandExecution(channel)
}
// fabricServiceChannelAccessRouteDecisionTelemetry copies the route-manager
// decision that applies to the channel onto its RouteDecision* fields.
//
// The channel is returned unchanged when no decision matches. When a decision
// is found and the channel has no execution observation time yet, the time is
// taken from the route manager's "last_applied_at" or the transition's
// "observed_at". If the decision signals that no safe recovery route exists
// (by source, rebuild status or score reason), the remediation verdict is
// overridden: an empty or "none" action becomes "use_backend_fallback" and the
// reason, execution status and operator action are set accordingly.
func fabricServiceChannelAccessRouteDecisionTelemetry(channel FabricServiceChannelAccessTelemetryChannel, routeManager map[string]any, transition map[string]any) FabricServiceChannelAccessTelemetryChannel {
	decision, found := fabricServiceChannelRouteManagerDecisionForChannel(routeManager, channel)
	if !found {
		return channel
	}

	channel.RouteDecisionSource = jsonString(decision, "decision_source")
	channel.RouteDecisionRouteID = jsonString(decision, "route_id")
	channel.RouteDecisionReplacementRouteID = jsonString(decision, "replacement_route_id")
	channel.RouteDecisionRebuildStatus = jsonString(decision, "rebuild_status")
	channel.RouteDecisionRebuildReason = jsonString(decision, "rebuild_reason")
	channel.RouteDecisionGeneration = firstNonEmptyString(jsonString(decision, "generation"), jsonString(decision, "rebuild_request_id"))
	channel.RouteDecisionScoreReasons = jsonStringArray(decision, "score_reasons")

	if channel.RemediationExecutionObservedAt == "" {
		channel.RemediationExecutionObservedAt = firstNonEmptyString(jsonString(routeManager, "last_applied_at"), jsonString(transition, "observed_at"))
	}

	noSafeRecovery := channel.RouteDecisionSource == "service_channel_feedback_no_alternate" ||
		channel.RouteDecisionRebuildStatus == "pending_degraded_fallback" ||
		containsString(channel.RouteDecisionScoreReasons, "no_unfenced_alternate_route")
	if !noSafeRecovery {
		return channel
	}

	// Keep an existing concrete action; only empty or "none" is replaced.
	action := firstNonEmptyString(channel.RemediationAction, "use_backend_fallback")
	if action == "none" {
		action = "use_backend_fallback"
	}
	channel.RemediationAction = action
	channel.RemediationReason = "route_decision_no_safe_recovery"
	channel.RemediationExecutionStatus = "route_rebuild_no_safe_recovery"
	channel.RemediationExecutionReason = firstNonEmptyString(channel.RouteDecisionRebuildReason, "no_unfenced_alternate_route")
	channel.RemediationExecutionGeneration = channel.RouteDecisionGeneration
	channel.RecommendedOperatorAction = "No safe recovery route is available; keep degraded fallback visible and rebuild the route pool."
	return channel
}
// fabricServiceChannelRouteManagerDecisionForChannel scans the "decisions"
// array of routeManager and returns the decision that matches channel with the
// highest telemetry rank. On equal rank the earliest decision is kept. The
// boolean is false when no decision matches.
func fabricServiceChannelRouteManagerDecisionForChannel(routeManager map[string]any, channel FabricServiceChannelAccessTelemetryChannel) (map[string]any, bool) {
	var best map[string]any
	bestRank := 0
	for _, entry := range jsonArray(routeManager, "decisions") {
		candidate, isObject := entry.(map[string]any)
		if !isObject {
			continue
		}
		if !fabricServiceChannelRouteManagerDecisionMatchesChannel(candidate, channel) {
			continue
		}
		// Strictly greater: a later decision of equal rank never replaces an earlier one.
		if rank := fabricServiceChannelRouteManagerDecisionTelemetryRank(candidate); rank > bestRank {
			best, bestRank = candidate, rank
		}
	}
	if best == nil {
		return nil, false
	}
	return best, true
}
// fabricServiceChannelRouteManagerDecisionMatchesChannel reports whether a
// route-manager decision applies to channel.
//
// A decision matches when its "route_id" or "replacement_route_id" is non-empty
// and equals the channel's primary route ID. Failing that, it matches when its
// non-empty source/destination node IDs equal the channel's selected entry/exit
// nodes and its "local_node_id" is either absent or the selected entry node.
func fabricServiceChannelRouteManagerDecisionMatchesChannel(decision map[string]any, channel FabricServiceChannelAccessTelemetryChannel) bool {
	for _, key := range []string{"route_id", "replacement_route_id"} {
		if id := jsonString(decision, key); id != "" && id == channel.PrimaryRouteID {
			return true
		}
	}

	source := jsonString(decision, "source_node_id")
	destination := jsonString(decision, "destination_node_id")
	if source == "" || destination == "" {
		return false
	}
	if source != channel.SelectedEntryNodeID || destination != channel.SelectedExitNodeID {
		return false
	}
	local := jsonString(decision, "local_node_id")
	return local == "" || local == channel.SelectedEntryNodeID
}
// fabricServiceChannelRouteManagerDecisionTelemetryRank scores a decision so
// that the most significant one can be chosen for telemetry:
//
//	50 - no safe recovery (no-alternate source, pending degraded fallback, or
//	     the "no_unfenced_alternate_route" score reason)
//	40 - rebuild applied
//	30 - decision source mentions "replacement"
//	20 - any other non-empty rebuild status
//	10 - everything else
func fabricServiceChannelRouteManagerDecisionTelemetryRank(decision map[string]any) int {
	source := jsonString(decision, "decision_source")
	status := jsonString(decision, "rebuild_status")
	reasons := jsonStringArray(decision, "score_reasons")

	if source == "service_channel_feedback_no_alternate" ||
		status == "pending_degraded_fallback" ||
		containsString(reasons, "no_unfenced_alternate_route") {
		return 50
	}
	if status == "applied" || containsString(reasons, "service_channel_rebuild_applied") {
		return 40
	}
	if strings.Contains(source, "replacement") {
		return 30
	}
	if status != "" {
		return 20
	}
	return 10
}
// fabricServiceChannelAccumulateRouteDecisionTelemetry folds one channel's
// route decision into the aggregate counters of out. Channels without a
// decision source (and a nil out) are ignored. The counters are independent:
// one channel may increment several of them.
func fabricServiceChannelAccumulateRouteDecisionTelemetry(out *FabricServiceChannelAccessTelemetry, channel FabricServiceChannelAccessTelemetryChannel) {
	if out == nil {
		return
	}
	if channel.RouteDecisionSource == "" {
		return
	}

	out.RouteDecisionChannelCount++

	rebuildApplied := channel.RouteDecisionRebuildStatus == "applied" ||
		containsString(channel.RouteDecisionScoreReasons, "service_channel_rebuild_applied")

	if fabricServiceChannelRouteDecisionIsReplacement(channel) {
		out.ReplacementDecisionCount++
	}
	if rebuildApplied {
		out.AppliedRebuildDecisionCount++
	}
	if fabricServiceChannelRouteDecisionIsRecovery(channel) {
		out.RecoveryDecisionCount++
	}
	if fabricServiceChannelRouteDecisionIsNoSafeRecovery(channel) {
		out.NoSafeRecoveryDecisionCount++
	}
}
// fabricServiceChannelRouteDecisionIsReplacement reports whether the channel's
// route decision names a replacement route: either a non-blank replacement
// route ID is present or the decision source contains "replacement".
func fabricServiceChannelRouteDecisionIsReplacement(channel FabricServiceChannelAccessTelemetryChannel) bool {
	if strings.Contains(channel.RouteDecisionSource, "replacement") {
		return true
	}
	return strings.TrimSpace(channel.RouteDecisionReplacementRouteID) != ""
}
// fabricServiceChannelRouteDecisionIsRecovery reports whether the channel's
// route decision is a recovery decision: it carries one of the recovery score
// reasons or its rebuild reason contains "recovery".
func fabricServiceChannelRouteDecisionIsRecovery(channel FabricServiceChannelAccessTelemetryChannel) bool {
	for _, marker := range []string{"service_channel_recovery_promoted", "service_channel_recovery_hysteresis"} {
		if containsString(channel.RouteDecisionScoreReasons, marker) {
			return true
		}
	}
	return strings.Contains(channel.RouteDecisionRebuildReason, "recovery")
}
// fabricServiceChannelRouteDecisionIsNoSafeRecovery reports whether the
// channel's route decision says that no safe recovery route exists. Any one of
// three signals is sufficient: the no-alternate decision source, the
// "pending_degraded_fallback" rebuild status, or the
// "no_unfenced_alternate_route" score reason.
func fabricServiceChannelRouteDecisionIsNoSafeRecovery(channel FabricServiceChannelAccessTelemetryChannel) bool {
	switch {
	case channel.RouteDecisionSource == "service_channel_feedback_no_alternate":
		return true
	case channel.RouteDecisionRebuildStatus == "pending_degraded_fallback":
		return true
	default:
		return containsString(channel.RouteDecisionScoreReasons, "no_unfenced_alternate_route")
	}
}
// fabricServiceChannelSyncRemediationCommandExecution mirrors the channel's
// RemediationExecution* fields onto its remediation command. A channel without
// a command is returned unchanged. The command is updated through its pointer.
func fabricServiceChannelSyncRemediationCommandExecution(channel FabricServiceChannelAccessTelemetryChannel) FabricServiceChannelAccessTelemetryChannel {
	command := channel.RemediationCommand
	if command == nil {
		return channel
	}
	command.ExecutionStatus = channel.RemediationExecutionStatus
	command.ExecutionReason = channel.RemediationExecutionReason
	command.ExecutionGeneration = channel.RemediationExecutionGeneration
	command.ExecutionObservedAt = channel.RemediationExecutionObservedAt
	return channel
}
// fabricServiceChannelRouteManagerDecisionForCommand finds the route-manager
// decision that corresponds to a remediation command.
//
// Matching rules, in priority order:
//  1. A decision whose "rebuild_request_id" equals the (non-empty) command ID.
//     This exact match wins wherever it appears in the "decisions" array.
//  2. Otherwise the first decision produced by a remediation command
//     ("decision_source" == "service_channel_remediation_command") whose
//     "route_id" and "replacement_route_id" equal the command's primary and
//     replacement route IDs.
//
// The boolean is false when routeManager["decisions"] is not an array or no
// decision matches.
func fabricServiceChannelRouteManagerDecisionForCommand(routeManager map[string]any, command FabricServiceChannelAccessRemediationCommand) (map[string]any, bool) {
	decisionsRaw, ok := routeManager["decisions"].([]any)
	if !ok {
		return nil, false
	}
	// Remember the first route-based match but keep scanning: an earlier
	// decision for the same route pair must not shadow a later decision that
	// carries this command's exact ID.
	var routeMatch map[string]any
	routeMatched := false
	for _, raw := range decisionsRaw {
		decision, ok := raw.(map[string]any)
		if !ok {
			continue
		}
		if command.CommandID != "" && jsonString(decision, "rebuild_request_id") == command.CommandID {
			return decision, true
		}
		if !routeMatched &&
			jsonString(decision, "route_id") == command.PrimaryRouteID &&
			jsonString(decision, "replacement_route_id") == command.ReplacementRouteID &&
			jsonString(decision, "decision_source") == "service_channel_remediation_command" {
			routeMatch = decision
			routeMatched = true
		}
	}
	if routeMatched {
		return routeMatch, true
	}
	return nil, false
}
// fabricServiceChannelAccessRemediationLedgerExecution overlays the latest
// durable rebuild attempt onto the channel's remediation execution fields.
//
// Only channels whose remediation command is "rebuild_route" are considered.
// The lookup is best-effort: a store error or an empty result leaves the
// channel unchanged. Known attempt statuses are translated into
// "rebuild_request_*" execution statuses with a default reason when the
// attempt has none; unknown statuses are passed through. The result is synced
// onto the remediation command.
func (s *Service) fabricServiceChannelAccessRemediationLedgerExecution(ctx context.Context, clusterID string, channel FabricServiceChannelAccessTelemetryChannel) FabricServiceChannelAccessTelemetryChannel {
	if channel.RemediationCommand == nil {
		return channel
	}
	if channel.RemediationCommand.Action != "rebuild_route" {
		return channel
	}

	query := ListFabricServiceChannelRouteRebuildAttemptsInput{
		ClusterID:        clusterID,
		ReporterNodeID:   channel.SelectedEntryNodeID,
		RouteID:          channel.PrimaryRouteID,
		ServiceClass:     channel.ServiceClass,
		RebuildRequestID: channel.RemediationCommand.CommandID,
		Limit:            1,
	}
	attempts, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, query)
	if err != nil || len(attempts) == 0 {
		return channel
	}
	latest := attempts[0]

	// mappedStatus/defaultReason describe the simple "status + fallback reason"
	// translations; the two special cases assign the channel fields directly.
	mappedStatus, defaultReason := "", ""
	switch latest.RebuildStatus {
	case "requested":
		if channel.RemediationExecutionStatus == "pending_degraded_fallback" {
			// The existing execution reason takes precedence here.
			channel.RemediationExecutionStatus = "rebuild_request_recorded_node_pending"
			channel.RemediationExecutionReason = firstNonEmptyString(channel.RemediationExecutionReason, latest.RebuildReason, "durable_rebuild_route_request_recorded_and_node_pending")
		} else {
			mappedStatus, defaultReason = "rebuild_request_recorded", "durable_rebuild_route_request_recorded"
		}
	case "rejected":
		mappedStatus, defaultReason = "rebuild_request_rejected", "durable_rebuild_route_request_rejected"
	case "applied":
		mappedStatus, defaultReason = "rebuild_request_applied", "durable_rebuild_route_request_applied"
	case "no_alternate":
		mappedStatus, defaultReason = "rebuild_request_no_alternate", "durable_rebuild_route_no_alternate"
	case "deferred_by_policy":
		mappedStatus, defaultReason = "rebuild_request_deferred_by_policy", "durable_rebuild_route_deferred_by_policy"
	case "expired":
		mappedStatus, defaultReason = "rebuild_request_expired", "durable_rebuild_route_expired"
	default:
		channel.RemediationExecutionStatus = firstNonEmptyString(latest.RebuildStatus, channel.RemediationExecutionStatus)
		channel.RemediationExecutionReason = firstNonEmptyString(latest.RebuildReason, channel.RemediationExecutionReason)
	}
	if mappedStatus != "" {
		channel.RemediationExecutionStatus = mappedStatus
		channel.RemediationExecutionReason = firstNonEmptyString(latest.RebuildReason, defaultReason)
	}

	channel.RemediationExecutionGeneration = firstNonEmptyString(latest.Generation, channel.RemediationExecutionGeneration)
	if !latest.UpdatedAt.IsZero() {
		channel.RemediationExecutionObservedAt = latest.UpdatedAt.UTC().Format(time.RFC3339Nano)
	}
	return fabricServiceChannelSyncRemediationCommandExecution(channel)
}
// fabricServiceChannelRemediationCommandsForNode derives the remediation
// commands for the unexpired VPN-packet service-channel leases whose entry
// node is nodeID.
//
// Each lease with a primary route is turned into a telemetry channel, enriched
// with the route feedback recorded under its primary route ID (if any), and
// passed through fabricServiceChannelAccessRemediation. Channels that come back
// with a remediation command contribute that command. The result is ordered by
// action, then by command ID.
func (s *Service) fabricServiceChannelRemediationCommandsForNode(ctx context.Context, clusterID string, nodeID string, feedback map[string]fabricServiceChannelRouteFeedback, now time.Time) ([]FabricServiceChannelAccessRemediationCommand, error) {
	query := ListFabricServiceChannelLeasesInput{
		ClusterID:      clusterID,
		EntryNodeID:    nodeID,
		ServiceClass:   FabricServiceClassVPNPackets,
		IncludeExpired: false,
		Limit:          100,
		Now:            now.UTC(),
	}
	records, err := s.store.ListFabricServiceChannelLeases(ctx, query)
	if err != nil {
		return nil, err
	}

	commands := make([]FabricServiceChannelAccessRemediationCommand, 0, len(records))
	for _, record := range records {
		summary := fabricServiceChannelLeaseSummaryFromRecord(record, now)
		if summary.Expired {
			continue
		}
		if strings.TrimSpace(summary.PrimaryRouteID) == "" {
			continue
		}

		channel := FabricServiceChannelAccessTelemetryChannel{
			ChannelID:            summary.ChannelID,
			ResourceID:           summary.ResourceID,
			ServiceClass:         summary.ServiceClass,
			Status:               summary.Status,
			SelectedEntryNodeID:  summary.SelectedEntryNodeID,
			SelectedExitNodeID:   summary.SelectedExitNodeID,
			PrimaryRouteID:       summary.PrimaryRouteID,
			PrimaryRouteStatus:   summary.PrimaryRouteStatus,
			ForceBackendFallback: summary.ForceBackendFallback,
			ExpiresAt:            summary.ExpiresAt,
		}
		if policy := record.Lease.PoolPolicy; policy != nil {
			channel.PoolPolicyFingerprint = policy.Fingerprint
		}

		if item, hasFeedback := feedback[channel.PrimaryRouteID]; hasFeedback {
			// Take the address of a per-iteration copy, not of the map value.
			observedAt := item.ObservedAt
			channel.RouteFeedbackObservedAt = &observedAt
			switch {
			case item.Fenced:
				channel.RouteFeedbackStatus = "fenced"
			case item.ScoreAdjustment < 0:
				channel.RouteFeedbackStatus = "degraded"
			case item.RouteID != "":
				channel.RouteFeedbackStatus = "healthy"
			}
			channel.RouteFeedbackScoreAdjustment = item.ScoreAdjustment
			channel.RouteFeedbackEffectiveScoreAdjustment = item.ScoreAdjustment
			channel.RouteFeedbackReasons = append([]string{}, item.Reasons...)
			channel.RouteQualityWindowSampleCount = item.QualityWindowSampleCount
			channel.RouteQualityWindowFailureCount = item.QualityWindowFailureCount
			channel.RouteQualityWindowDropCount = item.QualityWindowDropCount
			channel.RouteQualityWindowSlowCount = item.QualityWindowSlowCount
			channel.LastSendDurationMs = item.LastSendDurationMs
		}

		channel = fabricServiceChannelAccessRemediation(channel, record.Lease, now)
		if channel.RemediationCommand == nil {
			continue
		}
		commands = append(commands, *channel.RemediationCommand)
	}

	sort.SliceStable(commands, func(a, b int) bool {
		if commands[a].Action == commands[b].Action {
			return commands[a].CommandID < commands[b].CommandID
		}
		return commands[a].Action < commands[b].Action
	})
	return commands, nil
}
// recordFabricServiceChannelRemediationRebuildIntents persists one rebuild
// attempt record per eligible remediation command.
//
// Only "rebuild_route" commands with a non-blank command ID and primary route
// ID are recorded; everything else is skipped. A command whose guard status is
// "rejected" is recorded with status "rejected" / outcome
// "policy_guard_rejected"; all others are recorded as "requested" /
// "rebuild_requested".
//
// A zero now is replaced by the service clock (s.now), so an injected clock is
// honoured the same way as in the other Service methods. The first store error
// aborts the loop and is returned; earlier records stay written.
func (s *Service) recordFabricServiceChannelRemediationRebuildIntents(ctx context.Context, clusterID string, nodeID string, commands []FabricServiceChannelAccessRemediationCommand, now time.Time) error {
	if len(commands) == 0 {
		return nil
	}
	if now.IsZero() {
		now = s.now().UTC()
	}
	for _, command := range commands {
		if command.Action != "rebuild_route" || strings.TrimSpace(command.CommandID) == "" || strings.TrimSpace(command.PrimaryRouteID) == "" {
			continue
		}
		rebuildStatus := "requested"
		outcome := "rebuild_requested"
		if command.GuardStatus == "rejected" {
			rebuildStatus = "rejected"
			outcome = "policy_guard_rejected"
		}
		// The same reason is stored as the rebuild reason and as the single feedback reason.
		reason := firstNonEmptyString(command.Reason, command.GuardReason, "service_channel_remediation_rebuild_route_requested")
		payload := mustJSONRaw(map[string]any{
			"schema_version":          "c18z75.service_channel_remediation_rebuild_intent.v1",
			"command_id":              command.CommandID,
			"channel_id":              command.ChannelID,
			"resource_id":             command.ResourceID,
			"entry_node_id":           command.EntryNodeID,
			"exit_node_id":            command.ExitNodeID,
			"pool_policy_fingerprint": command.PoolPolicyFingerprint,
			"guard_status":            command.GuardStatus,
			"guard_reason":            command.GuardReason,
			"command_expires_at":      command.ExpiresAt.UTC().Format(time.RFC3339Nano),
			"recorded_at":             now.UTC().Format(time.RFC3339Nano),
		})
		_, err := s.store.RecordFabricServiceChannelRouteRebuildAttempt(ctx, RecordFabricServiceChannelRouteRebuildAttemptInput{
			ClusterID:                 clusterID,
			ReporterNodeID:            nodeID,
			ServiceClass:              firstNonEmptyString(command.ServiceClass, FabricServiceClassVPNPackets),
			RouteID:                   command.PrimaryRouteID,
			ReplacementRouteID:        command.ReplacementRouteID,
			RebuildRequestID:          command.CommandID,
			RebuildStatus:             rebuildStatus,
			RebuildReason:             reason,
			DecisionSource:            "service_channel_remediation_command",
			Outcome:                   outcome,
			Generation:                command.ExecutionGeneration,
			PolicyFingerprint:         command.PoolPolicyFingerprint,
			ObservedPolicyFingerprint: command.PoolPolicyFingerprint,
			FeedbackReasons:           []string{reason},
			OldHops:                   []string{},
			ReplacementHops:           []string{},
			Payload:                   payload,
		})
		if err != nil {
			return err
		}
	}
	return nil
}
// resolveFabricServiceChannelRemediationRebuildIntents resolves each eligible
// "rebuild_route" remediation command to a concrete outcome, records that
// outcome as a rebuild attempt, and returns a RoutePathDecision for every
// command that was resolved to a replacement route.
//
// Resolution per command, first matching rule wins:
//   - guard status "rejected"            -> "deferred_by_policy"
//   - command TTL elapsed                -> "expired"
//   - no active lease for the command    -> "deferred_by_policy"
//   - primary route not found            -> "no_alternate" (primary_route_not_available_for_rebuild)
//   - no replacement route selected      -> "no_alternate" (no_unfenced_alternate_route)
//   - replacement outside the lease pool -> "deferred_by_policy"
//   - otherwise                          -> "applied"
//
// Commands that are not "rebuild_route", or that lack a command ID or primary
// route ID, are skipped without a record. A zero now is replaced by the service
// clock (s.now), so an injected clock is honoured. Any store error aborts
// processing and is returned; attempts already recorded stay written.
func (s *Service) resolveFabricServiceChannelRemediationRebuildIntents(ctx context.Context, input GetNodeSyntheticMeshConfigInput, commands []FabricServiceChannelAccessRemediationCommand, intents []MeshRouteIntent, feedback map[string]fabricServiceChannelRouteFeedback, generation string, now time.Time) ([]RoutePathDecision, error) {
	if len(commands) == 0 {
		return nil, nil
	}
	if now.IsZero() {
		now = s.now().UTC()
	}
	decisions := []RoutePathDecision{}
	for _, command := range commands {
		if command.Action != "rebuild_route" || strings.TrimSpace(command.CommandID) == "" || strings.TrimSpace(command.PrimaryRouteID) == "" {
			continue
		}
		// The lease is looked up before the guard/TTL checks, so a lookup
		// error aborts even for commands that would be deferred or expired.
		lease, leaseOK, err := s.fabricServiceChannelLeaseForRemediationCommand(ctx, input.ClusterID, input.NodeID, command, now)
		if err != nil {
			return nil, err
		}

		// Default outcome; overwritten by the first matching rule below.
		status := "no_alternate"
		outcome := "no_alternate"
		reason := "no_unfenced_alternate_route"
		var primary SyntheticMeshRouteConfig
		var replacement SyntheticMeshRouteConfig
		switch {
		case command.GuardStatus == "rejected":
			status = "deferred_by_policy"
			outcome = "deferred_by_policy"
			reason = firstNonEmptyString(command.GuardReason, "remediation_guard_rejected")
		case !command.ExpiresAt.IsZero() && !command.ExpiresAt.After(now.UTC()):
			status = "expired"
			outcome = "expired"
			reason = "remediation_command_ttl_expired"
		case !leaseOK:
			status = "deferred_by_policy"
			outcome = "deferred_by_policy"
			reason = "active_lease_not_found_for_rebuild_resolution"
		default:
			var ok bool
			primary, ok = s.syntheticRouteByID(input, intents, command.PrimaryRouteID)
			if !ok {
				reason = "primary_route_not_available_for_rebuild"
			} else if selected, _, ok := s.selectServiceChannelRouteReplacement(input, primary, intents, feedback); ok {
				// The selected replacement must still fit the lease's entry/exit pools.
				if guardStatus, guardReason := fabricServiceChannelRouteAllowedByLeasePool(lease, FabricServiceChannelRoute{
					RouteID:           selected.RouteID,
					ClusterID:         selected.ClusterID,
					ServiceClass:      firstNonEmptyString(command.ServiceClass, FabricServiceClassVPNPackets),
					SourceNodeID:      selected.SourceNodeID,
					DestinationNodeID: selected.DestinationNodeID,
					Status:            "authorized",
				}); guardStatus != "allowed" {
					status = "deferred_by_policy"
					outcome = "deferred_by_policy"
					reason = guardReason
				} else {
					replacement = selected
					status = "applied"
					outcome = "replacement_selected"
					reason = "remediation_rebuild_applied_to_alternate"
				}
			}
		}

		// A missing map entry yields the zero value, which leaves feedbackStatus empty.
		feedbackItem := feedback[command.PrimaryRouteID]
		feedbackStatus := ""
		switch {
		case feedbackItem.Fenced:
			feedbackStatus = "fenced"
		case feedbackItem.ScoreAdjustment < 0:
			feedbackStatus = "degraded"
		case feedbackItem.RouteID != "":
			feedbackStatus = "healthy"
		}

		payload := mustJSONRaw(map[string]any{
			"schema_version":          "c18z77.service_channel_remediation_rebuild_resolution.v1",
			"command_id":              command.CommandID,
			"channel_id":              command.ChannelID,
			"resource_id":             command.ResourceID,
			"entry_node_id":           command.EntryNodeID,
			"exit_node_id":            command.ExitNodeID,
			"pool_policy_fingerprint": command.PoolPolicyFingerprint,
			"guard_status":            command.GuardStatus,
			"guard_reason":            command.GuardReason,
			"resolution_status":       status,
			"resolution_outcome":      outcome,
			"resolution_reason":       reason,
			"resolved_at":             now.UTC().Format(time.RFC3339Nano),
		})
		// Every resolution is recorded, whether or not a replacement was applied.
		_, err = s.store.RecordFabricServiceChannelRouteRebuildAttempt(ctx, RecordFabricServiceChannelRouteRebuildAttemptInput{
			ClusterID:                        input.ClusterID,
			ReporterNodeID:                   input.NodeID,
			ServiceClass:                     firstNonEmptyString(command.ServiceClass, FabricServiceClassVPNPackets),
			RouteID:                          command.PrimaryRouteID,
			ReplacementRouteID:               replacement.RouteID,
			RebuildRequestID:                 command.CommandID,
			RebuildStatus:                    status,
			RebuildReason:                    reason,
			DecisionSource:                   "service_channel_remediation_command",
			Outcome:                          outcome,
			Generation:                       firstNonEmptyString(generation, command.ExecutionGeneration, command.CommandID),
			PolicyFingerprint:                command.PoolPolicyFingerprint,
			ObservedPolicyFingerprint:        command.PoolPolicyFingerprint,
			FeedbackStatus:                   feedbackStatus,
			FeedbackScoreAdjustment:          feedbackItem.ScoreAdjustment,
			FeedbackEffectiveScoreAdjustment: feedbackItem.ScoreAdjustment,
			FeedbackReasons:                  append([]string{reason}, feedbackItem.Reasons...),
			LastError:                        feedbackItem.LastError,
			ConsecutiveFailures:              feedbackItem.ConsecutiveFailures,
			StallCount:                       feedbackItem.StallCount,
			LastSendDurationMs:               feedbackItem.LastSendDurationMs,
			QualityWindowSampleCount:         feedbackItem.QualityWindowSampleCount,
			QualityWindowFailureCount:        feedbackItem.QualityWindowFailureCount,
			QualityWindowDropCount:           feedbackItem.QualityWindowDropCount,
			QualityWindowSlowCount:           feedbackItem.QualityWindowSlowCount,
			OldHops:                          append([]string{}, primary.Hops...),
			ReplacementHops:                  append([]string{}, replacement.Hops...),
			Payload:                          payload,
		})
		if err != nil {
			return nil, err
		}
		if status != "applied" {
			continue
		}

		decision := RoutePathDecision{
			DecisionID:           command.PrimaryRouteID + "-path-" + input.NodeID + "-service-channel-remediation",
			RouteID:              command.PrimaryRouteID,
			ReplacementRouteID:   replacement.RouteID,
			RebuildRequestID:     command.CommandID,
			RebuildStatus:        "applied",
			RebuildReason:        reason,
			ClusterID:            input.ClusterID,
			LocalNodeID:          input.NodeID,
			SourceNodeID:         primary.SourceNodeID,
			DestinationNodeID:    primary.DestinationNodeID,
			OriginalHops:         append([]string{}, primary.Hops...),
			EffectiveHops:        append([]string{}, replacement.Hops...),
			DecisionSource:       "service_channel_remediation_command",
			Generation:           firstNonEmptyString(generation, command.CommandID),
			PathScore:            serviceChannelReplacementRouteScore(replacement),
			ScoreReasons:         []string{"service_channel_remediation_rebuild_route", "selected_unfenced_alternate_route", "service_channel_rebuild_applied"},
			ControlPlaneOnly:     true,
			ProductionForwarding: false,
			// Expire with whichever comes first: either route, the command, or 60s from now.
			ExpiresAt: minNonZeroTime(primary.ExpiresAt, replacement.ExpiresAt, command.ExpiresAt, now.Add(60*time.Second)).UTC(),
		}
		decision.PreviousHopID, decision.NextHopID, decision.LocalRole = routePathLocalPosition(decision.EffectiveHops, input.NodeID, "", "")
		decisions = append(decisions, decision)
	}
	return decisions, nil
}
// fabricServiceChannelLeaseForRemediationCommand looks up the unexpired lease
// that a remediation command refers to. Leases are listed for the command's
// service class (VPN packets when unset), entry node and resource, and the
// first one whose trimmed channel ID equals the command's trimmed channel ID
// is returned. The boolean is false when no such lease is listed.
func (s *Service) fabricServiceChannelLeaseForRemediationCommand(ctx context.Context, clusterID string, nodeID string, command FabricServiceChannelAccessRemediationCommand, now time.Time) (FabricServiceChannelLease, bool, error) {
	query := ListFabricServiceChannelLeasesInput{
		ClusterID:      clusterID,
		ServiceClass:   firstNonEmptyString(command.ServiceClass, FabricServiceClassVPNPackets),
		EntryNodeID:    nodeID,
		ResourceID:     command.ResourceID,
		IncludeExpired: false,
		Limit:          100,
		Now:            now.UTC(),
	}
	records, err := s.store.ListFabricServiceChannelLeases(ctx, query)
	if err != nil {
		return FabricServiceChannelLease{}, false, err
	}

	wantChannelID := strings.TrimSpace(command.ChannelID)
	for _, record := range records {
		if strings.TrimSpace(record.ChannelID) != wantChannelID {
			continue
		}
		return record.Lease, true, nil
	}
	return FabricServiceChannelLease{}, false, nil
}
// syntheticRouteByID builds synthetic routes from intents, in order, and
// returns the first one whose RouteID equals the trimmed routeID. A blank
// routeID never matches.
func (s *Service) syntheticRouteByID(input GetNodeSyntheticMeshConfigInput, intents []MeshRouteIntent, routeID string) (SyntheticMeshRouteConfig, bool) {
	wanted := strings.TrimSpace(routeID)
	if wanted != "" {
		for _, intent := range intents {
			candidate, _, _, _, _, built := s.syntheticRouteFromIntent(input, intent)
			if !built || candidate.RouteID != wanted {
				continue
			}
			return candidate, true
		}
	}
	return SyntheticMeshRouteConfig{}, false
}
// minNonZeroTime returns the earliest of the non-zero times in items. It
// returns the zero time when items is empty or contains only zero times.
func minNonZeroTime(items ...time.Time) time.Time {
	earliest := time.Time{}
	for i := range items {
		candidate := items[i]
		switch {
		case candidate.IsZero():
			// Zero times carry no information and are skipped.
		case earliest.IsZero(), candidate.Before(earliest):
			earliest = candidate
		}
	}
	return earliest
}
// fabricServiceChannelFirstAuthorizedAlternate returns the first route in
// routes that has a non-blank ID, is not the primary route, and has status
// "authorized". The boolean is false when no such route exists.
func fabricServiceChannelFirstAuthorizedAlternate(routes []FabricServiceChannelRoute, primaryRouteID string) (FabricServiceChannelRoute, bool) {
	for _, candidate := range routes {
		blank := strings.TrimSpace(candidate.RouteID) == ""
		isPrimary := candidate.RouteID == primaryRouteID
		if blank || isPrimary || candidate.Status != "authorized" {
			continue
		}
		return candidate, true
	}
	return FabricServiceChannelRoute{}, false
}
// fabricServiceChannelRouteAllowedByLeasePool checks a route against the
// lease's entry and exit pools and returns a (status, reason) pair.
//
// The route is "rejected" when its ID is blank, when its source node is not in
// a non-empty entry pool, or when its destination node is not in a non-empty
// exit pool. An empty pool places no restriction on that side. Otherwise the
// route is "allowed".
func fabricServiceChannelRouteAllowedByLeasePool(lease FabricServiceChannelLease, route FabricServiceChannelRoute) (string, string) {
	if strings.TrimSpace(route.RouteID) == "" {
		return "rejected", "replacement_route_missing"
	}

	entryAllowed := len(lease.EntryPool) == 0
	for i := 0; !entryAllowed && i < len(lease.EntryPool); i++ {
		entryAllowed = lease.EntryPool[i].NodeID == route.SourceNodeID
	}
	if !entryAllowed {
		return "rejected", "replacement_entry_outside_signed_pool_policy"
	}

	exitAllowed := len(lease.ExitPool) == 0
	for i := 0; !exitAllowed && i < len(lease.ExitPool); i++ {
		exitAllowed = lease.ExitPool[i].NodeID == route.DestinationNodeID
	}
	if !exitAllowed {
		return "rejected", "replacement_exit_outside_signed_pool_policy"
	}

	return "allowed", "lease_pool_policy_allows_route"
}
// fabricServiceChannelLeaseSummaryFromRecord flattens a stored lease record
// into a summary.
//
// Record-level values are preferred over lease-level values for the resource
// ID, service class, selected entry node and expiry. ForceBackendFallback is
// set when the lease is in degraded-fallback status or its primary route
// status is "missing_route_intent". Expired is true when a non-zero expiry is
// not after now; a zero now means the current wall-clock time.
func fabricServiceChannelLeaseSummaryFromRecord(record FabricServiceChannelLeaseRecord, now time.Time) FabricServiceChannelLeaseSummary {
	if now.IsZero() {
		now = time.Now().UTC()
	}
	lease := record.Lease

	expiresAt := record.ExpiresAt
	if expiresAt.IsZero() {
		expiresAt = lease.ExpiresAt
	}
	forceFallback := lease.Status == FabricServiceChannelStatusDegradedFallback ||
		lease.PrimaryRoute.Status == "missing_route_intent"

	return FabricServiceChannelLeaseSummary{
		ClusterID:            record.ClusterID,
		ChannelID:            record.ChannelID,
		ResourceID:           firstNonEmptyString(record.ResourceID, lease.ResourceID),
		ServiceClass:         firstNonEmptyString(record.ServiceClass, lease.ServiceClass),
		Status:               lease.Status,
		SelectedEntryNodeID:  firstNonEmptyString(record.SelectedEntryNodeID, lease.SelectedEntryNodeID),
		SelectedExitNodeID:   lease.SelectedExitNodeID,
		AllowedChannels:      append([]string{}, lease.AllowedChannels...),
		PrimaryRouteID:       strings.TrimSpace(lease.PrimaryRoute.RouteID),
		PrimaryRouteStatus:   strings.TrimSpace(lease.PrimaryRoute.Status),
		DataPlane:            lease.DataPlane,
		ForceBackendFallback: forceFallback,
		IssuedAt:             lease.IssuedAt,
		ExpiresAt:            expiresAt,
		CreatedAt:            record.CreatedAt,
		UpdatedAt:            record.UpdatedAt,
		Expired:              !expiresAt.IsZero() && !expiresAt.After(now.UTC()),
	}
}
// fabricServiceChannelLeaseCacheKey builds the lease-cache key
// "<clusterID>/<channelID>" from the whitespace-trimmed identifiers.
func fabricServiceChannelLeaseCacheKey(clusterID string, channelID string) string {
	parts := []string{strings.TrimSpace(clusterID), strings.TrimSpace(channelID)}
	return strings.Join(parts, "/")
}
// signFabricServiceChannelLease signs a lease with the cluster authority key
// and returns the lease with AuthorityPayload and AuthoritySignature set.
//
// The signed payload is a copy of the lease's identifying fields, pools,
// routes and policies. The bearer token itself is not included, only its
// SHA-256 hash. On any error the input lease is returned unmodified together
// with the error.
func (s *Service) signFabricServiceChannelLease(ctx context.Context, lease FabricServiceChannelLease) (FabricServiceChannelLease, error) {
	authorityKey, err := s.ensureClusterAuthority(ctx, lease.ClusterID, nil)
	if err != nil {
		return lease, err
	}

	var payload FabricServiceChannelLeaseAuthorityPayload
	payload.SchemaVersion = "rap.fabric_service_channel_lease_authority.v1"
	payload.ChannelID = lease.ChannelID
	payload.ClusterID = lease.ClusterID
	payload.OrganizationID = lease.OrganizationID
	payload.UserID = lease.UserID
	payload.ResourceID = lease.ResourceID
	payload.ServiceClass = lease.ServiceClass
	payload.Status = lease.Status
	payload.SelectedEntryNodeID = lease.SelectedEntryNodeID
	payload.SelectedExitNodeID = lease.SelectedExitNodeID
	// Slices are copied so the payload does not alias the lease's backing arrays.
	payload.EntryPool = append([]FabricServiceChannelNodeCandidate{}, lease.EntryPool...)
	payload.ExitPool = append([]FabricServiceChannelNodeCandidate{}, lease.ExitPool...)
	payload.AllowedChannels = append([]string{}, lease.AllowedChannels...)
	payload.PrimaryRoute = lease.PrimaryRoute
	payload.RecoveryPolicy = lease.RecoveryPolicy
	payload.PoolPolicy = lease.PoolPolicy
	payload.DataPlane = lease.DataPlane
	payload.RouteGeneration = lease.RouteGeneration
	payload.FencingEpoch = lease.FencingEpoch
	payload.TokenHash = fabricServiceChannelTokenHash(lease.Token.Token)
	payload.IssuedAt = lease.IssuedAt
	payload.ExpiresAt = lease.ExpiresAt

	rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, s.now())
	if err != nil {
		return lease, err
	}
	lease.AuthorityPayload = rawPayload
	lease.AuthoritySignature = &signature
	return lease, nil
}
// fabricServiceChannelTokenHash returns the lowercase hex SHA-256 digest of
// the whitespace-trimmed token.
func fabricServiceChannelTokenHash(token string) string {
	hasher := sha256.New()
	// hash.Hash.Write is documented to never return an error.
	hasher.Write([]byte(strings.TrimSpace(token)))
	return hex.EncodeToString(hasher.Sum(nil))
}
// normalizeFabricServiceClass canonicalizes a service-class name by
// lower-casing it and stripping surrounding whitespace.
func normalizeFabricServiceClass(value string) string {
	lowered := strings.ToLower(value)
	return strings.TrimSpace(lowered)
}
// isAllowedFabricServiceClass reports whether value is exactly one of the
// supported fabric service classes: VPN packets, remote workspace, file
// transfer or video. The comparison is case- and whitespace-sensitive.
func isAllowedFabricServiceClass(value string) bool {
	return value == FabricServiceClassVPNPackets ||
		value == FabricServiceClassRemoteWorkspace ||
		value == FabricServiceClassFileTransfer ||
		value == FabricServiceClassVideo
}
// normalizeFabricServiceChannels returns the de-duplicated channels when any
// were supplied. Otherwise it returns the default channel set for
// serviceClass, falling back to control + reliable for unknown classes.
func normalizeFabricServiceChannels(channels []string, serviceClass string) []string {
	if explicit := dedupeStrings(channels); len(explicit) > 0 {
		return explicit
	}
	if serviceClass == FabricServiceClassVPNPackets {
		return []string{FabricChannelControl, FabricChannelBulk, "vpn_packet"}
	}
	if serviceClass == FabricServiceClassRemoteWorkspace {
		return []string{FabricChannelControl, FabricChannelInteractive, FabricChannelReliable, FabricChannelDroppable}
	}
	if serviceClass == FabricServiceClassVideo {
		return []string{FabricChannelControl, FabricChannelInteractive, FabricChannelDroppable}
	}
	if serviceClass == FabricServiceClassFileTransfer {
		return []string{FabricChannelControl, FabricChannelReliable, FabricChannelBulk}
	}
	return []string{FabricChannelControl, FabricChannelReliable}
}
// normalizeFabricRequiredRoles returns the de-duplicated roles when any were
// supplied. Otherwise it returns the default node roles for serviceClass:
// always "entry-node", plus a class-specific role for the known classes.
func normalizeFabricRequiredRoles(roles []string, serviceClass string) []string {
	if explicit := dedupeStrings(roles); len(explicit) > 0 {
		return explicit
	}
	if serviceClass == FabricServiceClassVPNPackets {
		return []string{"entry-node", "vpn-exit"}
	}
	if serviceClass == FabricServiceClassRemoteWorkspace {
		return []string{"entry-node", "rdp-worker"}
	}
	if serviceClass == FabricServiceClassVideo {
		return []string{"entry-node", "video-relay"}
	}
	if serviceClass == FabricServiceClassFileTransfer {
		return []string{"entry-node", "file-storage-cache"}
	}
	return []string{"entry-node"}
}
// selectFabricServiceChannelPreferredNode picks a node from nodeIDs: the
// trimmed preferred node when it is non-empty and present in nodeIDs,
// otherwise the (trimmed) first entry, or "" when nodeIDs is empty.
func selectFabricServiceChannelPreferredNode(nodeIDs []string, preferred string) string {
	if want := strings.TrimSpace(preferred); want != "" && containsString(nodeIDs, want) {
		return want
	}
	if len(nodeIDs) > 0 {
		return strings.TrimSpace(nodeIDs[0])
	}
	return ""
}
// fabricServiceChannelEffectivePool combines a requested node pool with a
// policy node pool. With no policy the requested pool is used as-is; with no
// request the policy pool is used; otherwise the result is the requested nodes
// that the policy also lists, in requested order.
func fabricServiceChannelEffectivePool(requested []string, policy []string) []string {
	requested = dedupeStrings(requested)
	policy = dedupeStrings(policy)
	switch {
	case len(policy) == 0:
		return requested
	case len(requested) == 0:
		return policy
	}

	permitted := []string{}
	for _, nodeID := range requested {
		if !containsString(policy, nodeID) {
			continue
		}
		permitted = append(permitted, nodeID)
	}
	return dedupeStrings(permitted)
}
// fabricServiceFailoverFromPoolPolicy renders the failover-related settings of
// a pool policy as a JSON object string. The policy is first normalized
// against the default pool policy. If encoding fails, the default failover
// string is returned instead.
func fabricServiceFailoverFromPoolPolicy(policy FabricServiceChannelPoolPolicy) string {
	normalized := normalizeFabricServiceChannelPoolPolicy(policy, defaultFabricServiceChannelPoolPolicy())
	fields := map[string]any{
		"route_rebuild":            normalized.RouteRebuild,
		"entry_failover":           normalized.EntryFailover,
		"exit_failover":            normalized.ExitFailover,
		"sticky_session":           normalized.StickySession,
		"backend_fallback_allowed": normalized.BackendFallbackAllowed,
		"selection_strategy":       normalized.SelectionStrategy,
		"pool_policy_fingerprint":  normalized.Fingerprint,
	}
	if encoded, err := json.Marshal(fields); err == nil {
		return string(encoded)
	}
	return defaultFabricServiceFailover()
}
// fabricServiceChannelNodePool converts node IDs into pool candidates that
// share the given role. Priority is the 1-based position in nodeIDs; the node
// equal to selected gets status "selected", all others "candidate". Every
// candidate carries an empty JSON object as metadata.
func fabricServiceChannelNodePool(nodeIDs []string, role string, selected string) []FabricServiceChannelNodeCandidate {
	pool := make([]FabricServiceChannelNodeCandidate, len(nodeIDs))
	for position, nodeID := range nodeIDs {
		candidate := FabricServiceChannelNodeCandidate{
			NodeID:   nodeID,
			Role:     role,
			Priority: position + 1,
			Status:   "candidate",
			Metadata: json.RawMessage(`{}`),
		}
		if nodeID == selected {
			candidate.Status = "selected"
		}
		pool[position] = candidate
	}
	return pool
}
// fabricServiceChannelRouteFeedback is the per-route feedback record that the
// service passes around as map[string]fabricServiceChannelRouteFeedback; the
// consumers in this file look entries up by a route ID.
//
// Consumers in this file derive a feedback status from it in this order:
// Fenced -> "fenced", ScoreAdjustment < 0 -> "degraded", non-empty RouteID ->
// "healthy". The zero value therefore means "no feedback".
type fabricServiceChannelRouteFeedback struct {
// Identity of the observation. NOTE(review): the exact producers of
// ObservationID/Source/ChannelID/ResourceID are outside this view — confirm.
RouteID string
ObservationID string
Source string
ChannelID string
ResourceID string
// Data-plane violation details. NOTE(review): semantics assumed from the
// field names; confirm against the code that populates them.
ViolationStatus string
ViolationReason string
// Fenced marks the route as fenced; it takes precedence over ScoreAdjustment
// when a status is derived.
Fenced bool
ManualRetry bool
// Staleness/provenance flags. NOTE(review): presumably set when the feedback
// does not match the current route provenance — confirm.
StalePolicy bool
StaleGeneration bool
ProvenanceMissing bool
StaleReason string
// ScoreAdjustment is copied into telemetry and rebuild attempts; a negative
// value marks the route as degraded when it is not fenced.
ScoreAdjustment int
// Reasons, LastError and the counters below are copied verbatim into rebuild
// attempt records and channel telemetry.
Reasons []string
LastError string
ConsecutiveFailures int
StallCount int
LastSendDurationMs int64
DegradedFallbackRecommended bool
RouteRebuildRecommended bool
// Quality-window counters. NOTE(review): window length and sampling rules are
// not visible here.
QualityWindowSampleCount int
QualityWindowSuccessCount int
QualityWindowFailureCount int
QualityWindowSlowCount int
QualityWindowDropCount int
// ObservedAt is surfaced as the channel's RouteFeedbackObservedAt.
ObservedAt time.Time
ExpiresAt time.Time
// RetryCooldownUntil is optional (nil when unset). NOTE(review): presumably
// the end of a manual-retry cooldown — confirm.
RetryCooldownUntil *time.Time
}
// fabricServiceChannelRouteProvenance records which version of a route and of
// its policy a piece of route feedback should be compared against.
// fabricServiceChannelRouteProvenanceFromIntents builds one entry per route
// intent, keyed by the intent ID.
type fabricServiceChannelRouteProvenance struct {
// RouteID is the ID of the route intent the entry was built from.
RouteID string
// RouteVersion is the policy's route version, or the intent's UpdatedAt
// (RFC 3339, UTC) when the policy does not name one.
RouteVersion string
// PolicyVersion is the policy's policy version, defaulting to RouteVersion.
PolicyVersion string
// RouteGeneration is populated with the same value as PolicyVersion by
// fabricServiceChannelRouteProvenanceFromIntents.
RouteGeneration string
}
// fabricServiceChannelRouteProvenanceFromIntents builds a provenance entry for
// every route intent with a non-blank ID, keyed by that ID.
//
// The route version comes from the intent's policy JSON and defaults to the
// intent's UpdatedAt timestamp (RFC 3339, UTC). The policy version comes from
// the policy JSON and defaults to the route version. The route generation is
// set to the policy version.
func fabricServiceChannelRouteProvenanceFromIntents(intents []MeshRouteIntent) map[string]fabricServiceChannelRouteProvenance {
	byRoute := map[string]fabricServiceChannelRouteProvenance{}
	for _, intent := range intents {
		if strings.TrimSpace(intent.ID) == "" {
			continue
		}

		// A policy that fails to decode is tolerated: the version fields it
		// leaves empty are replaced by the defaults below.
		var decoded syntheticRoutePolicy
		_ = json.Unmarshal(intent.Policy, &decoded)

		version := strings.TrimSpace(decoded.RouteVersion)
		if version == "" {
			version = intent.UpdatedAt.UTC().Format(time.RFC3339)
		}
		policyVersion := strings.TrimSpace(decoded.PolicyVersion)
		if policyVersion == "" {
			policyVersion = version
		}

		entry := fabricServiceChannelRouteProvenance{RouteID: intent.ID}
		entry.RouteVersion = version
		entry.PolicyVersion = policyVersion
		entry.RouteGeneration = policyVersion
		byRoute[intent.ID] = entry
	}
	return byRoute
}
// fabricServiceChannelRouteFeedback collects VPN-packet route feedback reported
// by the given entry nodes into a single map.
//
// For each distinct, non-blank node ID it merges, in order:
//  1. feedback derived from the node's current observations,
//  2. manual-retry feedback derived from the node's observations including
//     expired ones,
//  3. only when the node has no current observations: feedback derived from
//     its latest heartbeat, provided that heartbeat is no older than
//     fabricServiceChannelFeedbackMaxAge.
//
// The recovery policy is normalized against the default policy first. Any
// store error aborts collection and is returned.
func (s *Service) fabricServiceChannelRouteFeedback(ctx context.Context, clusterID string, entryNodeIDs []string, now time.Time, policy FabricServiceChannelRecoveryPolicy, routeProvenance map[string]fabricServiceChannelRouteProvenance) (map[string]fabricServiceChannelRouteFeedback, error) {
	merged := map[string]fabricServiceChannelRouteFeedback{}
	policy = normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy())

	for _, nodeID := range dedupeStrings(entryNodeIDs) {
		if strings.TrimSpace(nodeID) == "" {
			continue
		}

		query := ListFabricServiceChannelRouteFeedbackInput{
			ClusterID:      clusterID,
			ReporterNodeID: nodeID,
			ServiceClass:   FabricServiceClassVPNPackets,
			Now:            now,
		}
		current, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, query)
		if err != nil {
			return nil, err
		}
		mergeFabricServiceChannelRouteFeedback(merged, fabricServiceChannelRouteFeedbackFromObservationsWithProvenance(current, now, policy, routeProvenance))

		// Second pass over the same reporter, this time including expired observations.
		query.IncludeExpired = true
		withExpired, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, query)
		if err != nil {
			return nil, err
		}
		mergeFabricServiceChannelRouteFeedback(merged, fabricServiceChannelManualRetryFeedbackFromObservationsWithProvenance(withExpired, now, policy, routeProvenance))

		// The heartbeat is only a fallback for nodes without current observations.
		if len(current) > 0 {
			continue
		}
		heartbeats, err := s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, 1)
		if err != nil {
			return nil, err
		}
		if len(heartbeats) == 0 {
			continue
		}
		latest := heartbeats[0]
		if now.Sub(latest.ObservedAt.UTC()) > fabricServiceChannelFeedbackMaxAge {
			continue
		}
		mergeFabricServiceChannelRouteFeedback(merged, fabricServiceChannelRouteFeedbackFromHeartbeatWithProvenance(latest, now, policy, routeProvenance))
	}
	return merged, nil
}
// fabricServiceChannelRecoveryPolicy returns the recovery policy configured on
// the cluster. If the cluster cannot be loaded, the default recovery policy is
// returned and the error is dropped.
func (s *Service) fabricServiceChannelRecoveryPolicy(ctx context.Context, clusterID string) FabricServiceChannelRecoveryPolicy {
	if cluster, err := s.store.GetCluster(ctx, strings.TrimSpace(clusterID)); err == nil {
		return fabricServiceChannelRecoveryPolicyFromCluster(cluster)
	}
	return defaultFabricServiceChannelRecoveryPolicy()
}
// recordFabricServiceChannelRouteFeedback persists the VPN-packet route
// feedback carried by a node heartbeat.
//
// Heartbeats without a cluster ID or node ID are ignored. Feedback expires
// fabricServiceChannelFeedbackMaxAge after the heartbeat's observation time
// (the service clock when the heartbeat has none). Inputs derived from the
// heartbeat itself are stored first; inputs derived from the heartbeat's access
// report are computed and stored afterwards. The first store error is returned.
func (s *Service) recordFabricServiceChannelRouteFeedback(ctx context.Context, heartbeat NodeHeartbeat) error {
	if strings.TrimSpace(heartbeat.ClusterID) == "" {
		return nil
	}
	if strings.TrimSpace(heartbeat.NodeID) == "" {
		return nil
	}

	observedAt := heartbeat.ObservedAt.UTC()
	if observedAt.IsZero() {
		observedAt = s.now().UTC()
	}
	expiresAt := observedAt.Add(fabricServiceChannelFeedbackMaxAge)

	persist := func(inputs []RecordFabricServiceChannelRouteFeedbackInput) error {
		for _, input := range inputs {
			if _, err := s.store.RecordFabricServiceChannelRouteFeedback(ctx, input); err != nil {
				return err
			}
		}
		return nil
	}

	if err := persist(fabricServiceChannelRouteFeedbackInputsFromHeartbeat(heartbeat, FabricServiceClassVPNPackets, expiresAt)); err != nil {
		return err
	}
	// Computed only after the heartbeat feedback has been written, matching
	// the order in which the store is consulted.
	return persist(s.fabricServiceChannelRouteFeedbackInputsFromAccessReport(ctx, heartbeat, FabricServiceClassVPNPackets, expiresAt))
}
// fabricServiceChannelRouteFeedbackInputsFromAccessReport derives "fenced"
// route feedback from the "fabric_service_channel_access_report" section of a
// heartbeat's metadata.
//
// Inputs are produced only when the report counts at least one
// "fabric_route_send_failure" and its "last_data_plane_violation_status" is
// "fabric_route_send_failed_backend_fallback_blocked". For every non-expired
// lease whose entry node is the reporting node (at most 100 are listed), the
// lease's primary route is fenced unless the lease forces backend fallback or
// the route already has active access-report feedback from this reporter.
//
// The observation time is the heartbeat's ObservedAt; when that is zero the
// service's injectable clock is used, matching
// recordFabricServiceChannelRouteFeedback, so the caller-supplied expiresAt
// and the observation time come from the same clock. The wall clock is only a
// last resort when no clock was configured.
//
// The method is best effort: missing or invalid metadata, a store error, or an
// empty lease list all yield nil.
func (s *Service) fabricServiceChannelRouteFeedbackInputsFromAccessReport(ctx context.Context, heartbeat NodeHeartbeat, serviceClass string, expiresAt time.Time) []RecordFabricServiceChannelRouteFeedbackInput {
	if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) {
		return nil
	}
	report := jsonMapPath(jsonObject(heartbeat.Metadata), "fabric_service_channel_access_report")
	if len(report) == 0 {
		return nil
	}
	// Read once; the value gates the report and is copied into every input.
	sendFailures := jsonInt(report, "fabric_route_send_failure")
	if sendFailures <= 0 {
		return nil
	}
	status := jsonString(report, "last_data_plane_violation_status")
	if status != "fabric_route_send_failed_backend_fallback_blocked" {
		return nil
	}
	observedAt := heartbeat.ObservedAt.UTC()
	if observedAt.IsZero() {
		if s.now != nil {
			observedAt = s.now().UTC()
		} else {
			observedAt = time.Now().UTC()
		}
	}
	records, err := s.store.ListFabricServiceChannelLeases(ctx, ListFabricServiceChannelLeasesInput{
		ClusterID:      heartbeat.ClusterID,
		EntryNodeID:    heartbeat.NodeID,
		ServiceClass:   serviceClass,
		IncludeExpired: false,
		Limit:          100,
		Now:            observedAt,
	})
	if err != nil || len(records) == 0 {
		return nil
	}
	reason := firstNonEmptyString(jsonString(report, "last_data_plane_violation_reason"), "fabric_route_send_failed_backend_fallback_blocked")
	out := make([]RecordFabricServiceChannelRouteFeedbackInput, 0, len(records))
	for _, record := range records {
		summary := fabricServiceChannelLeaseSummaryFromRecord(record, observedAt)
		routeID := strings.TrimSpace(summary.PrimaryRouteID)
		if summary.Expired || routeID == "" || summary.ForceBackendFallback {
			continue
		}
		// Skip routes this reporter has already fenced or degraded from an
		// access report, so repeated heartbeats do not pile up duplicates.
		if s.fabricServiceChannelHasActiveAccessReportRouteFeedback(ctx, heartbeat.ClusterID, heartbeat.NodeID, routeID, serviceClass, observedAt) {
			continue
		}
		out = append(out, RecordFabricServiceChannelRouteFeedbackInput{
			ClusterID:           heartbeat.ClusterID,
			ReporterNodeID:      heartbeat.NodeID,
			RouteID:             routeID,
			ServiceClass:        serviceClass,
			FeedbackStatus:      "fenced",
			ScoreAdjustment:     -1030,
			Reasons:             []string{"service_channel_route_rebuild_recommended", "data_plane_fabric_route_send_failed", "backend_fallback_blocked_by_policy"},
			LastError:           reason,
			ConsecutiveFailures: maxInt(1, sendFailures),
			Payload: mustJSONRaw(map[string]any{
				"source":                           "fabric_service_channel_access_report",
				"channel_id":                       summary.ChannelID,
				"resource_id":                      summary.ResourceID,
				"last_data_plane_violation_status": status,
				"last_data_plane_violation_reason": reason,
				"backend_fallback_blocked":         jsonInt(report, "backend_fallback_blocked"),
				"fabric_route_send_failure":        sendFailures,
				"last_backend_relay_policy":        jsonString(report, "last_backend_relay_policy"),
				"last_working_data_transport":      jsonString(report, "last_working_data_transport"),
				"last_steady_state_transport":      jsonString(report, "last_steady_state_transport"),
			}),
			ObservedAt: observedAt,
			ExpiresAt:  expiresAt,
		})
	}
	return out
}
// fabricServiceChannelHasActiveAccessReportRouteFeedback reports whether the
// given reporter already has non-expired access-report feedback for a route.
//
// An observation counts when its status is "fenced" or "degraded" and it
// either carries the "data_plane_fabric_route_send_failed" reason or its
// payload names "fabric_service_channel_access_report" as the source. A store
// error is treated as "no active feedback" and yields false.
func (s *Service) fabricServiceChannelHasActiveAccessReportRouteFeedback(ctx context.Context, clusterID, reporterNodeID, routeID, serviceClass string, observedAt time.Time) bool {
	query := ListFabricServiceChannelRouteFeedbackInput{
		ClusterID:      clusterID,
		ReporterNodeID: reporterNodeID,
		RouteID:        routeID,
		ServiceClass:   serviceClass,
		IncludeExpired: false,
		Now:            observedAt,
	}
	observations, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, query)
	if err != nil {
		return false
	}
	for i := range observations {
		current := observations[i]
		switch current.FeedbackStatus {
		case "fenced", "degraded":
			// Only these statuses can represent an active send-failure fence.
		default:
			continue
		}
		if containsString(current.Reasons, "data_plane_fabric_route_send_failed") {
			return true
		}
		if jsonString(jsonObject(current.Payload), "source") == "fabric_service_channel_access_report" {
			return true
		}
	}
	return false
}
// fabricServiceChannelRuntimeHeartbeat is the decoding target for the
// "fabric_service_channel_runtime_report" section of a node heartbeat's
// metadata. Only the parts read when deriving route feedback are declared.
type fabricServiceChannelRuntimeHeartbeat struct {
// SchemaVersion must be non-empty for the report to be processed.
SchemaVersion string `json:"schema_version"`
// ConfigVersion is copied into feedback payloads as "observed_config_version".
ConfigVersion string `json:"config_version"`
Ingress struct {
FlowScheduler struct {
// ChannelStats holds one runtime stat per map key; an empty map means the
// report yields no feedback. NOTE(review): keys are presumably channel
// identifiers; confirm against the reporting node.
ChannelStats map[string]fabricServiceChannelRuntimeChannelStat `json:"channel_stats"`
} `json:"flow_scheduler"`
} `json:"ingress"`
}
// fabricServiceChannelRuntimeChannelStat is one entry of the runtime report's
// "channel_stats" map. The whole struct is re-marshalled into the payload of
// the feedback derived from it, so its JSON keys are also payload keys.
type fabricServiceChannelRuntimeChannelStat struct {
// LastRouteID is the route reported as "healthy" when the stat shows a fresh success.
LastRouteID string `json:"last_route_id"`
// RoutePolicyVersion, RouteGeneration and RecoveryPolicyFingerprint are read
// back from the feedback payload when provenance is annotated.
RoutePolicyVersion string `json:"route_policy_version,omitempty"`
RouteGeneration string `json:"route_generation,omitempty"`
RecoveryPolicyFingerprint string `json:"recovery_policy_fingerprint,omitempty"`
// LastFailedRouteID is the route reported as "degraded" or "fenced".
LastFailedRouteID string `json:"last_failed_route_id"`
// The two fields below are not read directly in this part of the file; they
// travel in the payload only.
LastFailedRoutePolicyVersion string `json:"last_failed_route_policy_version,omitempty"`
LastFailedRouteGeneration string `json:"last_failed_route_generation,omitempty"`
// LastError is trimmed and copied into failure feedback.
LastError string `json:"last_error"`
// ConsecutiveFailures, StallCount and LastSendDurationMillis are the fallback
// figures used when the quality window has no samples (or, for latency, no
// positive window average).
ConsecutiveFailures int `json:"consecutive_failures"`
StallCount int `json:"stall_count"`
LastSendDurationMillis int64 `json:"last_send_duration_ms"`
// Either recommendation flag escalates failure feedback to "fenced" and, when
// no window samples exist, blocks the stat from counting as a fresh success.
RouteRebuildRecommended bool `json:"route_rebuild_recommended"`
DegradedFallbackRecommended bool `json:"degraded_fallback_recommended"`
// QualityWindow* describe a rolling window; a positive sample count makes the
// window figures take precedence over the fallback counters above.
QualityWindowSampleCount int `json:"quality_window_sample_count"`
QualityWindowSuccessCount int `json:"quality_window_success_count"`
QualityWindowFailureCount int `json:"quality_window_failure_count"`
QualityWindowSlowCount int `json:"quality_window_slow_count"`
QualityWindowDropCount int `json:"quality_window_drop_count"`
QualityWindowAvgLatencyMs int64 `json:"quality_window_avg_latency_ms"`
// QualityWindowLastUpdatedAt is not parsed in this part of the file.
QualityWindowLastUpdatedAt string `json:"quality_window_last_updated_at"`
}
// fabricServiceChannelRouteFeedbackFromHeartbeat builds per-route feedback from
// a heartbeat using the default recovery policy and no route provenance.
func fabricServiceChannelRouteFeedbackFromHeartbeat(heartbeat NodeHeartbeat, now time.Time) map[string]fabricServiceChannelRouteFeedback {
	defaultPolicy := defaultFabricServiceChannelRecoveryPolicy()
	return fabricServiceChannelRouteFeedbackFromHeartbeatWithProvenance(heartbeat, now, defaultPolicy, nil)
}
// fabricServiceChannelRouteFeedbackFromHeartbeatWithProvenance turns the
// runtime report of a single heartbeat into per-route feedback keyed by route
// ID, without consulting the store.
//
// Each derived input is annotated with policy and route provenance. When the
// annotation marks it as stale (policy or generation), the score is clamped by
// fabricServiceChannelConservativeStaleScore and the fenced, route-rebuild and
// degraded-fallback signals are dropped. If several inputs share a route ID,
// the one processed last replaces the earlier ones.
func fabricServiceChannelRouteFeedbackFromHeartbeatWithProvenance(heartbeat NodeHeartbeat, now time.Time, policy FabricServiceChannelRecoveryPolicy, routeProvenance map[string]fabricServiceChannelRouteProvenance) map[string]fabricServiceChannelRouteFeedback {
	feedbackByRoute := map[string]fabricServiceChannelRouteFeedback{}
	expiresAt := now.Add(fabricServiceChannelFeedbackMaxAge)
	inputs := fabricServiceChannelRouteFeedbackInputsFromHeartbeat(heartbeat, FabricServiceClassVPNPackets, expiresAt)
	for _, input := range inputs {
		// Copy the reasons so annotation cannot alias the input's slice.
		reasonsCopy := append(make([]string, 0, len(input.Reasons)), input.Reasons...)
		annotated := fabricServiceChannelAnnotateFeedbackProvenance(FabricServiceChannelRouteFeedbackObservation{
			ClusterID:           input.ClusterID,
			ReporterNodeID:      input.ReporterNodeID,
			RouteID:             input.RouteID,
			ServiceClass:        input.ServiceClass,
			FeedbackStatus:      input.FeedbackStatus,
			ScoreAdjustment:     input.ScoreAdjustment,
			Reasons:             reasonsCopy,
			LastError:           input.LastError,
			ConsecutiveFailures: input.ConsecutiveFailures,
			StallCount:          input.StallCount,
			LastSendDurationMs:  input.LastSendDurationMs,
			Payload:             input.Payload,
			ObservedAt:          input.ObservedAt,
			ExpiresAt:           input.ExpiresAt,
		}, policy, routeProvenance)

		stale := annotated.StalePolicy || annotated.StaleGeneration
		score := input.ScoreAdjustment
		if stale {
			score = fabricServiceChannelConservativeStaleScore(score)
		}

		feedbackByRoute[input.RouteID] = fabricServiceChannelRouteFeedback{
			RouteID:                     input.RouteID,
			Fenced:                      !stale && input.FeedbackStatus == "fenced",
			StalePolicy:                 annotated.StalePolicy,
			StaleGeneration:             annotated.StaleGeneration,
			ProvenanceMissing:           annotated.ProvenanceMissing,
			StaleReason:                 annotated.StaleReason,
			ScoreAdjustment:             score,
			Reasons:                     annotated.Reasons,
			LastError:                   input.LastError,
			ConsecutiveFailures:         input.ConsecutiveFailures,
			StallCount:                  input.StallCount,
			LastSendDurationMs:          input.LastSendDurationMs,
			DegradedFallbackRecommended: !stale && containsString(input.Reasons, "service_channel_degraded_fallback_recommended"),
			RouteRebuildRecommended:     !stale && containsString(input.Reasons, "service_channel_route_rebuild_recommended"),
			QualityWindowSampleCount:    fabricServiceChannelFeedbackPayloadInt(input.Payload, "quality_window_sample_count"),
			QualityWindowSuccessCount:   fabricServiceChannelFeedbackPayloadInt(input.Payload, "quality_window_success_count"),
			QualityWindowFailureCount:   fabricServiceChannelFeedbackPayloadInt(input.Payload, "quality_window_failure_count"),
			QualityWindowSlowCount:      fabricServiceChannelFeedbackPayloadInt(input.Payload, "quality_window_slow_count"),
			QualityWindowDropCount:      fabricServiceChannelFeedbackPayloadInt(input.Payload, "quality_window_drop_count"),
			ObservedAt:                  input.ObservedAt,
		}
	}
	return feedbackByRoute
}
// fabricServiceChannelRouteFeedbackInputsFromHeartbeat converts the
// "fabric_service_channel_runtime_report" section of a heartbeat's metadata
// into route feedback inputs for the given service class.
//
// It returns nil when the metadata is empty, is not valid JSON, cannot be
// decoded, has no schema version, or has no channel stats. The observation
// time is the heartbeat's ObservedAt, or the current wall-clock time when that
// is zero; expiresAt is copied into every input unchanged.
//
// For each channel stat, up to two inputs are emitted:
//   - a failure input for LastFailedRouteID when a failure is still active
//     (no rolling window, or the window still counts failures). It is
//     "degraded" (-30) by default and "fenced" (-1030) when a rebuild or
//     degraded fallback is recommended or at least two failures are counted;
//   - a "healthy" input for LastRouteID when the stat shows a fresh success
//     and the route is not the one just reported as failing. Its score is 10
//     plus the quality score.
//
// Channel stats are visited in sorted key order so the returned slice, and
// anything that depends on its order, is deterministic across calls.
func fabricServiceChannelRouteFeedbackInputsFromHeartbeat(heartbeat NodeHeartbeat, serviceClass string, expiresAt time.Time) []RecordFabricServiceChannelRouteFeedbackInput {
	if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) {
		return nil
	}
	var metadata struct {
		Report fabricServiceChannelRuntimeHeartbeat `json:"fabric_service_channel_runtime_report"`
	}
	if err := json.Unmarshal(heartbeat.Metadata, &metadata); err != nil {
		return nil
	}
	channelStats := metadata.Report.Ingress.FlowScheduler.ChannelStats
	if metadata.Report.SchemaVersion == "" || len(channelStats) == 0 {
		return nil
	}
	observedAt := heartbeat.ObservedAt.UTC()
	if observedAt.IsZero() {
		observedAt = time.Now().UTC()
	}

	// Go randomises map iteration order; sort the keys to get a stable result.
	channelKeys := make([]string, 0, len(channelStats))
	for key := range channelStats {
		channelKeys = append(channelKeys, key)
	}
	sort.Strings(channelKeys)

	var out []RecordFabricServiceChannelRouteFeedbackInput
	for _, key := range channelKeys {
		stat := channelStats[key]
		failedRouteID := strings.TrimSpace(stat.LastFailedRouteID)
		rollingFailureCount := fabricServiceChannelRollingFailureCount(stat)
		rollingStallCount := fabricServiceChannelRollingStallCount(stat)
		rollingLatencyMs := fabricServiceChannelRollingLatencyMs(stat)
		rollingWindowActive := stat.QualityWindowSampleCount > 0
		// With a rolling window, a past failure only counts while the window
		// still holds failures; without one, any recorded failed route counts.
		freshFailureActive := failedRouteID != "" && (!rollingWindowActive || rollingFailureCount > 0)
		if freshFailureActive {
			scoreAdjustment := -30
			reasons := []string{"service_channel_recent_route_failure"}
			if rollingWindowActive {
				reasons = append(reasons, "service_channel_rolling_quality_window")
			}
			status := "degraded"
			if stat.RouteRebuildRecommended || stat.DegradedFallbackRecommended || rollingFailureCount >= 2 {
				status = "fenced"
				scoreAdjustment -= 1000
				reasons = append(reasons, "service_channel_route_rebuild_recommended")
				if stat.DegradedFallbackRecommended {
					reasons = append(reasons, "service_channel_degraded_fallback_recommended")
				}
			}
			out = append(out, RecordFabricServiceChannelRouteFeedbackInput{
				ClusterID:           heartbeat.ClusterID,
				ReporterNodeID:      heartbeat.NodeID,
				RouteID:             failedRouteID,
				ServiceClass:        serviceClass,
				FeedbackStatus:      status,
				ScoreAdjustment:     scoreAdjustment,
				Reasons:             dedupeStrings(reasons),
				LastError:           strings.TrimSpace(stat.LastError),
				ConsecutiveFailures: rollingFailureCount,
				StallCount:          rollingStallCount,
				LastSendDurationMs:  rollingLatencyMs,
				Payload:             fabricServiceChannelFeedbackPayload(stat, metadata.Report.ConfigVersion),
				ObservedAt:          observedAt,
				ExpiresAt:           expiresAt,
			})
		}
		successRouteID := strings.TrimSpace(stat.LastRouteID)
		if successRouteID != "" && (!freshFailureActive || successRouteID != failedRouteID) && fabricServiceChannelStatHasFreshSuccess(stat) {
			qualityAdjustment, qualityReasons := fabricServiceChannelRouteQualityScore(rollingLatencyMs, rollingFailureCount, rollingStallCount)
			reasons := append([]string{"service_channel_recent_success"}, qualityReasons...)
			if rollingWindowActive {
				reasons = append(reasons, "service_channel_rolling_quality_window")
			}
			out = append(out, RecordFabricServiceChannelRouteFeedbackInput{
				ClusterID:           heartbeat.ClusterID,
				ReporterNodeID:      heartbeat.NodeID,
				RouteID:             successRouteID,
				ServiceClass:        serviceClass,
				FeedbackStatus:      "healthy",
				ScoreAdjustment:     10 + qualityAdjustment,
				Reasons:             dedupeStrings(reasons),
				ConsecutiveFailures: rollingFailureCount,
				StallCount:          rollingStallCount,
				LastSendDurationMs:  rollingLatencyMs,
				Payload:             fabricServiceChannelFeedbackPayload(stat, metadata.Report.ConfigVersion),
				ObservedAt:          observedAt,
				ExpiresAt:           expiresAt,
			})
		}
	}
	return out
}
// fabricServiceChannelFeedbackPayload serialises a channel stat into a JSON
// object for storage alongside feedback. A non-blank configVersion is added,
// trimmed, under "observed_config_version". If the final encoding fails, an
// empty JSON object is returned.
func fabricServiceChannelFeedbackPayload(stat fabricServiceChannelRuntimeChannelStat, configVersion string) json.RawMessage {
	fields := map[string]any{}
	if encodedStat, marshalErr := json.Marshal(stat); marshalErr == nil {
		// Best effort: a decode failure leaves the map with whatever it holds.
		_ = json.Unmarshal(encodedStat, &fields)
	}
	if version := strings.TrimSpace(configVersion); version != "" {
		fields["observed_config_version"] = version
	}
	encoded, marshalErr := json.Marshal(fields)
	if marshalErr != nil {
		return json.RawMessage(`{}`)
	}
	return encoded
}
// fabricServiceChannelStatHasFreshSuccess reports whether a channel stat may be
// treated as a recent success. With rolling-window samples it requires at least
// one success and no failures or drops; without samples it requires that
// neither a route rebuild nor a degraded fallback is recommended.
func fabricServiceChannelStatHasFreshSuccess(stat fabricServiceChannelRuntimeChannelStat) bool {
	if stat.QualityWindowSampleCount > 0 {
		hasSuccess := stat.QualityWindowSuccessCount > 0
		clean := stat.QualityWindowFailureCount == 0 && stat.QualityWindowDropCount == 0
		return hasSuccess && clean
	}
	return !(stat.RouteRebuildRecommended || stat.DegradedFallbackRecommended)
}
// fabricServiceChannelFlowSchedulerFromHeartbeat extracts the
// fabric_service_channel_runtime_report → ingress → flow_scheduler object from
// a heartbeat's metadata. Empty or invalid metadata yields an empty map.
func fabricServiceChannelFlowSchedulerFromHeartbeat(heartbeat NodeHeartbeat) map[string]any {
	raw := heartbeat.Metadata
	if usable := len(raw) > 0 && json.Valid(raw); !usable {
		return map[string]any{}
	}
	return jsonMapPath(jsonObject(raw), "fabric_service_channel_runtime_report", "ingress", "flow_scheduler")
}
// fabricServiceChannelRollingFailureCount returns the failure figure for a
// stat: window failures plus window drops when the rolling window has samples,
// otherwise the consecutive-failure counter.
func fabricServiceChannelRollingFailureCount(stat fabricServiceChannelRuntimeChannelStat) int {
	if stat.QualityWindowSampleCount > 0 {
		return stat.QualityWindowFailureCount + stat.QualityWindowDropCount
	}
	return stat.ConsecutiveFailures
}
// fabricServiceChannelRollingStallCount returns the stall figure for a stat:
// the window's slow count when the rolling window has samples, otherwise the
// plain stall counter.
func fabricServiceChannelRollingStallCount(stat fabricServiceChannelRuntimeChannelStat) int {
	if stat.QualityWindowSampleCount > 0 {
		return stat.QualityWindowSlowCount
	}
	return stat.StallCount
}
// fabricServiceChannelRollingLatencyMs returns the latency figure for a stat:
// the window average when the window has samples and the average is positive,
// otherwise the duration of the last send.
func fabricServiceChannelRollingLatencyMs(stat fabricServiceChannelRuntimeChannelStat) int64 {
	windowed := stat.QualityWindowSampleCount > 0 && stat.QualityWindowAvgLatencyMs > 0
	if !windowed {
		return stat.LastSendDurationMillis
	}
	return stat.QualityWindowAvgLatencyMs
}
// fabricServiceChannelRouteQualityScore rates a route from its latency and its
// recent failure and stall counts, returning the score and de-duplicated reason
// tags.
//
// A non-positive latency contributes nothing. Positive latency falls into the
// first matching band, from +80 at ≤10ms down to -30 at ≤1000ms; anything
// slower scores -60. Failures cost 20 points each (capped at 100) and stalls 5
// points each (capped at 50).
func fabricServiceChannelRouteQualityScore(lastSendDurationMs int64, consecutiveFailures int, stallCount int) (int, []string) {
	latencyBands := []struct {
		upperMs int64
		delta   int
		reason  string
	}{
		{10, 80, "service_channel_quality_latency_le_10ms"},
		{25, 60, "service_channel_quality_latency_le_25ms"},
		{50, 40, "service_channel_quality_latency_le_50ms"},
		{100, 20, "service_channel_quality_latency_le_100ms"},
		{250, 5, "service_channel_quality_latency_le_250ms"},
		{500, -10, "service_channel_quality_latency_slow"},
		{1000, -30, "service_channel_quality_latency_very_slow"},
	}

	total := 0
	reasons := []string{}
	if lastSendDurationMs > 0 {
		// Start from the slowest bucket and upgrade to the first band that fits.
		delta, reason := -60, "service_channel_quality_latency_unhealthy"
		for _, band := range latencyBands {
			if lastSendDurationMs <= band.upperMs {
				delta, reason = band.delta, band.reason
				break
			}
		}
		total += delta
		reasons = append(reasons, reason)
	}
	if consecutiveFailures > 0 {
		failurePenalty := consecutiveFailures * 20
		if failurePenalty > 100 {
			failurePenalty = 100
		}
		total -= failurePenalty
		reasons = append(reasons, "service_channel_quality_recent_failures")
	}
	if stallCount > 0 {
		stallPenalty := stallCount * 5
		if stallPenalty > 50 {
			stallPenalty = 50
		}
		total -= stallPenalty
		reasons = append(reasons, "service_channel_quality_recent_stalls")
	}
	return total, dedupeStrings(reasons)
}
// fabricServiceChannelRetryCooldownUntil reads "operator_retry_cooldown_until"
// from a JSON object payload and parses it as an RFC 3339 timestamp (with
// optional fractional seconds). The result is converted to UTC. It returns nil
// when the payload is empty, invalid or not an object, or when the value is
// missing, not a string, blank, or not parseable.
func fabricServiceChannelRetryCooldownUntil(payload json.RawMessage) *time.Time {
	if len(payload) == 0 || !json.Valid(payload) {
		return nil
	}
	var fields map[string]any
	if json.Unmarshal(payload, &fields) != nil {
		return nil
	}
	text, isString := fields["operator_retry_cooldown_until"].(string)
	text = strings.TrimSpace(text)
	if !isString || text == "" {
		return nil
	}
	cooldown, parseErr := time.Parse(time.RFC3339Nano, text)
	if parseErr != nil {
		return nil
	}
	cooldown = cooldown.UTC()
	return &cooldown
}
// fabricServiceChannelFeedbackPayloadBool reads a boolean from a JSON object
// payload. It returns true only when the key holds the JSON literal true;
// missing keys, other value types and unusable payloads all yield false.
func fabricServiceChannelFeedbackPayloadBool(payload json.RawMessage, key string) bool {
	if len(payload) == 0 || !json.Valid(payload) {
		return false
	}
	var fields map[string]any
	if decodeErr := json.Unmarshal(payload, &fields); decodeErr != nil {
		return false
	}
	if flag, isBool := fields[key].(bool); isBool {
		return flag
	}
	return false
}
// fabricServiceChannelFeedbackPayloadInt reads an integer from a JSON object
// payload. It returns 0 when the payload is empty, invalid or not an object,
// or when the key is missing or does not hold a JSON number.
//
// Numbers are decoded as json.Number so integer literals keep their exact
// value instead of passing through float64, which loses precision above 2^53.
// Literals that are not plain integers (fractions, exponent notation, values
// outside int64) fall back to float64 and are truncated toward zero.
func fabricServiceChannelFeedbackPayloadInt(payload json.RawMessage, key string) int {
	if len(payload) == 0 || !json.Valid(payload) {
		return 0
	}
	decoder := json.NewDecoder(strings.NewReader(string(payload)))
	decoder.UseNumber()
	var raw map[string]any
	if err := decoder.Decode(&raw); err != nil {
		return 0
	}
	number, ok := raw[key].(json.Number)
	if !ok {
		return 0
	}
	if parsed, err := number.Int64(); err == nil {
		return int(parsed)
	}
	approx, err := number.Float64()
	if err != nil {
		return 0
	}
	return int(approx)
}
// fabricServiceChannelFeedbackPayloadString returns the first non-blank string
// found under any of keys in a JSON object payload, trimmed of surrounding
// whitespace. Keys are tried in the order given, first at the top level and
// then, if none matched, inside a nested "recovery_policy" object. An unusable
// payload or no match yields "".
func fabricServiceChannelFeedbackPayloadString(payload json.RawMessage, keys ...string) string {
	if len(payload) == 0 || !json.Valid(payload) {
		return ""
	}
	var fields map[string]any
	if decodeErr := json.Unmarshal(payload, &fields); decodeErr != nil {
		return ""
	}
	// firstMatch scans one object for the first key holding a non-blank string.
	firstMatch := func(object map[string]any) string {
		for _, key := range keys {
			text, isString := object[key].(string)
			if !isString {
				continue
			}
			if trimmed := strings.TrimSpace(text); trimmed != "" {
				return trimmed
			}
		}
		return ""
	}
	if found := firstMatch(fields); found != "" {
		return found
	}
	nested, isObject := fields["recovery_policy"].(map[string]any)
	if !isObject {
		return ""
	}
	return firstMatch(nested)
}
// fabricServiceChannelAnnotateFeedbackProvenance compares the policy
// fingerprint and route generation recorded in an observation's payload with
// the currently effective ones and returns the observation with the result
// filled in.
//
// The effective fingerprint comes from the normalised policy, the effective
// generation from routeProvenance (empty when the route is unknown). Provenance
// is missing when the payload has no fingerprint, or has no generation while an
// effective one exists. An observation is stale when both the observed and the
// effective value are present and differ. A stale or missing-provenance reason
// is also appended to the observation's reasons.
func fabricServiceChannelAnnotateFeedbackProvenance(observation FabricServiceChannelRouteFeedbackObservation, policy FabricServiceChannelRecoveryPolicy, routeProvenance map[string]fabricServiceChannelRouteProvenance) FabricServiceChannelRouteFeedbackObservation {
	policy = normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy())

	observedFingerprint := fabricServiceChannelFeedbackPayloadString(observation.Payload, "recovery_policy_fingerprint", "policy_fingerprint", "fingerprint")
	observedGeneration := fabricServiceChannelFeedbackPayloadString(observation.Payload, "route_generation", "route_policy_version", "policy_version")
	effectiveGeneration := routeProvenance[observation.RouteID].RouteGeneration

	observation.EffectivePolicyFingerprint = policy.Fingerprint
	observation.ObservedPolicyFingerprint = observedFingerprint
	observation.EffectiveRouteGeneration = effectiveGeneration
	observation.ObservedRouteGeneration = observedGeneration

	observation.ProvenanceMissing = observedFingerprint == "" || (observedGeneration == "" && effectiveGeneration != "")

	// The stale flags are only ever raised here, never cleared.
	if observedFingerprint != "" && policy.Fingerprint != "" && observedFingerprint != policy.Fingerprint {
		observation.StalePolicy = true
	}
	if observedGeneration != "" && effectiveGeneration != "" && observedGeneration != effectiveGeneration {
		observation.StaleGeneration = true
	}

	if observation.StalePolicy && observation.StaleGeneration {
		observation.StaleReason = "service_channel_feedback_stale_policy_and_generation"
	} else if observation.StalePolicy {
		observation.StaleReason = "service_channel_feedback_stale_policy"
	} else if observation.StaleGeneration {
		observation.StaleReason = "service_channel_feedback_stale_generation"
	} else if observation.ProvenanceMissing {
		observation.StaleReason = "service_channel_feedback_provenance_missing"
	}

	if observation.StaleReason == "" {
		return observation
	}
	observation.Reasons = dedupeStrings(append(observation.Reasons, observation.StaleReason))
	return observation
}
// fabricServiceChannelConservativeStaleScore clamps a score adjustment into the
// range [-10, 0], so stale feedback can neither reward a route nor penalise it
// heavily.
func fabricServiceChannelConservativeStaleScore(score int) int {
	switch {
	case score > 0:
		return 0
	case score < -10:
		return -10
	default:
		return score
	}
}
// fabricServiceChannelFeedbackSuppressedByOperatorCooldown rewrites a feedback
// input so that it is recorded as suppressed by an operator retry cooldown.
//
// The returned input has status "operator_retry_cooldown", a score adjustment
// of 0, three extra cooldown reasons, and an expiry equal to cooldownUntil in
// UTC. The original status and score, the cooldown end and the suppression
// time are kept in the payload under the "operator_*" keys. The existing
// payload's object members are preserved; a payload that is not a JSON object
// is replaced by the markers alone.
func fabricServiceChannelFeedbackSuppressedByOperatorCooldown(input RecordFabricServiceChannelRouteFeedbackInput, cooldownUntil, observedAt time.Time) RecordFabricServiceChannelRouteFeedbackInput {
	originalStatus := input.FeedbackStatus
	originalScore := input.ScoreAdjustment
	payload := map[string]any{}
	if len(input.Payload) > 0 && json.Valid(input.Payload) {
		// Best effort: a non-object payload fails to decode and is ignored.
		_ = json.Unmarshal(input.Payload, &payload)
	}
	if payload == nil {
		// A valid JSON null decodes by setting the map to nil; writing to a nil
		// map would panic, so start again from an empty object.
		payload = map[string]any{}
	}
	payload["operator_feedback_suppressed"] = true
	payload["operator_suppressed_feedback_status"] = originalStatus
	payload["operator_suppressed_score_adjustment"] = originalScore
	payload["operator_retry_cooldown_until"] = cooldownUntil.UTC().Format(time.RFC3339Nano)
	payload["operator_suppressed_at"] = observedAt.UTC().Format(time.RFC3339Nano)
	raw, err := json.Marshal(payload)
	if err != nil {
		raw = []byte(`{}`)
	}
	input.FeedbackStatus = "operator_retry_cooldown"
	input.ScoreAdjustment = 0
	input.Reasons = dedupeStrings(append(input.Reasons, "operator_expired_feedback_retry", "manual_feedback_expired_retry_cooldown", "service_channel_feedback_suppressed_by_operator_expire"))
	input.Payload = raw
	input.ExpiresAt = cooldownUntil.UTC()
	return input
}
// fabricServiceChannelRouteFeedbackFromObservations aggregates stored
// observations into per-route feedback using the default recovery policy and
// no route provenance.
func fabricServiceChannelRouteFeedbackFromObservations(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time) map[string]fabricServiceChannelRouteFeedback {
	defaultPolicy := defaultFabricServiceChannelRecoveryPolicy()
	return fabricServiceChannelRouteFeedbackFromObservationsWithProvenance(observations, now, defaultPolicy, nil)
}
// fabricServiceChannelRouteFeedbackFromObservationsWithProvenance folds stored
// observations into one feedback entry per route ID.
//
// Observations with a blank route ID, or with a non-zero expiry that is not
// after now, are skipped. For the rest:
//   - stale and missing-provenance flags are OR-ed together;
//   - scores are summed after age decay, and clamped first when the
//     observation is stale;
//   - fenced, rebuild and degraded-fallback signals count only when the
//     observation is not stale;
//   - failure, stall and quality-window counters keep their maximum, and
//     latency keeps its smallest positive value;
//   - identifying fields (observation ID, source, channel, resource, violation
//     details, expiry) come from the most recently observed entry;
//   - the latest retry cooldown is kept, and a cooldown still in the future
//     marks the route as a manual retry.
//
// Reasons are de-duplicated once all observations have been folded in.
func fabricServiceChannelRouteFeedbackFromObservationsWithProvenance(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time, policy FabricServiceChannelRecoveryPolicy, routeProvenance map[string]fabricServiceChannelRouteProvenance) map[string]fabricServiceChannelRouteFeedback {
	nowUTC := now.UTC()
	// raise lifts *target to candidate when candidate is larger.
	raise := func(target *int, candidate int) {
		if candidate > *target {
			*target = candidate
		}
	}

	aggregated := map[string]fabricServiceChannelRouteFeedback{}
	for _, observation := range observations {
		observation = fabricServiceChannelAnnotateFeedbackProvenance(observation, policy, routeProvenance)
		if strings.TrimSpace(observation.RouteID) == "" {
			continue
		}
		if expiry := observation.ExpiresAt; !expiry.IsZero() && !expiry.After(nowUTC) {
			continue
		}

		routeID := observation.RouteID
		item := aggregated[routeID]
		item.RouteID = routeID

		stale := observation.StalePolicy || observation.StaleGeneration
		if observation.StalePolicy {
			item.StalePolicy = true
		}
		if observation.StaleGeneration {
			item.StaleGeneration = true
		}
		if observation.ProvenanceMissing {
			item.ProvenanceMissing = true
		}
		if observation.StaleReason != "" {
			item.StaleReason = observation.StaleReason
		}
		if !stale && observation.FeedbackStatus == "fenced" {
			item.Fenced = true
		}

		cooldown := observation.RetryCooldownUntil
		if cooldown != nil && cooldown.After(nowUTC) {
			item.ManualRetry = true
		}

		score, decayReasons := fabricServiceChannelFeedbackScoreWithAgeDecay(observation, now)
		if stale {
			score = fabricServiceChannelConservativeStaleScore(score)
		}
		item.ScoreAdjustment += score
		item.Reasons = append(item.Reasons, observation.Reasons...)
		item.Reasons = append(item.Reasons, decayReasons...)

		if latency := observation.LastSendDurationMs; latency > 0 {
			if item.LastSendDurationMs == 0 || latency < item.LastSendDurationMs {
				item.LastSendDurationMs = latency
			}
		}
		raise(&item.ConsecutiveFailures, observation.ConsecutiveFailures)
		raise(&item.StallCount, observation.StallCount)

		if !stale {
			if containsString(observation.Reasons, "service_channel_degraded_fallback_recommended") ||
				fabricServiceChannelFeedbackPayloadBool(observation.Payload, "degraded_fallback_recommended") {
				item.DegradedFallbackRecommended = true
			}
			if containsString(observation.Reasons, "service_channel_route_rebuild_recommended") ||
				fabricServiceChannelFeedbackPayloadBool(observation.Payload, "route_rebuild_recommended") {
				item.RouteRebuildRecommended = true
			}
		}

		qualityCounters := []struct {
			key    string
			target *int
		}{
			{"quality_window_sample_count", &item.QualityWindowSampleCount},
			{"quality_window_success_count", &item.QualityWindowSuccessCount},
			{"quality_window_failure_count", &item.QualityWindowFailureCount},
			{"quality_window_slow_count", &item.QualityWindowSlowCount},
			{"quality_window_drop_count", &item.QualityWindowDropCount},
		}
		for _, counter := range qualityCounters {
			raise(counter.target, fabricServiceChannelFeedbackPayloadInt(observation.Payload, counter.key))
		}

		if observation.LastError != "" {
			item.LastError = observation.LastError
		}
		if observation.ObservedAt.After(item.ObservedAt) {
			// The newest observation supplies the identifying details.
			payloadObject := jsonObject(observation.Payload)
			item.ObservedAt = observation.ObservedAt
			item.ExpiresAt = observation.ExpiresAt
			item.ObservationID = observation.ID
			item.Source = jsonString(payloadObject, "source")
			item.ChannelID = jsonString(payloadObject, "channel_id")
			item.ResourceID = jsonString(payloadObject, "resource_id")
			item.ViolationStatus = jsonString(payloadObject, "last_data_plane_violation_status")
			item.ViolationReason = jsonString(payloadObject, "last_data_plane_violation_reason")
		}
		if cooldown != nil && (item.RetryCooldownUntil == nil || cooldown.After(*item.RetryCooldownUntil)) {
			latest := cooldown.UTC()
			item.RetryCooldownUntil = &latest
		}
		aggregated[routeID] = item
	}

	for routeID, item := range aggregated {
		item.Reasons = dedupeStrings(item.Reasons)
		aggregated[routeID] = item
	}
	return aggregated
}
// fabricServiceChannelManualRetryFeedbackFromObservations collects manual-retry
// feedback from observations using the default recovery policy and no route
// provenance.
func fabricServiceChannelManualRetryFeedbackFromObservations(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time) map[string]fabricServiceChannelRouteFeedback {
	defaultPolicy := defaultFabricServiceChannelRecoveryPolicy()
	return fabricServiceChannelManualRetryFeedbackFromObservationsWithProvenance(observations, now, defaultPolicy, nil)
}
// fabricServiceChannelManualRetryFeedbackFromObservationsWithProvenance builds
// per-route manual-retry markers from observations whose retry cooldown is
// still in the future.
//
// Observations with a blank route ID, without a cooldown, with a cooldown that
// has already passed, or with status "healthy" are ignored. Each resulting
// entry is flagged ManualRetry, carries the two operator-retry reasons, leaves
// the score untouched, and keeps the latest cooldown, the newest observation
// time and the last non-empty error seen.
func fabricServiceChannelManualRetryFeedbackFromObservationsWithProvenance(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time, policy FabricServiceChannelRecoveryPolicy, routeProvenance map[string]fabricServiceChannelRouteProvenance) map[string]fabricServiceChannelRouteFeedback {
	nowUTC := now.UTC()
	retries := map[string]fabricServiceChannelRouteFeedback{}
	for _, observation := range observations {
		observation = fabricServiceChannelAnnotateFeedbackProvenance(observation, policy, routeProvenance)
		cooldownUntil := observation.RetryCooldownUntil
		switch {
		case strings.TrimSpace(observation.RouteID) == "":
			continue
		case cooldownUntil == nil || !cooldownUntil.After(nowUTC):
			continue
		case observation.FeedbackStatus == "healthy":
			continue
		}

		routeID := observation.RouteID
		item := retries[routeID]
		item.RouteID = routeID
		item.ManualRetry = true
		if observation.StalePolicy {
			item.StalePolicy = true
		}
		if observation.StaleGeneration {
			item.StaleGeneration = true
		}
		if observation.ProvenanceMissing {
			item.ProvenanceMissing = true
		}
		if observation.StaleReason != "" {
			item.StaleReason = observation.StaleReason
		}
		// The score is deliberately left as it is: a manual retry neither
		// rewards nor penalises the route.
		item.Reasons = append(item.Reasons, "operator_expired_feedback_retry", "manual_feedback_expired_retry_cooldown")
		if observation.LastError != "" {
			item.LastError = observation.LastError
		}
		if observation.ObservedAt.After(item.ObservedAt) {
			item.ObservedAt = observation.ObservedAt
		}
		latest := cooldownUntil.UTC()
		if item.RetryCooldownUntil == nil || latest.After(*item.RetryCooldownUntil) {
			item.RetryCooldownUntil = &latest
		}
		retries[routeID] = item
	}
	for routeID, item := range retries {
		item.Reasons = dedupeStrings(item.Reasons)
		retries[routeID] = item
	}
	return retries
}
// fabricServiceChannelFeedbackScoreWithAgeDecay returns an observation's score
// after linear age decay, plus a reason tag when decay changed it.
//
// Only positive scores of "healthy" observations with a known observation time
// decay; everything else is returned unchanged. The score shrinks linearly from
// its full value at ObservedAt to zero at the end of the observation's
// lifetime, which is ExpiresAt - ObservedAt when ExpiresAt is later than
// ObservedAt and fabricServiceChannelFeedbackMaxAge otherwise. The result is
// rounded up and never drops below 1 before the lifetime ends; at or past the
// end it is 0.
func fabricServiceChannelFeedbackScoreWithAgeDecay(observation FabricServiceChannelRouteFeedbackObservation, now time.Time) (int, []string) {
	original := observation.ScoreAdjustment
	decayApplies := original > 0 && observation.FeedbackStatus == "healthy" && !observation.ObservedAt.IsZero()
	if !decayApplies {
		return original, nil
	}

	observedAt, nowUTC := observation.ObservedAt.UTC(), now.UTC()
	if !nowUTC.After(observedAt) {
		return original, nil
	}

	lifetime := fabricServiceChannelFeedbackMaxAge
	if expiry := observation.ExpiresAt; !expiry.IsZero() && expiry.After(observedAt) {
		lifetime = expiry.Sub(observedAt)
	}
	if lifetime <= 0 {
		return 0, []string{"service_channel_feedback_age_decay_expired"}
	}

	elapsed := nowUTC.Sub(observedAt)
	switch {
	case elapsed <= 0:
		return original, nil
	case elapsed >= lifetime:
		return 0, []string{"service_channel_feedback_age_decay_expired"}
	}

	// Ceiling division keeps a still-valid positive score from rounding to 0.
	left := lifetime - elapsed
	decayed := int((int64(original)*int64(left) + int64(lifetime) - 1) / int64(lifetime))
	if decayed < 1 {
		decayed = 1
	}
	if decayed == original {
		return original, nil
	}
	return decayed, []string{"service_channel_feedback_age_decay"}
}
// mergeFabricServiceChannelRouteFeedback folds src into dst route by route,
// modifying dst in place.
//
// Boolean signals are OR-ed and scores are added. Failure, stall and
// quality-window counters keep their maximum, latency keeps its smallest
// positive value, and the observation time and retry cooldown keep their latest
// value. A non-empty stale reason or last error from src overrides the one in
// dst. Reasons are concatenated and de-duplicated.
func mergeFabricServiceChannelRouteFeedback(dst map[string]fabricServiceChannelRouteFeedback, src map[string]fabricServiceChannelRouteFeedback) {
	// raise lifts *target to candidate when candidate is larger.
	raise := func(target *int, candidate int) {
		if candidate > *target {
			*target = candidate
		}
	}
	for routeID, incoming := range src {
		merged := dst[routeID]
		merged.RouteID = routeID

		if incoming.Fenced {
			merged.Fenced = true
		}
		if incoming.ManualRetry {
			merged.ManualRetry = true
		}
		if incoming.StalePolicy {
			merged.StalePolicy = true
		}
		if incoming.StaleGeneration {
			merged.StaleGeneration = true
		}
		if incoming.ProvenanceMissing {
			merged.ProvenanceMissing = true
		}
		if incoming.DegradedFallbackRecommended {
			merged.DegradedFallbackRecommended = true
		}
		if incoming.RouteRebuildRecommended {
			merged.RouteRebuildRecommended = true
		}

		if incoming.StaleReason != "" {
			merged.StaleReason = incoming.StaleReason
		}
		if incoming.LastError != "" {
			merged.LastError = incoming.LastError
		}

		merged.ScoreAdjustment += incoming.ScoreAdjustment
		merged.Reasons = dedupeStrings(append(merged.Reasons, incoming.Reasons...))

		raise(&merged.ConsecutiveFailures, incoming.ConsecutiveFailures)
		raise(&merged.StallCount, incoming.StallCount)
		raise(&merged.QualityWindowSampleCount, incoming.QualityWindowSampleCount)
		raise(&merged.QualityWindowSuccessCount, incoming.QualityWindowSuccessCount)
		raise(&merged.QualityWindowFailureCount, incoming.QualityWindowFailureCount)
		raise(&merged.QualityWindowSlowCount, incoming.QualityWindowSlowCount)
		raise(&merged.QualityWindowDropCount, incoming.QualityWindowDropCount)

		if latency := incoming.LastSendDurationMs; latency > 0 {
			if merged.LastSendDurationMs == 0 || latency < merged.LastSendDurationMs {
				merged.LastSendDurationMs = latency
			}
		}
		if incoming.ObservedAt.After(merged.ObservedAt) {
			merged.ObservedAt = incoming.ObservedAt
		}
		if candidate := incoming.RetryCooldownUntil; candidate != nil {
			if merged.RetryCooldownUntil == nil || candidate.After(*merged.RetryCooldownUntil) {
				latest := candidate.UTC()
				merged.RetryCooldownUntil = &latest
			}
		}
		dst[routeID] = merged
	}
}
// serviceChannelRouteFeedbackReport builds a route feedback report from
// observations using the default recovery policy.
func serviceChannelRouteFeedbackReport(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time) *FabricServiceChannelRouteFeedbackReport {
	defaultPolicy := defaultFabricServiceChannelRecoveryPolicy()
	return serviceChannelRouteFeedbackReportWithPolicy(observations, now, defaultPolicy)
}
// serviceChannelRouteFeedbackReportWithPolicy builds a route feedback report
// from observations under the given recovery policy, without route provenance.
func serviceChannelRouteFeedbackReportWithPolicy(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time, policy FabricServiceChannelRecoveryPolicy) *FabricServiceChannelRouteFeedbackReport {
	var noProvenance map[string]fabricServiceChannelRouteProvenance
	return serviceChannelRouteFeedbackReportWithPolicyAndProvenance(observations, now, policy, noProvenance)
}
// serviceChannelRouteFeedbackReportWithPolicyAndProvenance annotates every
// observation with provenance, an age-decayed effective score and recovery
// state, then aggregates the per-status and per-recovery counters into a
// report stamped with now (UTC).
//
// The policy is normalized against the default recovery policy before use.
// Observations flagged StalePolicy or StaleGeneration have their effective
// score passed through fabricServiceChannelConservativeStaleScore.
func serviceChannelRouteFeedbackReportWithPolicyAndProvenance(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time, policy FabricServiceChannelRecoveryPolicy, routeProvenance map[string]fabricServiceChannelRouteProvenance) *FabricServiceChannelRouteFeedbackReport {
	policy = normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy())

	annotated := make([]FabricServiceChannelRouteFeedbackObservation, 0, len(observations))
	for i := range observations {
		item := fabricServiceChannelAnnotateFeedbackProvenance(observations[i], policy, routeProvenance)

		score, decayReasons := fabricServiceChannelFeedbackScoreWithAgeDecay(item, now)
		if item.StalePolicy || item.StaleGeneration {
			score = fabricServiceChannelConservativeStaleScore(score)
		}
		item.EffectiveScoreAdjustment = score
		item.Reasons = dedupeStrings(append(item.Reasons, decayReasons...))

		// Recovery fields are derived in a fixed order: the state first, then
		// promotion (which overrides the state), then demotion (which reads
		// the promotion flag), and finally hysteresis from the final state.
		item.RecoveryState = fabricServiceChannelFeedbackObservationRecoveryState(item, now)
		item.RecoveryPromoted = fabricServiceChannelFeedbackObservationRecoveryPromoted(item, now, policy)
		if item.RecoveryPromoted {
			item.RecoveryState = "healthy"
		}
		item.RecoveryDemoted, item.RecoveryReason = fabricServiceChannelFeedbackObservationRecoveryDemotion(item, now, policy)
		item.RecoveryHysteresisActive = item.RecoveryState == "recovered"
		if item.RecoveryHysteresisActive {
			item.RecoveryHysteresisPenalty = policy.HysteresisPenalty
		}

		annotated = append(annotated, item)
	}

	report := &FabricServiceChannelRouteFeedbackReport{
		SchemaVersion:         "rap.fabric_service_channel_route_feedback_report.v1",
		GeneratedAt:           now.UTC(),
		FeedbackMaxAgeSeconds: int(fabricServiceChannelFeedbackMaxAge.Seconds()),
		RecoveryPolicy:        fabricServiceChannelRecoveryPolicyRef(policy),
		ObservationCount:      len(observations),
		Observations:          annotated,
	}

	for i := range annotated {
		item := &annotated[i]

		status := strings.ToLower(strings.TrimSpace(item.FeedbackStatus))
		if status == "fenced" {
			report.FencedRouteCount++
		} else if status == "degraded" {
			report.DegradedRouteCount++
		} else if status == "healthy" {
			report.HealthyRouteCount++
		}

		if item.RecoveryState == "recovered" {
			report.RecoveredRouteCount++
		}
		if item.RecoveryHysteresisActive {
			report.RecoveryHysteresisCount++
		}
		if item.RecoveryPromoted {
			report.RecoveryPromotedCount++
		}
		if item.RecoveryDemoted {
			report.RecoveryDemotedCount++
		}
		if item.ProvenanceMissing {
			report.MissingProvenanceCount++
		}
		if item.StalePolicy {
			report.StalePolicyCount++
		}
		if item.StaleGeneration {
			report.StaleGenerationCount++
		}
	}
	return report
}
// fabricServiceChannelFeedbackObservationRecoveryState maps an observation's
// feedback status (trimmed, case-insensitive) to a recovery state string:
//
//   - "fenced" / "degraded": returned as-is.
//   - "healthy": "recovered" while the retry cooldown is still in the future
//     and the reasons include "service_channel_rolling_quality_window";
//     otherwise "healthy".
//   - anything else: "cooldown" while the retry cooldown is still in the
//     future, otherwise the empty string.
func fabricServiceChannelFeedbackObservationRecoveryState(observation FabricServiceChannelRouteFeedbackObservation, now time.Time) string {
	status := strings.ToLower(strings.TrimSpace(observation.FeedbackStatus))
	if status == "fenced" || status == "degraded" {
		return status
	}

	coolingDown := observation.RetryCooldownUntil != nil && observation.RetryCooldownUntil.After(now.UTC())
	if status != "healthy" {
		if coolingDown {
			return "cooldown"
		}
		return ""
	}

	if coolingDown && containsString(observation.Reasons, "service_channel_rolling_quality_window") {
		return "recovered"
	}
	return "healthy"
}
// fabricServiceChannelFeedbackObservationRecoveryPromoted reports whether an
// observation qualifies for recovery promotion. All of the following must
// hold: the retry cooldown is set and still in the future, the feedback status
// is "healthy", the reasons include "service_channel_rolling_quality_window",
// and the quality-window counters read from the payload pass
// fabricServiceChannelFeedbackCleanRollingSamples under the given policy.
func fabricServiceChannelFeedbackObservationRecoveryPromoted(observation FabricServiceChannelRouteFeedbackObservation, now time.Time, policy FabricServiceChannelRecoveryPolicy) bool {
	cooldown := observation.RetryCooldownUntil
	if cooldown == nil || !cooldown.After(now.UTC()) {
		return false
	}
	if strings.ToLower(strings.TrimSpace(observation.FeedbackStatus)) != "healthy" {
		return false
	}
	if !containsString(observation.Reasons, "service_channel_rolling_quality_window") {
		return false
	}

	// windowCount reads one rolling quality-window counter from the payload.
	windowCount := func(key string) int {
		return fabricServiceChannelFeedbackPayloadInt(observation.Payload, key)
	}
	return fabricServiceChannelFeedbackCleanRollingSamples(
		windowCount("quality_window_sample_count"),
		windowCount("quality_window_success_count"),
		windowCount("quality_window_failure_count"),
		windowCount("quality_window_slow_count"),
		windowCount("quality_window_drop_count"),
		policy,
	)
}
// fabricServiceChannelFeedbackObservationRecoveryDemotion decides whether an
// observation that is still inside its retry cooldown should be demoted, and
// returns the demotion reason code alongside the verdict.
//
// No demotion is reported when the cooldown is unset or already elapsed, or
// when the observation has been promoted. Otherwise the first matching rule
// wins, checked in this order: fenced (if enabled by policy), rebuild
// recommended (if enabled by policy), failure/drop threshold, slow threshold,
// degraded status.
func fabricServiceChannelFeedbackObservationRecoveryDemotion(observation FabricServiceChannelRouteFeedbackObservation, now time.Time, policy FabricServiceChannelRecoveryPolicy) (bool, string) {
	cooldown := observation.RetryCooldownUntil
	if cooldown == nil || !cooldown.After(now.UTC()) || observation.RecoveryPromoted {
		return false, ""
	}

	status := strings.ToLower(strings.TrimSpace(observation.FeedbackStatus))
	switch {
	case policy.DemotionFencedEnabled && status == "fenced":
		return true, "service_channel_recovery_demoted_fenced"
	case policy.DemotionRebuildEnabled &&
		(containsString(observation.Reasons, "service_channel_route_rebuild_recommended") ||
			fabricServiceChannelFeedbackPayloadBool(observation.Payload, "route_rebuild_recommended")):
		return true, "service_channel_recovery_demoted_rebuild"
	case fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_failure_count") >= policy.DemotionFailureThreshold,
		fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_drop_count") >= policy.DemotionDropThreshold:
		return true, "service_channel_recovery_demoted_failure"
	case fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_slow_count") >= policy.DemotionSlowThreshold:
		return true, "service_channel_recovery_demoted_slow"
	case status == "degraded":
		return true, "service_channel_recovery_demoted_degraded"
	default:
		return false, ""
	}
}
// fabricServiceChannelRoutesFromIntents converts every eligible route intent
// into a FabricServiceChannelRoute and returns them in preference order.
//
// Ordering (stable): "authorized" routes first, then higher PathScore, then
// fewer hops, then ascending RouteID. The result is always non-nil.
func fabricServiceChannelRoutesFromIntents(intents []MeshRouteIntent, serviceClass string, entryPool, exitPool, allowedChannels []string, generation string, now, defaultExpiresAt time.Time, feedback map[string]fabricServiceChannelRouteFeedback, policy FabricServiceChannelRecoveryPolicy) []FabricServiceChannelRoute {
	policy = normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy())

	routes := make([]FabricServiceChannelRoute, 0, len(intents))
	for i := range intents {
		if candidate, eligible := fabricServiceChannelRouteFromIntent(intents[i], serviceClass, entryPool, exitPool, allowedChannels, generation, now, defaultExpiresAt, feedback, policy); eligible {
			routes = append(routes, candidate)
		}
	}

	sort.SliceStable(routes, func(i, j int) bool {
		left, right := &routes[i], &routes[j]
		switch {
		case left.Status != right.Status:
			return left.Status == "authorized"
		case left.PathScore != right.PathScore:
			return left.PathScore > right.PathScore
		case len(left.Hops) != len(right.Hops):
			return len(left.Hops) < len(right.Hops)
		default:
			return left.RouteID < right.RouteID
		}
	})
	return routes
}
// fabricServiceChannelRouteFromIntent builds a FabricServiceChannelRoute from a
// single route intent. The boolean result is false (with a zero route) when the
// intent cannot serve the request:
//   - the intent is not "active" or targets a different service class;
//   - its policy JSON does not decode, or the policy has already expired;
//   - fewer than two hops can be determined;
//   - the source is not in entryPool or the destination is not in exitPool;
//   - the allowed channels do not intersect requestedChannels.
//
// The base path score is 100 - 5*len(hops) + intent.Priority, floored at 1.
// Feedback keyed by intent.ID then adjusts the score and the recovery fields.
func fabricServiceChannelRouteFromIntent(intent MeshRouteIntent, serviceClass string, entryPool, exitPool, requestedChannels []string, generation string, now, defaultExpiresAt time.Time, feedback map[string]fabricServiceChannelRouteFeedback, recoveryPolicy FabricServiceChannelRecoveryPolicy) (FabricServiceChannelRoute, bool) {
recoveryPolicy = normalizeFabricServiceChannelRecoveryPolicy(recoveryPolicy, defaultFabricServiceChannelRecoveryPolicy())
if intent.Status != "active" || strings.TrimSpace(intent.ServiceClass) != serviceClass {
return FabricServiceChannelRoute{}, false
}
var policy syntheticRoutePolicy
if err := json.Unmarshal(intent.Policy, &policy); err != nil {
return FabricServiceChannelRoute{}, false
}
if policy.ExpiresAt != nil && !policy.ExpiresAt.After(now.UTC()) {
return FabricServiceChannelRoute{}, false
}
var source nodeSelector
var destination nodeSelector
// Selector decode errors are deliberately ignored: an undecodable selector
// leaves the node ID empty and the hop list below supplies the endpoints.
_ = json.Unmarshal(intent.SourceSelector, &source)
_ = json.Unmarshal(intent.DestinationSelector, &destination)
sourceNodeID := firstNodeID(source)
destinationNodeID := firstNodeID(destination)
// Copy the policy hops so the returned route does not alias the decoded policy.
hops := append([]string{}, policy.Hops...)
if len(hops) == 0 && sourceNodeID != "" && destinationNodeID != "" {
hops = []string{sourceNodeID, destinationNodeID}
}
if len(hops) < 2 {
return FabricServiceChannelRoute{}, false
}
// Fall back to the first/last hop when a selector yielded no node ID.
if sourceNodeID == "" {
sourceNodeID = hops[0]
}
if destinationNodeID == "" {
destinationNodeID = hops[len(hops)-1]
}
if !containsString(entryPool, sourceNodeID) || !containsString(exitPool, destinationNodeID) {
return FabricServiceChannelRoute{}, false
}
// A policy without an explicit channel list inherits the requested channels.
allowedChannels := policy.AllowedChannels
if len(allowedChannels) == 0 {
allowedChannels = requestedChannels
}
if !fabricChannelsIntersect(allowedChannels, requestedChannels) {
return FabricServiceChannelRoute{}, false
}
expiresAt := defaultExpiresAt
if policy.ExpiresAt != nil {
expiresAt = policy.ExpiresAt.UTC()
}
// Version fallbacks: route version defaults to the intent's UpdatedAt
// (RFC 3339, UTC); policy version defaults to the route version.
routeVersion := policy.RouteVersion
if routeVersion == "" {
routeVersion = intent.UpdatedAt.UTC().Format(time.RFC3339)
}
policyVersion := policy.PolicyVersion
if policyVersion == "" {
policyVersion = routeVersion
}
score := 100 - len(hops)*5 + intent.Priority
if score < 1 {
score = 1
}
status := "authorized"
recoveryState := ""
recoveryPenalty := 0
recoveryPromoted := false
recoveryDemoted := false
recoveryReason := ""
scoreReasons := []string{"active_route_intent", "entry_exit_pool_match"}
if item, ok := feedback[intent.ID]; ok {
score += item.ScoreAdjustment
scoreReasons = append(scoreReasons, item.Reasons...)
// Stale feedback is surfaced through the recovery reason and score reasons.
if item.StalePolicy || item.StaleGeneration {
recoveryReason = item.StaleReason
if recoveryReason == "" {
recoveryReason = "service_channel_feedback_stale"
}
scoreReasons = append(scoreReasons, "service_channel_feedback_stale", recoveryReason)
}
if fabricServiceChannelFeedbackRecoveryDemoted(item, recoveryPolicy) {
recoveryDemoted = true
recoveryReason = fabricServiceChannelFeedbackRecoveryDemotionReason(item, recoveryPolicy)
scoreReasons = append(scoreReasons, "service_channel_recovery_demoted", recoveryReason)
}
// A fenced route keeps its entry in the result but with score 0 and a
// non-authorized status; any other route is floored at score 1.
if item.Fenced {
status = "fenced_by_service_channel_feedback"
recoveryState = "fenced"
score = 0
} else if score < 1 {
score = 1
}
// Only authorized routes get a recovery state here: promoted routes are
// "healthy", routes under hysteresis are "recovered" and pay the policy
// penalty (score re-floored at 1), and positively adjusted routes are
// "healthy".
if status == "authorized" && fabricServiceChannelFeedbackRecoveryPromoted(item, recoveryPolicy) {
recoveryState = "healthy"
recoveryPromoted = true
scoreReasons = append(scoreReasons, "service_channel_recovery_promoted")
} else if status == "authorized" && fabricServiceChannelFeedbackRecoveryHysteresisActive(item, recoveryPolicy) {
recoveryState = "recovered"
recoveryPenalty = recoveryPolicy.HysteresisPenalty
score -= recoveryPenalty
if score < 1 {
score = 1
}
scoreReasons = append(scoreReasons, "service_channel_recovery_hysteresis")
} else if status == "authorized" && item.ScoreAdjustment > 0 {
recoveryState = "healthy"
}
}
return FabricServiceChannelRoute{
RouteID: intent.ID,
ClusterID: intent.ClusterID,
ServiceClass: serviceClass,
SourceNodeID: sourceNodeID,
DestinationNodeID: destinationNodeID,
Hops: hops,
AllowedChannels: allowedChannels,
RouteVersion: routeVersion,
PolicyVersion: policyVersion,
Generation: generation,
Status: status,
RecoveryState: recoveryState,
RecoveryPenalty: recoveryPenalty,
RecoveryPromoted: recoveryPromoted,
RecoveryDemoted: recoveryDemoted,
RecoveryReason: recoveryReason,
RecoveryPolicy: fabricServiceChannelRecoveryPolicyRef(recoveryPolicy),
PathScore: score,
ScoreReasons: dedupeStrings(scoreReasons),
ExpiresAt: expiresAt,
}, true
}
// Recovery tuning constants for service-channel route feedback.
// NOTE(review): presumably the defaults behind
// FabricServiceChannelRecoveryPolicy.HysteresisPenalty and
// .PromotionMinSamples — confirm against defaultFabricServiceChannelRecoveryPolicy.
const (
	fabricServiceChannelRecoveryHysteresisPenalty   = 150
	fabricServiceChannelRecoveryPromotionMinSamples = 64
)
// fabricServiceChannelFeedbackRecoveryHysteresisActive reports whether route
// feedback is in the recovery hysteresis window: a manual-retry item that is
// not fenced, has a positive score adjustment and the
// "service_channel_rolling_quality_window" reason, but has not yet met the
// promotion criteria. Stale feedback never activates hysteresis.
func fabricServiceChannelFeedbackRecoveryHysteresisActive(item fabricServiceChannelRouteFeedback, policy FabricServiceChannelRecoveryPolicy) bool {
	if item.StalePolicy || item.StaleGeneration {
		return false
	}
	if !item.ManualRetry || item.Fenced || item.ScoreAdjustment <= 0 {
		return false
	}
	if !containsString(item.Reasons, "service_channel_rolling_quality_window") {
		return false
	}
	return !fabricServiceChannelFeedbackRecoveryPromoted(item, policy)
}
// fabricServiceChannelFeedbackRecoveryPromoted reports whether route feedback
// qualifies for recovery promotion: a non-stale, non-fenced manual-retry item
// with a positive score adjustment, the
// "service_channel_rolling_quality_window" reason, and rolling-window counters
// that pass fabricServiceChannelFeedbackCleanRollingSamples under policy.
func fabricServiceChannelFeedbackRecoveryPromoted(item fabricServiceChannelRouteFeedback, policy FabricServiceChannelRecoveryPolicy) bool {
	if item.StalePolicy || item.StaleGeneration {
		return false
	}
	if !item.ManualRetry || item.Fenced || item.ScoreAdjustment <= 0 {
		return false
	}
	if !containsString(item.Reasons, "service_channel_rolling_quality_window") {
		return false
	}
	return fabricServiceChannelFeedbackCleanRollingSamples(
		item.QualityWindowSampleCount,
		item.QualityWindowSuccessCount,
		item.QualityWindowFailureCount,
		item.QualityWindowSlowCount,
		item.QualityWindowDropCount,
		policy,
	)
}
// fabricServiceChannelFeedbackRecoveryDemoted reports whether a manual-retry
// feedback item should be demoted. Stale or promoted items are never demoted.
// Otherwise any one of these triggers demotion: fenced (when enabled by
// policy), rebuild recommended (when enabled by policy), degraded fallback
// recommended, a failure/drop/slow counter at or above its policy threshold,
// or a negative score adjustment.
func fabricServiceChannelFeedbackRecoveryDemoted(item fabricServiceChannelRouteFeedback, policy FabricServiceChannelRecoveryPolicy) bool {
	if item.StalePolicy || item.StaleGeneration {
		return false
	}
	if !item.ManualRetry || fabricServiceChannelFeedbackRecoveryPromoted(item, policy) {
		return false
	}
	switch {
	case policy.DemotionFencedEnabled && item.Fenced,
		policy.DemotionRebuildEnabled && item.RouteRebuildRecommended,
		item.DegradedFallbackRecommended,
		item.QualityWindowFailureCount >= policy.DemotionFailureThreshold,
		item.QualityWindowDropCount >= policy.DemotionDropThreshold,
		item.QualityWindowSlowCount >= policy.DemotionSlowThreshold,
		item.ScoreAdjustment < 0:
		return true
	default:
		return false
	}
}
// fabricServiceChannelFeedbackRecoveryDemotionReason returns the reason code
// for a demoted feedback item. The first matching rule wins, in this order:
// fenced, rebuild, failure/drop threshold, slow threshold, degraded fallback,
// negative score adjustment. When nothing matches, the generic
// "service_channel_recovery_demoted" code is returned.
func fabricServiceChannelFeedbackRecoveryDemotionReason(item fabricServiceChannelRouteFeedback, policy FabricServiceChannelRecoveryPolicy) string {
	switch {
	case policy.DemotionFencedEnabled && item.Fenced:
		return "service_channel_recovery_demoted_fenced"
	case policy.DemotionRebuildEnabled && item.RouteRebuildRecommended:
		return "service_channel_recovery_demoted_rebuild"
	case item.QualityWindowFailureCount >= policy.DemotionFailureThreshold,
		item.QualityWindowDropCount >= policy.DemotionDropThreshold:
		return "service_channel_recovery_demoted_failure"
	case item.QualityWindowSlowCount >= policy.DemotionSlowThreshold:
		return "service_channel_recovery_demoted_slow"
	case item.DegradedFallbackRecommended:
		return "service_channel_recovery_demoted_degraded_fallback"
	case item.ScoreAdjustment < 0:
		return "service_channel_recovery_demoted_degraded"
	default:
		return "service_channel_recovery_demoted"
	}
}
// fabricServiceChannelFeedbackCleanRollingSamples reports whether a rolling
// quality window is clean enough for promotion: no failures, slow samples or
// drops, and both the sample count and the success count reach
// policy.PromotionMinSamples.
func fabricServiceChannelFeedbackCleanRollingSamples(sampleCount, successCount, failureCount, slowCount, dropCount int, policy FabricServiceChannelRecoveryPolicy) bool {
	if failureCount != 0 || slowCount != 0 || dropCount != 0 {
		return false
	}
	required := policy.PromotionMinSamples
	return sampleCount >= required && successCount >= required
}
// fabricChannelsIntersect reports whether at least one element of a is also
// present in b, using containsString for the membership check. It returns
// false when a is empty.
func fabricChannelsIntersect(a, b []string) bool {
	for i := range a {
		if containsString(b, a[i]) {
			return true
		}
	}
	return false
}
// selectFabricServicePrimaryRoute picks the primary route and its alternates
// from routes, considering only routes whose Status is "authorized".
//
// The first authorized route matching the selected entry/exit pair becomes the
// primary, with every other authorized route (by differing RouteID) as an
// alternate. If no route matches the pair, the first authorized route with a
// non-empty RouteID becomes the primary and later authorized routes become
// alternates. An empty input yields a zero route and nil alternates; otherwise
// alternates is non-nil, possibly empty.
func selectFabricServicePrimaryRoute(routes []FabricServiceChannelRoute, selectedEntry, selectedExit string) (FabricServiceChannelRoute, []FabricServiceChannelRoute) {
	if len(routes) == 0 {
		return FabricServiceChannelRoute{}, nil
	}
	const authorized = "authorized"
	alternates := make([]FabricServiceChannelRoute, 0, len(routes)-1)

	// Preferred outcome: an authorized route for exactly the selected pair.
	for i := range routes {
		candidate := routes[i]
		if candidate.Status != authorized ||
			candidate.SourceNodeID != selectedEntry ||
			candidate.DestinationNodeID != selectedExit {
			continue
		}
		for j := range routes {
			if routes[j].Status == authorized && routes[j].RouteID != candidate.RouteID {
				alternates = append(alternates, routes[j])
			}
		}
		return candidate, alternates
	}

	// Fallback: take authorized routes in their given order.
	var primary FabricServiceChannelRoute
	for i := range routes {
		if routes[i].Status != authorized {
			continue
		}
		if primary.RouteID != "" {
			alternates = append(alternates, routes[i])
			continue
		}
		primary = routes[i]
	}
	return primary, alternates
}
// fabricServiceChannelRouteIntentReplacementScope holds the metadata-derived
// keys (entry pool, exit pool, service resource) used to decide whether two
// routes belong to the same replacement scope. An empty key means the route
// intent carried no usable value for it.
type fabricServiceChannelRouteIntentReplacementScope struct {
	EntryPoolKey, ExitPoolKey, ResourceKey string
}
// fabricServiceChannelRouteIntentMetadataKey returns "<key>:<value>" for the
// first key in keys whose policy metadata value is a non-blank string (or a
// fmt.Stringer with a non-blank String()). The value is whitespace-trimmed.
// It returns "" when the policy is empty, is not valid JSON, fails to decode,
// or yields no usable value for any key.
func fabricServiceChannelRouteIntentMetadataKey(intent MeshRouteIntent, keys []string) string {
	if len(intent.Policy) == 0 || !json.Valid(intent.Policy) {
		return ""
	}
	var policy syntheticRoutePolicy
	if json.Unmarshal(intent.Policy, &policy) != nil {
		return ""
	}
	for _, key := range keys {
		raw, present := policy.Metadata[key]
		if !present {
			continue
		}
		var text string
		switch typed := raw.(type) {
		case string:
			text = typed
		case fmt.Stringer:
			text = typed.String()
		default:
			// Other value kinds are skipped; try the next key.
			continue
		}
		if text = strings.TrimSpace(text); text != "" {
			return key + ":" + text
		}
	}
	return ""
}
// fabricServiceChannelRouteIntentReplacementScopes indexes the replacement
// scope of each route intent by its trimmed ID. Intents with a blank ID are
// skipped. Each scope key is resolved from the intent's policy metadata, trying
// the listed metadata key aliases in order.
func fabricServiceChannelRouteIntentReplacementScopes(intents []MeshRouteIntent) map[string]fabricServiceChannelRouteIntentReplacementScope {
	entryPoolKeys := []string{"entry_pool_id", "service_entry_pool_id", "fabric_entry_pool_id"}
	exitPoolKeys := []string{"exit_pool_id", "service_exit_pool_id", "fabric_exit_pool_id"}
	resourceKeys := []string{"service_resource_id", "resource_id", "fabric_service_resource_id"}

	scopes := map[string]fabricServiceChannelRouteIntentReplacementScope{}
	for _, intent := range intents {
		routeID := strings.TrimSpace(intent.ID)
		if routeID == "" {
			continue
		}
		var scope fabricServiceChannelRouteIntentReplacementScope
		scope.EntryPoolKey = fabricServiceChannelRouteIntentMetadataKey(intent, entryPoolKeys)
		scope.ExitPoolKey = fabricServiceChannelRouteIntentMetadataKey(intent, exitPoolKeys)
		scope.ResourceKey = fabricServiceChannelRouteIntentMetadataKey(intent, resourceKeys)
		scopes[routeID] = scope
	}
	return scopes
}
// fabricServiceChannelRoutesShareReplacementScope reports whether
// candidateRoute may stand in for fencedRoute.
//
// Rules, first match wins:
//   - same source and destination node: always in scope;
//   - same source only: in scope when the resource keys or exit pool keys match;
//   - same destination only: in scope when the resource keys or entry pool
//     keys match;
//   - neither endpoint shared: in scope only when the resource, entry pool and
//     exit pool keys all match.
//
// Scope keys are looked up in scopes by RouteID; a route without an entry gets
// the zero scope. Two keys match only when both are equal after trimming
// surrounding whitespace and the result is non-empty. Both sides are trimmed
// before comparison so a whitespace-padded key on the fenced route cannot
// silently defeat an otherwise valid match.
func fabricServiceChannelRoutesShareReplacementScope(fencedRoute, candidateRoute SyntheticMeshRouteConfig, scopes map[string]fabricServiceChannelRouteIntentReplacementScope) bool {
	sameSource := fencedRoute.SourceNodeID == candidateRoute.SourceNodeID
	sameDestination := fencedRoute.DestinationNodeID == candidateRoute.DestinationNodeID
	if sameSource && sameDestination {
		return true
	}

	fencedScope := scopes[fencedRoute.RouteID]
	candidateScope := scopes[candidateRoute.RouteID]

	// keysMatch compares two scope keys after normalizing both sides.
	keysMatch := func(fencedKey, candidateKey string) bool {
		fencedKey = strings.TrimSpace(fencedKey)
		return fencedKey != "" && fencedKey == strings.TrimSpace(candidateKey)
	}

	sameResource := keysMatch(fencedScope.ResourceKey, candidateScope.ResourceKey)
	switch {
	case sameSource:
		return sameResource || keysMatch(fencedScope.ExitPoolKey, candidateScope.ExitPoolKey)
	case sameDestination:
		return sameResource || keysMatch(fencedScope.EntryPoolKey, candidateScope.EntryPoolKey)
	default:
		return sameResource &&
			keysMatch(fencedScope.EntryPoolKey, candidateScope.EntryPoolKey) &&
			keysMatch(fencedScope.ExitPoolKey, candidateScope.ExitPoolKey)
	}
}
// fabricServiceRoutesFencedForSelectedPair reports whether any route between
// the selected entry and exit nodes has the status
// "fenced_by_service_channel_feedback".
func fabricServiceRoutesFencedForSelectedPair(routes []FabricServiceChannelRoute, selectedEntry, selectedExit string) bool {
	for i := range routes {
		candidate := &routes[i]
		if candidate.SourceNodeID != selectedEntry || candidate.DestinationNodeID != selectedExit {
			continue
		}
		if candidate.Status == "fenced_by_service_channel_feedback" {
			return true
		}
	}
	return false
}
// fabricServiceRoutesFencedForPool reports whether at least one route in
// routes has the status "fenced_by_service_channel_feedback".
func fabricServiceRoutesFencedForPool(routes []FabricServiceChannelRoute) bool {
	for i := range routes {
		if routes[i].Status == "fenced_by_service_channel_feedback" {
			return true
		}
	}
	return false
}
// defaultFabricServiceQoS returns the default QoS document (a JSON object
// encoded as a string) for a service class. VPN packets default to bulk
// priority, remote workspace and video to interactive priority (video also
// sets "adaptive"), and every other class to normal priority.
func defaultFabricServiceQoS(serviceClass string) string {
	qos := `{"priority":"normal","interactive":false,"bulk_limit_mbps":0}`
	switch serviceClass {
	case FabricServiceClassVPNPackets:
		qos = `{"priority":"bulk","interactive":false,"bulk_limit_mbps":0}`
	case FabricServiceClassRemoteWorkspace:
		qos = `{"priority":"interactive","interactive":true,"bulk_limit_mbps":0}`
	case FabricServiceClassVideo:
		qos = `{"priority":"interactive","interactive":true,"adaptive":true}`
	}
	return qos
}
// fabricServiceChannelHTTPIngress describes the direct HTTP ingress contract
// for a service class: the header names, the supported methods, and the
// class-specific HTTP path template, WebSocket path template and packet batch
// media type. Classes other than remote workspace, video and file transfer
// receive the VPN-connection templates.
func fabricServiceChannelHTTPIngress(serviceClass string) FabricServiceChannelHTTPIngress {
	var pathTemplate, webSocketPathTemplate, packetBatchFormat string
	switch serviceClass {
	case FabricServiceClassRemoteWorkspace:
		pathTemplate = "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/remote-workspaces/{resource_id}/streams/{channel_class}"
		webSocketPathTemplate = "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/remote-workspaces/{resource_id}/streams/ws"
		packetBatchFormat = "application/vnd.rap.remote-workspace-frame-batch.v1"
	case FabricServiceClassVideo:
		pathTemplate = "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/video-sessions/{resource_id}/streams/{channel_class}"
		webSocketPathTemplate = "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/video-sessions/{resource_id}/streams/ws"
		packetBatchFormat = "application/vnd.rap.video-frame-batch.v1"
	case FabricServiceClassFileTransfer:
		pathTemplate = "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/file-transfers/{resource_id}/chunks"
		webSocketPathTemplate = "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/file-transfers/{resource_id}/chunks/ws"
		packetBatchFormat = "application/vnd.rap.file-transfer-chunk-batch.v1"
	default:
		pathTemplate = "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/vpn-connections/{resource_id}/packets"
		webSocketPathTemplate = "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/vpn-connections/{resource_id}/packets/ws"
		packetBatchFormat = "application/vnd.rap.vpn-packet-batch.v1"
	}
	return FabricServiceChannelHTTPIngress{
		Type:                  "entry_direct_http_v1",
		TokenHeader:           "X-RAP-Service-Channel-Token",
		ServiceClassHeader:    "X-RAP-Service-Class",
		ChannelClassHeader:    "X-RAP-Channel-Class",
		SupportedMethods:      []string{"POST", "GET", "WEBSOCKET"},
		PathTemplate:          pathTemplate,
		WebSocketPathTemplate: webSocketPathTemplate,
		PacketBatchFormat:     packetBatchFormat,
	}
}
// fabricServiceChannelDataPlaneContract assembles the data-plane contract for
// a service class.
//
// Mode is "fabric_primary" unless fallback.Active, in which case it is
// "degraded_backend_fallback". BackendRelayPolicy is "disabled" unless the
// pool policy allows backend fallback or the fallback is allowed / uses the
// backend relay, in which case it is "degraded_fallback_only". Blank failover,
// rebuild and selection settings in poolPolicy default to "automatic" and
// "fastest_healthy" respectively.
func fabricServiceChannelDataPlaneContract(serviceClass string, poolPolicy FabricServiceChannelPoolPolicy, fallback FabricServiceChannelFallback) FabricServiceChannelDataPlaneContract {
	contract := FabricServiceChannelDataPlaneContract{
		SchemaVersion:                 "rap.fabric_service_channel_data_plane.v1",
		Mode:                          "fabric_primary",
		ControlPlaneTransport:         "backend_api",
		WorkingDataTransport:          "fabric_service_channel",
		SteadyStateTransport:          "fabric_route",
		BackendRelayPolicy:            "disabled",
		ProductionForwardingRequired:  true,
		ServiceNeutral:                true,
		ProtocolAgnostic:              true,
		LogicalFlowMode:               "multi_flow_isolated",
		EntryFailoverMode:             firstNonEmptyString(poolPolicy.EntryFailover, "automatic"),
		ExitFailoverMode:              firstNonEmptyString(poolPolicy.ExitFailover, "automatic"),
		RouteRebuildMode:              firstNonEmptyString(poolPolicy.RouteRebuild, "automatic"),
		RequiredFlowIsolationClasses:  fabricServiceChannelFlowIsolationClasses(serviceClass),
		RouteSelectionStrategy:        firstNonEmptyString(poolPolicy.SelectionStrategy, "fastest_healthy"),
		FailureDetectionSource:        "route_quality_feedback_and_runtime_heartbeats",
		DegradedFallbackVisibility:    "explicit_access_telemetry_and_rebuild_health",
		StableContractForServiceClass: serviceClass,
	}
	if poolPolicy.BackendFallbackAllowed || fallback.Allowed || fallback.BackendRelay {
		contract.BackendRelayPolicy = "degraded_fallback_only"
	}
	if fallback.Active {
		contract.Mode = "degraded_backend_fallback"
	}
	return contract
}
func fabricServiceChannelFlowIsolationClasses(serviceClass string) []string {
switch serviceClass {
case FabricServiceClassVPNPackets:
return []string{FabricChannelControl, FabricChannelInteractive, FabricChannelReliable, FabricChannelBulk, FabricChannelDroppable, "vpn_packet"}
case FabricServiceClassRemoteWorkspace:
return []string{FabricChannelControl, FabricChannelInteractive, FabricChannelReliable, FabricChannelBulk, FabricChannelDroppable}
case FabricServiceClassVideo:
return []string{FabricChannelControl, FabricChannelInteractive, FabricChannelDroppable}
case FabricServiceClassFileTransfer:
return []string{FabricChannelControl, FabricChannelReliable, FabricChannelBulk}
default:
return []string{FabricChannelControl, FabricChannelReliable}
}
}
// defaultFabricServiceFailover returns the default failover document as a
// JSON-encoded string: automatic route rebuild, automatic exit failover and
// sticky sessions enabled.
func defaultFabricServiceFailover() string {
	const failoverJSON = `{"route_rebuild":"automatic","exit_failover":"automatic","sticky_session":true}`
	return failoverJSON
}
// GetNodeSyntheticMeshConfig assembles and signs the synthetic mesh
// configuration for one node of a cluster.
//
// It returns ErrInvalidPayload when the cluster ID or node ID is blank after
// trimming. When the node's effective testing flags do not enable synthetic
// links, a signed "disabled" config is returned that carries only the mesh
// listener settings. Otherwise the config is built from the cluster's route
// intents, mesh links, service-channel route feedback and reported endpoints.
// Any store or helper error aborts the build and is returned unchanged, except
// for the rebuild-attempt recording near the end, whose error is discarded.
func (s *Service) GetNodeSyntheticMeshConfig(ctx context.Context, input GetNodeSyntheticMeshConfigInput) (NodeSyntheticMeshConfig, error) {
input.ClusterID = strings.TrimSpace(input.ClusterID)
input.NodeID = strings.TrimSpace(input.NodeID)
if input.ClusterID == "" || input.NodeID == "" {
return NodeSyntheticMeshConfig{}, ErrInvalidPayload
}
// Start from a disabled config; collections are initialized non-nil.
cfg := NodeSyntheticMeshConfig{
Enabled: false,
SchemaVersion: "c17z18.synthetic.v1",
ClusterID: input.ClusterID,
LocalNodeID: input.NodeID,
AuthorityRequired: true,
ConfigVersion: "disabled",
PeerDirectoryVersion: "disabled",
PolicyVersion: "disabled",
PeerEndpoints: map[string]string{},
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{},
PeerDirectory: []PeerDirectoryEntry{},
RecoverySeeds: []PeerRecoverySeed{},
RendezvousLeases: []PeerRendezvousLease{},
Routes: []SyntheticMeshRouteConfig{},
ProductionForwarding: false,
}
// The mesh listener config is attached even when synthetic links are disabled.
listenerConfig, err := s.nodeMeshListenerConfig(ctx, input)
if err != nil {
return NodeSyntheticMeshConfig{}, err
}
cfg.MeshListener = listenerConfig
if listenerConfig != nil && listenerConfig.ProductionForwarding {
cfg.ProductionForwarding = true
}
flags, err := s.store.GetEffectiveNodeTestingFlags(ctx, input.ClusterID, input.NodeID)
if err != nil {
return NodeSyntheticMeshConfig{}, err
}
if !flags.Enabled || !flags.SyntheticLinksEnabled {
return s.signSyntheticMeshConfig(ctx, cfg)
}
intents, err := s.store.ListRouteIntents(ctx, input.ClusterID)
if err != nil {
return NodeSyntheticMeshConfig{}, err
}
// The config, peer directory and policy versions share one timestamp-derived value.
cfg.Enabled = true
cfg.ConfigVersion = "c17z18-" + s.now().UTC().Format("20060102T150405Z")
cfg.PeerDirectoryVersion = cfg.ConfigVersion
cfg.PolicyVersion = cfg.ConfigVersion
if cfg.MeshListener != nil && cfg.MeshListener.ConfigVersion == "" {
cfg.MeshListener.ConfigVersion = cfg.ConfigVersion
}
meshLinks, err := s.store.ListMeshLinks(ctx, input.ClusterID)
if err != nil {
return NodeSyntheticMeshConfig{}, err
}
relayPolicy := newRendezvousRelayPolicy(input.NodeID, meshLinks, s.now())
recoveryPolicy := s.fabricServiceChannelRecoveryPolicy(ctx, input.ClusterID)
cluster, err := s.store.GetCluster(ctx, input.ClusterID)
if err != nil {
return NodeSyntheticMeshConfig{}, err
}
adaptivePolicy := fabricServiceChannelAdaptivePolicyFromCluster(cluster)
cfg.ServiceChannelAdaptivePolicy = &adaptivePolicy
routeProvenance := fabricServiceChannelRouteProvenanceFromIntents(intents)
// First feedback query: IncludeExpired is left unset, so this feeds the
// report and the per-route feedback map used for remediation below.
serviceChannelFeedbackItems, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, ListFabricServiceChannelRouteFeedbackInput{
ClusterID: input.ClusterID,
ReporterNodeID: input.NodeID,
Now: s.now(),
})
if err != nil {
return NodeSyntheticMeshConfig{}, err
}
cfg.ServiceChannelFeedback = serviceChannelRouteFeedbackReportWithPolicyAndProvenance(serviceChannelFeedbackItems, s.now(), recoveryPolicy, routeProvenance)
serviceChannelFeedback := fabricServiceChannelRouteFeedbackFromObservationsWithProvenance(serviceChannelFeedbackItems, s.now(), recoveryPolicy, routeProvenance)
cfg.ServiceChannelRemediationCommands, err = s.fabricServiceChannelRemediationCommandsForNode(ctx, input.ClusterID, input.NodeID, serviceChannelFeedback, s.now())
if err != nil {
return NodeSyntheticMeshConfig{}, err
}
if err := s.recordFabricServiceChannelRemediationRebuildIntents(ctx, input.ClusterID, input.NodeID, cfg.ServiceChannelRemediationCommands, s.now()); err != nil {
return NodeSyntheticMeshConfig{}, err
}
remediationRoutePathDecisions, err := s.resolveFabricServiceChannelRemediationRebuildIntents(ctx, input, cfg.ServiceChannelRemediationCommands, intents, serviceChannelFeedback, cfg.ConfigVersion, s.now())
if err != nil {
return NodeSyntheticMeshConfig{}, err
}
// Second feedback query with IncludeExpired set: the manual-retry feedback
// derived from it is merged into the per-route feedback map only after the
// remediation steps above have run.
serviceChannelExpiredFeedbackItems, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, ListFabricServiceChannelRouteFeedbackInput{
ClusterID: input.ClusterID,
ReporterNodeID: input.NodeID,
IncludeExpired: true,
Now: s.now(),
})
if err != nil {
return NodeSyntheticMeshConfig{}, err
}
mergeFabricServiceChannelRouteFeedback(serviceChannelFeedback, fabricServiceChannelManualRetryFeedbackFromObservationsWithProvenance(serviceChannelExpiredFeedbackItems, s.now(), recoveryPolicy, routeProvenance))
localPerspective, err := s.localEndpointPerspective(ctx, input.ClusterID, input.NodeID)
if err != nil {
return NodeSyntheticMeshConfig{}, err
}
peerDirectory := map[string]*PeerDirectoryEntry{}
recoverySeeds := map[string]PeerRecoverySeed{}
rendezvousLeases := map[string]PeerRendezvousLease{}
// Route path decisions start with a copy of the remediation decisions.
routePathDecisions := append([]RoutePathDecision{}, remediationRoutePathDecisions...)
for _, intent := range intents {
route, peers, candidates, seeds, policyLeases, ok := s.syntheticRouteFromIntent(input, intent)
if !ok {
continue
}
// A fenced route is not added to cfg.Routes; only its replacement decision
// is recorded.
if feedback, ok := serviceChannelFeedback[route.RouteID]; ok && feedback.Fenced {
replacementDecision := s.serviceChannelRouteReplacementDecision(input, route, intents, serviceChannelFeedback, cfg.ConfigVersion)
routePathDecisions = append(routePathDecisions, replacementDecision)
continue
}
reportedPeers, reportedCandidates, err := s.reportedEndpointConfig(ctx, input.ClusterID, input.NodeID, route.Hops, localPerspective)
if err != nil {
return NodeSyntheticMeshConfig{}, err
}
// Relay feedback, replacement hints and route health are accumulated into
// the shared relay policy on every route iteration.
feedback, err := s.rendezvousRelayFeedback(ctx, input.ClusterID, route.Hops, s.now())
if err != nil {
return NodeSyntheticMeshConfig{}, err
}
relayPolicy.addFeedback(feedback)
replacementHints, err := s.rendezvousRelayReplacementHints(ctx, input.ClusterID, route.Hops, s.now())
if err != nil {
return NodeSyntheticMeshConfig{}, err
}
relayPolicy.addReplacementHints(replacementHints)
relayPolicy.addFeedback(replacementHintFeedback(replacementHints, s.now()))
relayPolicy.addFeedback(rendezvousRelayRouteHealthFeedback(input.NodeID, route, meshLinks, s.now()))
// Reported endpoints overwrite the intent-derived endpoint per node;
// reported candidates are appended to the intent-derived candidates.
for nodeID, endpoint := range reportedPeers {
peers[nodeID] = endpoint
}
for nodeID, items := range reportedCandidates {
candidates[nodeID] = append(candidates[nodeID], items...)
}
routeLeases := scopedRendezvousLeases(policyLeases, route, input.NodeID, relayPolicy, s.now())
routeLeases = append(routeLeases, derivedRendezvousLeases(route, peers, candidates, input.NodeID, relayPolicy, s.now())...)
cfg.Routes = append(cfg.Routes, route)
routePathDecisions = append(routePathDecisions, routePathDecisionForRoute(route, input.NodeID, routeLeases, relayPolicy, cfg.ConfigVersion, serviceChannelFeedback[route.RouteID]))
mergePeerDirectoryRoute(peerDirectory, route, input.NodeID)
for nodeID, endpoint := range peers {
if strings.TrimSpace(nodeID) != "" && strings.TrimSpace(endpoint) != "" {
cfg.PeerEndpoints[nodeID] = endpoint
peerDirectoryEntry(peerDirectory, nodeID).EndpointCount++
}
}
for nodeID, nodeCandidates := range candidates {
if strings.TrimSpace(nodeID) == "" || len(nodeCandidates) == 0 {
continue
}
cfg.PeerEndpointCandidates[nodeID] = append(cfg.PeerEndpointCandidates[nodeID], nodeCandidates...)
mergePeerDirectoryCandidates(peerDirectory, nodeID, nodeCandidates)
}
mergeRecoverySeeds(recoverySeeds, seeds)
mergeRendezvousLeases(rendezvousLeases, routeLeases)
}
if err := s.addCoreMeshBootstrapPeers(ctx, input, &cfg, peerDirectory, recoverySeeds, rendezvousLeases, localPerspective); err != nil {
return NodeSyntheticMeshConfig{}, err
}
cfg.RecoverySeeds = sortedRecoverySeeds(recoverySeeds, maxScopedRecoverySeeds)
cfg.RendezvousLeases = sortedRendezvousLeases(rendezvousLeases, maxScopedRendezvousLeases)
cfg.RendezvousRelayPolicy = relayPolicy.report()
cfg.RoutePathDecisions = routePathDecisionReportWithRecoveryPolicy(cfg.ConfigVersion, routePathDecisions, recoveryPolicy)
// NOTE(review): the error from recording rebuild attempts is discarded, so a
// failure here does not fail config generation. This looks like deliberate
// best-effort bookkeeping — confirm, and consider logging the error.
_ = s.recordFabricServiceChannelRouteRebuildAttempts(ctx, input, cfg.RoutePathDecisions, cfg.ServiceChannelFeedback)
markPeerDirectoryRecoverySeeds(peerDirectory, cfg.RecoverySeeds)
markPeerDirectoryRendezvousLeases(peerDirectory, cfg.RendezvousLeases, input.NodeID)
cfg.PeerDirectory = sortedPeerDirectory(peerDirectory)
return s.signSyntheticMeshConfig(ctx, cfg)
}
// recordFabricServiceChannelRouteRebuildAttempts stores one rebuild-attempt
// record for every decision in report that carries a non-blank rebuild
// request ID.
//
// Observations from feedbackReport are indexed by route ID and joined to the
// decisions; a decision without a matching observation is recorded with
// zero-valued feedback fields. The outcome is "replacement_selected" when a
// replacement route is named, "no_alternate" when the decision source says so,
// and "degraded_fallback" otherwise. A nil or empty report is a no-op. The
// first store error stops the loop and is returned.
func (s *Service) recordFabricServiceChannelRouteRebuildAttempts(ctx context.Context, input GetNodeSyntheticMeshConfigInput, report *RoutePathDecisionReport, feedbackReport *FabricServiceChannelRouteFeedbackReport) error {
	if report == nil || len(report.Decisions) == 0 {
		return nil
	}

	// Index feedback observations by route so each decision can be joined to its feedback.
	observations := map[string]FabricServiceChannelRouteFeedbackObservation{}
	if feedbackReport != nil {
		for _, observation := range feedbackReport.Observations {
			if strings.TrimSpace(observation.RouteID) == "" {
				continue
			}
			observations[observation.RouteID] = observation
		}
	}

	for _, decision := range report.Decisions {
		// Only decisions that actually requested a rebuild are recorded.
		if strings.TrimSpace(decision.RebuildRequestID) == "" {
			continue
		}
		observed := observations[decision.RouteID]
		serviceClass := firstNonEmptyString(observed.ServiceClass, FabricServiceClassVPNPackets)

		outcome := "degraded_fallback"
		switch {
		case strings.TrimSpace(decision.ReplacementRouteID) != "":
			outcome = "replacement_selected"
		case decision.DecisionSource == "service_channel_feedback_no_alternate":
			outcome = "no_alternate"
		}

		correlation := mustJSONRaw(map[string]any{
			"schema_version":            "c18z98.route_rebuild_attempt_correlation.v1",
			"decision_id":               decision.DecisionID,
			"score_reasons":             decision.ScoreReasons,
			"path_score":                decision.PathScore,
			"local_role":                decision.LocalRole,
			"previous_hop_id":           decision.PreviousHopID,
			"next_hop_id":               decision.NextHopID,
			"control_plane_only":        decision.ControlPlaneOnly,
			"production_forwarding":     decision.ProductionForwarding,
			"decision_expires_at":       decision.ExpiresAt.UTC().Format(time.RFC3339Nano),
			"feedback_observation_id":   decision.FeedbackObservationID,
			"feedback_source":           decision.FeedbackSource,
			"feedback_observed_at":      formatOptionalTime(decision.FeedbackObservedAt),
			"feedback_expires_at":       formatOptionalTime(decision.FeedbackExpiresAt),
			"feedback_channel_id":       decision.FeedbackChannelID,
			"feedback_resource_id":      decision.FeedbackResourceID,
			"feedback_violation_status": decision.FeedbackViolationStatus,
			"feedback_violation_reason": decision.FeedbackViolationReason,
		})

		record := RecordFabricServiceChannelRouteRebuildAttemptInput{
			ClusterID:                        input.ClusterID,
			ReporterNodeID:                   input.NodeID,
			ServiceClass:                     serviceClass,
			RouteID:                          decision.RouteID,
			ReplacementRouteID:               decision.ReplacementRouteID,
			RebuildRequestID:                 decision.RebuildRequestID,
			RebuildStatus:                    decision.RebuildStatus,
			RebuildReason:                    decision.RebuildReason,
			RebuildAttempt:                   decision.RebuildAttempt,
			DecisionSource:                   decision.DecisionSource,
			Outcome:                          outcome,
			Generation:                       decision.Generation,
			PolicyFingerprint:                observed.EffectivePolicyFingerprint,
			ObservedPolicyFingerprint:        observed.ObservedPolicyFingerprint,
			ObservedRouteGeneration:          observed.ObservedRouteGeneration,
			EffectiveRouteGeneration:         observed.EffectiveRouteGeneration,
			FeedbackStatus:                   observed.FeedbackStatus,
			FeedbackObservationID:            decision.FeedbackObservationID,
			FeedbackSource:                   decision.FeedbackSource,
			FeedbackObservedAt:               decision.FeedbackObservedAt,
			FeedbackExpiresAt:                decision.FeedbackExpiresAt,
			FeedbackChannelID:                decision.FeedbackChannelID,
			FeedbackResourceID:               decision.FeedbackResourceID,
			FeedbackViolationStatus:          decision.FeedbackViolationStatus,
			FeedbackViolationReason:          decision.FeedbackViolationReason,
			FeedbackScoreAdjustment:          observed.ScoreAdjustment,
			FeedbackEffectiveScoreAdjustment: observed.EffectiveScoreAdjustment,
			FeedbackReasons:                  append([]string{}, observed.Reasons...),
			LastError:                        observed.LastError,
			ConsecutiveFailures:              observed.ConsecutiveFailures,
			StallCount:                       observed.StallCount,
			LastSendDurationMs:               observed.LastSendDurationMs,
			OldHops:                          append([]string{}, decision.OriginalHops...),
			ReplacementHops:                  append([]string{}, decision.EffectiveHops...),
			Payload:                          correlation,
		}
		if _, err := s.store.RecordFabricServiceChannelRouteRebuildAttempt(ctx, record); err != nil {
			return err
		}
	}
	return nil
}
// autoWarmFabricServiceChannelRouteRebuildAttemptSnapshot enriches a rebuild
// attempt from the reporter node's recent heartbeats and, when the enrichment
// produced any correlation signal, stores a correlation snapshot for it.
//
// It returns true only when a snapshot was written. It returns (false, nil)
// when the attempt already has a snapshot or when the heartbeats carry no
// correlation signal, and ErrInvalidPayload when the attempt has no reporter
// node ID. A zero now is replaced by the service clock (s.now), falling back
// to the wall clock only if that is zero as well, which matches how
// autoWarmFabricServiceChannelRouteRebuildSnapshotsAfterHeartbeat resolves it.
func (s *Service) autoWarmFabricServiceChannelRouteRebuildAttemptSnapshot(ctx context.Context, clusterID string, attempt FabricServiceChannelRouteRebuildAttempt, now time.Time) (bool, error) {
	if fabricServiceChannelRouteRebuildHasCorrelationSnapshot(attempt) {
		return false, nil
	}
	nodeID := strings.TrimSpace(attempt.ReporterNodeID)
	if nodeID == "" {
		return false, ErrInvalidPayload
	}
	// Prefer the injected clock so snapshot timestamps stay consistent with the
	// rest of the service (and controllable in tests).
	if now.IsZero() {
		now = s.now()
	}
	if now.IsZero() {
		now = time.Now().UTC()
	}

	// Number of most recent heartbeats requested for correlation.
	const heartbeatLookback = 120
	heartbeats, err := s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, heartbeatLookback)
	if err != nil {
		return false, err
	}
	attempt = enrichFabricServiceChannelRouteRebuildAttempt(attempt, heartbeats, now)

	// Nothing in the heartbeats correlates with this attempt yet; leave it cold.
	if !attempt.NodeTransitionMatched && !attempt.NodeRouteGenerationMatched && attempt.PostRebuildSelectedRouteID == "" && attempt.PostRebuildSendPackets == 0 && attempt.PostRebuildSendFlowPackets == 0 {
		return false, nil
	}
	attempt.CorrelationSnapshotAt = &now
	if err := s.store.UpdateFabricServiceChannelRouteRebuildCorrelationSnapshot(ctx, fabricServiceChannelRouteRebuildCorrelationSnapshotInput(attempt, now)); err != nil {
		return false, err
	}
	return true, nil
}
// formatOptionalTime renders value as an RFC 3339 timestamp with nanosecond
// precision in UTC. A nil pointer or a zero time yields the empty string.
func formatOptionalTime(value *time.Time) string {
	if value != nil && !value.IsZero() {
		return value.UTC().Format(time.RFC3339Nano)
	}
	return ""
}
// autoWarmFabricServiceChannelRouteRebuildSnapshotsAfterHeartbeat scans up to
// five rebuild attempts reported by the heartbeat's node and tries to write a
// correlation snapshot for each one that lacks it.
//
// Per-attempt failures are counted, not returned. An audit event summarising
// the scan is recorded only when at least one attempt was warmed or failed;
// the error from recording that event (or from listing the attempts) is the
// only error this method returns. A heartbeat without cluster or node ID is
// ignored.
func (s *Service) autoWarmFabricServiceChannelRouteRebuildSnapshotsAfterHeartbeat(ctx context.Context, heartbeat NodeHeartbeat) error {
	clusterID, nodeID := strings.TrimSpace(heartbeat.ClusterID), strings.TrimSpace(heartbeat.NodeID)
	if clusterID == "" || nodeID == "" {
		return nil
	}

	// Timestamp preference: heartbeat observation time, then service clock, then wall clock.
	now := heartbeat.ObservedAt
	if now.IsZero() {
		if now = s.now(); now.IsZero() {
			now = time.Now().UTC()
		}
	}

	query := ListFabricServiceChannelRouteRebuildAttemptsInput{
		ClusterID:      clusterID,
		ReporterNodeID: nodeID,
		Limit:          5,
	}
	attempts, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, query)
	if err != nil {
		return err
	}

	var warmed, fresh, failed int
	// Initialised as empty (non-nil) slices so they serialise as [] in the audit payload.
	attemptIDs := []string{}
	routeIDs := []string{}
	rebuildRequestIDs := []string{}
	generations := []string{}
	for _, attempt := range attempts {
		if fabricServiceChannelRouteRebuildHasCorrelationSnapshot(attempt) {
			fresh++
			continue
		}
		didWarm, warmErr := s.autoWarmFabricServiceChannelRouteRebuildAttemptSnapshot(ctx, clusterID, attempt, now)
		switch {
		case warmErr != nil:
			failed++
		case didWarm:
			warmed++
			attemptIDs = append(attemptIDs, attempt.ID)
			routeIDs = append(routeIDs, attempt.RouteID)
			rebuildRequestIDs = append(rebuildRequestIDs, attempt.RebuildRequestID)
			generations = append(generations, attempt.Generation)
		default:
			// Not warmed and no error: counted together with already-fresh attempts.
			fresh++
		}
	}

	// Quiet scans (nothing warmed, nothing failed) leave no audit trail.
	if warmed == 0 && failed == 0 {
		return nil
	}

	targetID := nodeID
	summary := mustJSONRaw(map[string]any{
		"schema_version":      "c18z45.rebuild_snapshot_auto_warmup.v1",
		"trigger":             "node_heartbeat",
		"reporter_node_id":    nodeID,
		"heartbeat_id":        heartbeat.ID,
		"scanned_count":       len(attempts),
		"warmed_count":        warmed,
		"already_fresh_count": fresh,
		"error_count":         failed,
		"warmed_attempt_ids":  attemptIDs,
		"warmed_route_ids":    routeIDs,
		"warmed_rebuild_ids":  rebuildRequestIDs,
		"warmed_generations":  generations,
	})
	return s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:  &clusterID,
		EventType:  "fabric.service_channel_rebuild_snapshot.auto_warmup",
		TargetType: "fabric_service_channel_route_rebuild_snapshot",
		TargetID:   &targetID,
		Payload:    summary,
		CreatedAt:  now.UTC(),
	})
}
// nodeMeshListenerConfig returns the listener configuration derived from the
// first desired workload of service type "mesh-listener" for the given node.
// It returns (nil, nil) when the node has no such workload.
func (s *Service) nodeMeshListenerConfig(ctx context.Context, input GetNodeSyntheticMeshConfigInput) (*NodeMeshListenerConfig, error) {
	desired, err := s.store.ListDesiredWorkloads(ctx, input.ClusterID, input.NodeID)
	if err != nil {
		return nil, err
	}
	for _, candidate := range desired {
		if strings.TrimSpace(candidate.ServiceType) == "mesh-listener" {
			listener, parseErr := nodeMeshListenerConfigFromDesired(candidate)
			if parseErr != nil {
				return nil, parseErr
			}
			return listener, nil
		}
	}
	return nil, nil
}
// desiredMeshListenerEndpointConfig turns a node's operator-configured
// mesh-listener workload into an advertised endpoint plus a single endpoint
// candidate carrying the given priority.
//
// It returns ("", nil, nil) when the node has no listener, the listener is
// not enabled, no advertise endpoint is configured, or the endpoint is
// rejected by isUnusableLocalPeerEndpoint. The candidate is validated before
// it is returned.
func (s *Service) desiredMeshListenerEndpointConfig(ctx context.Context, clusterID, nodeID string, priority int) (string, []PeerEndpointCandidate, error) {
	listener, err := s.nodeMeshListenerConfig(ctx, GetNodeSyntheticMeshConfigInput{ClusterID: clusterID, NodeID: nodeID})
	if err != nil {
		return "", nil, err
	}
	if listener == nil {
		return "", nil, nil
	}
	if strings.TrimSpace(listener.DesiredState) != "enabled" {
		return "", nil, nil
	}
	if strings.TrimSpace(listener.AdvertiseEndpoint) == "" {
		return "", nil, nil
	}

	// Normalise the advertised address: no surrounding whitespace, no trailing slashes.
	address := strings.TrimRight(strings.TrimSpace(listener.AdvertiseEndpoint), "/")
	if isUnusableLocalPeerEndpoint(address) {
		return "", nil, nil
	}

	mode := firstNonEmptyString(listener.ConnectivityMode, "direct")
	provenance, err := json.Marshal(map[string]any{
		"source":         "desired_workload.mesh-listener",
		"config_version": listener.ConfigVersion,
		"listen_addr":    listener.ListenAddr,
	})
	if err != nil {
		return "", nil, err
	}

	candidate := PeerEndpointCandidate{
		EndpointID:       nodeID + "-desired-mesh-listener",
		NodeID:           nodeID,
		Transport:        firstNonEmptyString(listener.AdvertiseTransport, "direct_http"),
		Address:          address,
		Reachability:     reachabilityFromConnectivityMode(mode),
		NATType:          firstNonEmptyString(listener.NATType, "unknown"),
		ConnectivityMode: mode,
		Region:           listener.Region,
		Priority:         priority,
		PolicyTags:       []string{"operator-configured", "desired-mesh-listener"},
		Metadata:         provenance,
	}
	byNode := map[string][]PeerEndpointCandidate{nodeID: {candidate}}
	if err := validatePeerEndpointCandidates(byNode, []string{nodeID}); err != nil {
		return "", nil, err
	}
	return address, []PeerEndpointCandidate{candidate}, nil
}
// nodeMeshListenerConfigFromDesired builds a NodeMeshListenerConfig from a
// desired-workload record.
//
// The workload's JSON config is read as a flat object. Recognised keys are
// listen_port_mode, listen_addr, auto_port_start, auto_port_end,
// advertise_endpoint, advertise_transport, connectivity_mode, nat_type,
// region, production_forwarding and production_forwarding_enabled.
//
// Defaults: the port mode is forced to "disabled" unless the workload's
// desired state is exactly "enabled", and otherwise defaults to "manual";
// listen_addr defaults to ":19131" unless disabled; the auto port range
// defaults to 19131-19231. ErrInvalidPayload is returned for undecodable
// config, an unknown port mode, or a start port above the end port.
func nodeMeshListenerConfigFromDesired(workload NodeWorkloadDesiredState) (*NodeMeshListenerConfig, error) {
	// settings stays nil when no config is present; lookups on a nil map simply miss.
	var settings map[string]any
	if len(workload.Config) > 0 {
		if err := json.Unmarshal(workload.Config, &settings); err != nil {
			return nil, ErrInvalidPayload
		}
	}

	// text returns the trimmed string stored under key, or "" for anything else.
	text := func(key string) string {
		s, isString := settings[key].(string)
		if !isString {
			return ""
		}
		return strings.TrimSpace(s)
	}
	// number returns the integer stored under key (JSON numbers are truncated), or 0.
	number := func(key string) int {
		switch n := settings[key].(type) {
		case float64:
			return int(n)
		case int:
			return n
		default:
			return 0
		}
	}
	// flag accepts JSON booleans and the strings "1", "true", "yes", "enabled" (case-insensitive).
	flag := func(key string) bool {
		switch v := settings[key].(type) {
		case bool:
			return v
		case string:
			switch strings.ToLower(strings.TrimSpace(v)) {
			case "1", "true", "yes", "enabled":
				return true
			}
		}
		return false
	}

	portMode := strings.ToLower(text("listen_port_mode"))
	switch {
	case workload.DesiredState != "enabled":
		portMode = "disabled"
	case portMode == "":
		portMode = "manual"
	}
	if portMode != "manual" && portMode != "auto" && portMode != "disabled" {
		return nil, ErrInvalidPayload
	}

	addr := text("listen_addr")
	if portMode != "disabled" && addr == "" {
		addr = ":19131"
	}

	rangeStart, rangeEnd := number("auto_port_start"), number("auto_port_end")
	if rangeStart <= 0 {
		rangeStart = 19131
	}
	if rangeEnd <= 0 {
		rangeEnd = 19231
	}
	if rangeStart > rangeEnd {
		return nil, ErrInvalidPayload
	}

	forwarding := flag("production_forwarding") || flag("production_forwarding_enabled")

	listener := NodeMeshListenerConfig{
		SchemaVersion:        "c17z23.mesh_listener_config.v1",
		Source:               "desired_workload.mesh-listener",
		DesiredState:         firstNonEmptyString(workload.DesiredState, "disabled"),
		ListenAddr:           addr,
		ListenPortMode:       portMode,
		AutoPortStart:        rangeStart,
		AutoPortEnd:          rangeEnd,
		AdvertiseEndpoint:    strings.TrimRight(text("advertise_endpoint"), "/"),
		AdvertiseTransport:   text("advertise_transport"),
		ConnectivityMode:     text("connectivity_mode"),
		NATType:              text("nat_type"),
		Region:               text("region"),
		ConfigVersion:        stringPtrValue(workload.Version),
		UpdatedByUserID:      stringPtrValue(workload.UpdatedByUserID),
		UpdatedAt:            workload.UpdatedAt.UTC().Format(time.RFC3339Nano),
		ControlPlaneOnly:     !forwarding,
		ProductionForwarding: forwarding,
	}
	return &listener, nil
}
// addCoreMeshBootstrapPeers adds other cluster nodes to cfg as bootstrap peers
// when the requesting node holds an active "core-mesh" role; without that role
// it returns nil and leaves everything untouched.
//
// For every eligible peer it merges the operator-configured mesh-listener
// endpoint with the endpoint report from the peer's latest heartbeat, scopes
// the result for the local node, and records it in cfg.PeerEndpoints,
// cfg.PeerEndpointCandidates, peerDirectory, rendezvousLeases and
// recoverySeeds. Any store error aborts the whole operation and is returned.
func (s *Service) addCoreMeshBootstrapPeers(ctx context.Context, input GetNodeSyntheticMeshConfigInput, cfg *NodeSyntheticMeshConfig, peerDirectory map[string]*PeerDirectoryEntry, recoverySeeds map[string]PeerRecoverySeed, rendezvousLeases map[string]PeerRendezvousLease, localPerspective endpointPerspective) error {
roles, err := s.store.ListNodeRoleAssignments(ctx, input.ClusterID, input.NodeID)
if err != nil {
return err
}
if !hasActiveNodeRole(roles, "core-mesh") {
return nil
}
nodes, err := s.store.ListClusterNodes(ctx, input.ClusterID)
if err != nil {
return err
}
// Order peers: healthy before everything else, then most recently seen,
// then oldest registration. Only healthy nodes survive the filter below, so
// the relative order of non-healthy nodes does not affect the result.
sort.SliceStable(nodes, func(i, j int) bool {
if nodes[i].HealthStatus != nodes[j].HealthStatus {
return nodes[i].HealthStatus == "healthy"
}
iSeen := nodeLastSeen(nodes[i])
jSeen := nodeLastSeen(nodes[j])
if !iSeen.Equal(jSeen) {
return iSeen.After(jSeen)
}
return nodes[i].CreatedAt.Before(nodes[j].CreatedAt)
})
// added counts accepted peers; it doubles as the candidate priority and the
// recovery-seed rank handed to the helpers below.
added := 0
for _, node := range nodes {
// Skip the local node and any peer that is not active, registered and healthy.
if node.ID == input.NodeID ||
node.ID == "" ||
node.MembershipStatus != "active" ||
node.RegistrationStatus != NodeRegistrationActive ||
node.HealthStatus != "healthy" {
continue
}
desiredEndpoint, desiredCandidates, err := s.desiredMeshListenerEndpointConfig(ctx, input.ClusterID, node.ID, added)
if err != nil {
return err
}
// Once the bootstrap target is reached, only peers whose configured
// listener offers a directly usable endpoint are still accepted.
if added >= defaultCoreMeshBootstrapPeerTarget && !hasDirectUsableEndpointCandidate(desiredCandidates) {
continue
}
// Requests a single heartbeat; presumably the most recent one (depends on
// the store's ordering, which is not visible here).
heartbeats, err := s.store.ListNodeHeartbeats(ctx, input.ClusterID, node.ID, 1)
if err != nil {
return err
}
// Nothing known about this peer at all: no heartbeat and no configured listener.
if len(heartbeats) == 0 && desiredEndpoint == "" && len(desiredCandidates) == 0 {
continue
}
// The operator-configured endpoint wins over the heartbeat-reported one;
// candidates from both sources are concatenated, configured ones first.
endpoint := desiredEndpoint
candidates := append([]PeerEndpointCandidate{}, desiredCandidates...)
if len(heartbeats) > 0 {
reportedEndpoint, reportedCandidates, ok := endpointReportFromHeartbeat(heartbeats[0])
if ok {
if endpoint == "" {
endpoint = reportedEndpoint
}
candidates = append(candidates, reportedCandidates...)
}
}
endpoint, candidates = scopeEndpointReportForLocal(localPerspective, endpoint, candidates)
if endpoint != "" {
cfg.PeerEndpoints[node.ID] = endpoint
peerDirectoryEntry(peerDirectory, node.ID).EndpointCount++
}
if len(candidates) > 0 {
cfg.PeerEndpointCandidates[node.ID] = append(cfg.PeerEndpointCandidates[node.ID], candidates...)
mergePeerDirectoryCandidates(peerDirectory, node.ID, candidates)
if lease, ok := controlPlaneBootstrapRendezvousLease(input.ClusterID, node.ID, candidates, localPerspective, s.now()); ok {
mergeRendezvousLeases(rendezvousLeases, []PeerRendezvousLease{lease})
}
}
// A recovery seed is kept only when its endpoint does not require rendezvous.
seed := recoverySeedFromEndpointReport(node.ID, endpoint, candidates, added)
if seed.NodeID != "" && !endpointCandidateRequiresRendezvous(PeerEndpointCandidate{
Address: seed.Endpoint,
Transport: seed.Transport,
ConnectivityMode: seed.ConnectivityMode,
Reachability: reachabilityFromConnectivityMode(seed.ConnectivityMode),
}) {
mergeRecoverySeeds(recoverySeeds, []PeerRecoverySeed{seed})
}
// NOTE(review): the peer is counted even when scoping left it with no
// endpoint and no candidates — confirm this is intended.
added++
}
return nil
}
// hasDirectUsableEndpointCandidate reports whether at least one candidate has
// a non-blank address, is not flagged by endpointCandidatePrivateForOffsite,
// and does not require rendezvous.
func hasDirectUsableEndpointCandidate(candidates []PeerEndpointCandidate) bool {
	for i := range candidates {
		current := candidates[i]
		if strings.TrimSpace(current.Address) == "" {
			continue
		}
		if endpointCandidatePrivateForOffsite(current) {
			continue
		}
		if endpointCandidateRequiresRendezvous(current) {
			continue
		}
		return true
	}
	return false
}
// signSyntheticMeshConfig attaches the cluster authority descriptor and a
// signed authority payload to cfg and returns the signed copy.
//
// The signed payload covers a hash of cfg as JSON with AuthorityPayload and
// AuthoritySignature cleared, and expires five minutes after it is issued.
// On any error an empty NodeSyntheticMeshConfig is returned.
func (s *Service) signSyntheticMeshConfig(ctx context.Context, cfg NodeSyntheticMeshConfig) (NodeSyntheticMeshConfig, error) {
authorityKey, err := s.ensureClusterAuthority(ctx, cfg.ClusterID, nil)
if err != nil {
return NodeSyntheticMeshConfig{}, err
}
// These two fields are set before hashing, so they are part of the signed content.
cfg.AuthorityRequired = true
cfg.ClusterAuthority = authorityDescriptor(authorityKey)
// Hash the config without any payload/signature fields so the hash does not
// depend on its own output.
unsigned := cfg
unsigned.AuthorityPayload = nil
unsigned.AuthoritySignature = nil
rawConfig, err := json.Marshal(unsigned)
if err != nil {
return NodeSyntheticMeshConfig{}, err
}
configHash, err := clusterauth.HashRaw(rawConfig)
if err != nil {
return NodeSyntheticMeshConfig{}, err
}
issuedAt := s.now().UTC()
payload := clusterMeshConfigAuthorityPayload{
SchemaVersion: clusterMeshConfigAuthoritySchema,
ClusterID: cfg.ClusterID,
LocalNodeID: cfg.LocalNodeID,
ConfigVersion: cfg.ConfigVersion,
ConfigSHA256: configHash,
IssuedAt: issuedAt,
// Signed snapshots are valid for five minutes from issuance.
ExpiresAt: issuedAt.Add(5 * time.Minute),
ControlPlaneOnly: !cfg.ProductionForwarding,
ProductionForwarding: cfg.ProductionForwarding,
}
rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, issuedAt)
if err != nil {
return NodeSyntheticMeshConfig{}, err
}
cfg.AuthorityPayload = rawPayload
cfg.AuthoritySignature = &signature
return cfg, nil
}
// RecordNodeTelemetry validates and stores a telemetry observation for a node.
//
// Cluster and node IDs are required (ErrInvalidPayload otherwise). The payload
// is passed through defaultJSON with `{}` as the fallback and must be valid
// JSON; a zero ObservedAt is replaced with the service clock.
func (s *Service) RecordNodeTelemetry(ctx context.Context, input RecordNodeTelemetryInput) (NodeTelemetryObservation, error) {
	var none NodeTelemetryObservation
	if input.ClusterID == "" {
		return none, ErrInvalidPayload
	}
	if input.NodeID == "" {
		return none, ErrInvalidPayload
	}

	input.Payload = defaultJSON(input.Payload, `{}`)
	if ok := json.Valid(input.Payload); !ok {
		return none, errors.New("telemetry payload must be valid json")
	}

	if input.ObservedAt.IsZero() {
		input.ObservedAt = s.now()
	}
	return s.store.RecordNodeTelemetry(ctx, input)
}
// ListNodeTelemetry returns up to limit telemetry observations for a node.
// The actor must pass the platform-admin check.
func (s *Service) ListNodeTelemetry(ctx context.Context, actorUserID, clusterID, nodeID string, limit int) ([]NodeTelemetryObservation, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	return s.store.ListNodeTelemetry(ctx, clusterID, nodeID, limit)
}
// SetDesiredWorkload stores the desired state of a workload on a node.
//
// The actor must be a platform admin and the cluster must be mutable. Cluster
// ID, node ID and (trimmed) service type are required. DesiredState defaults
// to "disabled", RuntimeMode to "container", and Config/Environment are passed
// through defaultJSON with `{}` as the fallback; both must be valid JSON. After
// a successful store write an audit event is recorded on a best-effort basis.
func (s *Service) SetDesiredWorkload(ctx context.Context, input SetDesiredWorkloadInput) (NodeWorkloadDesiredState, error) {
	var none NodeWorkloadDesiredState

	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return none, err
	}

	input.ServiceType = strings.TrimSpace(input.ServiceType)
	missingRequired := input.ClusterID == "" || input.NodeID == "" || input.ServiceType == ""
	if missingRequired {
		return none, ErrInvalidPayload
	}

	if input.DesiredState == "" {
		input.DesiredState = "disabled"
	}
	if input.RuntimeMode == "" {
		input.RuntimeMode = "container"
	}
	input.Config = defaultJSON(input.Config, `{}`)
	input.Environment = defaultJSON(input.Environment, `{}`)
	if !(json.Valid(input.Config) && json.Valid(input.Environment)) {
		return none, errors.New("config and environment must be valid json")
	}

	stored, err := s.store.SetDesiredWorkload(ctx, input)
	if err != nil {
		return none, err
	}

	// Auditing is best-effort: a failed audit write does not fail the mutation.
	audit := ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "node_workload.desired_state_set",
		TargetType:  "node",
		TargetID:    &input.NodeID,
		Payload:     json.RawMessage(`{"supervision_runtime":"stub_c5"}`),
		CreatedAt:   s.now(),
	}
	_ = s.store.RecordAudit(ctx, audit)
	return stored, nil
}
// ListDesiredWorkloads returns the desired workloads of a node.
//
// The platform-admin check is applied only when actorUserID is non-blank; a
// blank actor skips it (presumably for internal/node-agent callers — verify
// against call sites). Cluster and node IDs are required.
func (s *Service) ListDesiredWorkloads(ctx context.Context, actorUserID, clusterID, nodeID string) ([]NodeWorkloadDesiredState, error) {
	if actor := strings.TrimSpace(actorUserID); actor != "" {
		if err := s.ensurePlatformAdmin(ctx, actor); err != nil {
			return nil, err
		}
	}
	switch "" {
	case clusterID, nodeID:
		return nil, ErrInvalidPayload
	}
	return s.store.ListDesiredWorkloads(ctx, clusterID, nodeID)
}
// ReportWorkloadStatus validates and stores a workload status report.
//
// Cluster ID, node ID and (trimmed) service type are required. ReportedState
// defaults to "unknown", RuntimeMode to "container", and StatusPayload is
// passed through defaultJSON with `{}` as the fallback; it must be valid JSON.
func (s *Service) ReportWorkloadStatus(ctx context.Context, input ReportWorkloadStatusInput) (NodeWorkloadStatus, error) {
	var none NodeWorkloadStatus

	input.ServiceType = strings.TrimSpace(input.ServiceType)
	for _, required := range []string{input.ClusterID, input.NodeID, input.ServiceType} {
		if required == "" {
			return none, ErrInvalidPayload
		}
	}

	if input.ReportedState == "" {
		input.ReportedState = "unknown"
	}
	if input.RuntimeMode == "" {
		input.RuntimeMode = "container"
	}

	input.StatusPayload = defaultJSON(input.StatusPayload, `{}`)
	if valid := json.Valid(input.StatusPayload); !valid {
		return none, errors.New("status_payload must be valid json")
	}
	return s.store.ReportWorkloadStatus(ctx, input)
}
// ListLatestWorkloadStatuses returns the latest workload statuses of a node.
// The actor must pass the platform-admin check.
func (s *Service) ListLatestWorkloadStatuses(ctx context.Context, actorUserID, clusterID, nodeID string) ([]NodeWorkloadStatus, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	return s.store.ListLatestWorkloadStatuses(ctx, clusterID, nodeID)
}
// ReportMeshLink validates and stores a mesh link observation.
//
// Cluster, source-node and target-node IDs are required. LinkStatus defaults
// to "unknown" and Metadata is passed through defaultJSON with `{}` as the
// fallback; it must be valid JSON.
func (s *Service) ReportMeshLink(ctx context.Context, input ReportMeshLinkInput) (MeshLinkObservation, error) {
	var none MeshLinkObservation

	for _, required := range []string{input.ClusterID, input.SourceNodeID, input.TargetNodeID} {
		if required == "" {
			return none, ErrInvalidPayload
		}
	}
	if input.LinkStatus == "" {
		input.LinkStatus = "unknown"
	}

	input.Metadata = defaultJSON(input.Metadata, `{}`)
	if valid := json.Valid(input.Metadata); !valid {
		return none, errors.New("metadata must be valid json")
	}
	return s.store.ReportMeshLink(ctx, input)
}
// ListMeshLinks returns the mesh link observations of a cluster.
// The actor must pass the platform-admin check.
func (s *Service) ListMeshLinks(ctx context.Context, actorUserID, clusterID string) ([]MeshLinkObservation, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	return s.store.ListMeshLinks(ctx, clusterID)
}
// CreateRouteIntent creates a mesh route intent.
//
// The actor must be a platform admin and the cluster must be mutable. Cluster
// ID and service class are required. A zero Priority becomes 100, and the
// source selector, destination selector and policy are passed through
// defaultJSON with `{}` as the fallback; all three must be valid JSON. The
// returned item carries lifecycle fields computed against the service clock.
// An audit event is recorded on a best-effort basis.
func (s *Service) CreateRouteIntent(ctx context.Context, input CreateRouteIntentInput) (MeshRouteIntent, error) {
	var none MeshRouteIntent

	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return none, err
	}
	if input.ClusterID == "" {
		return none, ErrInvalidPayload
	}
	if input.ServiceClass == "" {
		return none, ErrInvalidPayload
	}

	if input.Priority == 0 {
		input.Priority = 100
	}
	input.SourceSelector = defaultJSON(input.SourceSelector, `{}`)
	input.DestinationSelector = defaultJSON(input.DestinationSelector, `{}`)
	input.Policy = defaultJSON(input.Policy, `{}`)
	allValid := json.Valid(input.SourceSelector) && json.Valid(input.DestinationSelector) && json.Valid(input.Policy)
	if !allValid {
		return none, errors.New("source_selector, destination_selector, and policy must be valid json")
	}

	created, err := s.store.CreateRouteIntent(ctx, input)
	if err != nil {
		return none, err
	}
	created = routeIntentWithLifecycle(created, s.now())

	// Auditing is best-effort: a failed audit write does not fail the creation.
	audit := ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "mesh.route_intent.created",
		TargetType:  "mesh_route_intent",
		TargetID:    &created.ID,
		Payload:     json.RawMessage(`{"traffic_forwarding_enabled":false}`),
		CreatedAt:   s.now(),
	}
	_ = s.store.RecordAudit(ctx, audit)
	return created, nil
}
// ListRouteIntents returns the cluster's route intents with lifecycle fields
// computed against the service clock. The actor must pass the platform-admin
// check.
func (s *Service) ListRouteIntents(ctx context.Context, actorUserID, clusterID string) ([]MeshRouteIntent, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	stored, listErr := s.store.ListRouteIntents(ctx, clusterID)
	if listErr != nil {
		return nil, listErr
	}
	return routeIntentsWithLifecycle(stored, s.now()), nil
}
// ExpireRouteIntent expires a route intent as of the current service time.
//
// All string inputs are trimmed first. The actor must be a platform admin and
// the cluster must be mutable; cluster and route-intent IDs are required. A
// blank reason defaults to "operator expired route intent". The returned item
// carries lifecycle fields, and an audit event with the reason and expiry
// timestamp is recorded on a best-effort basis.
func (s *Service) ExpireRouteIntent(ctx context.Context, input RouteIntentLifecycleInput) (MeshRouteIntent, error) {
	var none MeshRouteIntent

	input.ActorUserID = strings.TrimSpace(input.ActorUserID)
	input.ClusterID = strings.TrimSpace(input.ClusterID)
	input.RouteIntentID = strings.TrimSpace(input.RouteIntentID)
	input.Reason = strings.TrimSpace(input.Reason)

	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return none, err
	}
	if input.ClusterID == "" {
		return none, ErrInvalidPayload
	}
	if input.RouteIntentID == "" {
		return none, ErrInvalidPayload
	}
	if input.Reason == "" {
		input.Reason = "operator expired route intent"
	}

	expiresAt := s.now().UTC()
	expired, err := s.store.ExpireRouteIntent(ctx, input, expiresAt)
	if err != nil {
		return none, err
	}
	expired = routeIntentWithLifecycle(expired, s.now())

	// Auditing is best-effort: a failed audit write does not fail the expiry.
	details := mustJSONRaw(map[string]any{"reason": input.Reason, "expires_at": expiresAt.Format(time.RFC3339Nano)})
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "mesh.route_intent.expired",
		TargetType:  "mesh_route_intent",
		TargetID:    &expired.ID,
		Payload:     details,
		CreatedAt:   s.now(),
	})
	return expired, nil
}
// DisableRouteIntent disables a route intent.
//
// All string inputs are trimmed first. The actor must be a platform admin and
// the cluster must be mutable; cluster and route-intent IDs are required. A
// blank reason defaults to "operator disabled route intent". The returned item
// carries lifecycle fields, and an audit event with the reason is recorded on
// a best-effort basis.
func (s *Service) DisableRouteIntent(ctx context.Context, input RouteIntentLifecycleInput) (MeshRouteIntent, error) {
	var none MeshRouteIntent

	input.ActorUserID = strings.TrimSpace(input.ActorUserID)
	input.ClusterID = strings.TrimSpace(input.ClusterID)
	input.RouteIntentID = strings.TrimSpace(input.RouteIntentID)
	input.Reason = strings.TrimSpace(input.Reason)

	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return none, err
	}
	if input.ClusterID == "" {
		return none, ErrInvalidPayload
	}
	if input.RouteIntentID == "" {
		return none, ErrInvalidPayload
	}
	if input.Reason == "" {
		input.Reason = "operator disabled route intent"
	}

	disabled, err := s.store.DisableRouteIntent(ctx, input)
	if err != nil {
		return none, err
	}
	disabled = routeIntentWithLifecycle(disabled, s.now())

	// Auditing is best-effort: a failed audit write does not fail the mutation.
	details := mustJSONRaw(map[string]any{"reason": input.Reason})
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "mesh.route_intent.disabled",
		TargetType:  "mesh_route_intent",
		TargetID:    &disabled.ID,
		Payload:     details,
		CreatedAt:   s.now(),
	})
	return disabled, nil
}
// routeIntentsWithLifecycle returns a new slice in which every route intent
// has its lifecycle fields computed relative to now. The input slice is not
// modified, and the result is never nil.
func routeIntentsWithLifecycle(items []MeshRouteIntent, now time.Time) []MeshRouteIntent {
	decorated := make([]MeshRouteIntent, len(items))
	for idx := range items {
		decorated[idx] = routeIntentWithLifecycle(items[idx], now)
	}
	return decorated
}
// routeIntentWithLifecycle fills in the derived lifecycle fields of a route
// intent relative to now.
//
// When the policy JSON decodes and carries an expiry, PolicyExpiresAt is set
// (in UTC) and IsExpired is raised once that instant is not after now.
// LifecycleStatus is "disabled" when Status is exactly "disabled", otherwise
// "expired" when IsExpired is set, otherwise the trimmed Status, or "active"
// when that is empty.
func routeIntentWithLifecycle(item MeshRouteIntent, now time.Time) MeshRouteIntent {
	item.LifecycleStatus = strings.TrimSpace(item.Status)

	var policy syntheticRoutePolicy
	if json.Unmarshal(item.Policy, &policy) == nil && policy.ExpiresAt != nil {
		deadline := policy.ExpiresAt.UTC()
		item.PolicyExpiresAt = &deadline
		if !deadline.After(now.UTC()) {
			item.IsExpired = true
		}
	}

	// Precedence: an explicit disable wins over expiry, which wins over the stored status.
	if item.Status == "disabled" {
		item.LifecycleStatus = "disabled"
		return item
	}
	if item.IsExpired {
		item.LifecycleStatus = "expired"
		return item
	}
	if item.LifecycleStatus == "" {
		item.LifecycleStatus = "active"
	}
	return item
}
// ListQoSPolicies returns the mesh QoS policies of a cluster.
// The actor must pass the platform-admin check.
func (s *Service) ListQoSPolicies(ctx context.Context, actorUserID, clusterID string) ([]MeshQoSPolicy, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	return s.store.ListQoSPolicies(ctx, clusterID)
}
// ListFabricEntryPoints returns the fabric entry points of a cluster.
// The actor must pass the platform-admin check.
func (s *Service) ListFabricEntryPoints(ctx context.Context, actorUserID, clusterID string) ([]FabricEntryPoint, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	return s.store.ListFabricEntryPoints(ctx, clusterID)
}
// CreateFabricEntryPoint creates a fabric entry point.
//
// The actor must be a platform admin and the cluster must be mutable. Name,
// Status and EndpointType are trimmed; Status defaults to "active" and
// EndpointType to "client_access". Cluster ID and name are required and the
// status and type must pass isFabricEndpointStatus / isFabricEntryPointType.
// A blank PublicEndpoint is normalised to nil, otherwise it is trimmed. Policy
// and Metadata are passed through defaultJSON with `{}` as the fallback and
// must be valid JSON. An audit event is recorded on a best-effort basis.
func (s *Service) CreateFabricEntryPoint(ctx context.Context, input CreateFabricEntryPointInput) (FabricEntryPoint, error) {
	var none FabricEntryPoint

	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return none, err
	}

	input.Name = strings.TrimSpace(input.Name)
	if input.Status = strings.TrimSpace(input.Status); input.Status == "" {
		input.Status = "active"
	}
	if input.EndpointType = strings.TrimSpace(input.EndpointType); input.EndpointType == "" {
		input.EndpointType = "client_access"
	}

	switch {
	case input.ClusterID == "",
		input.Name == "",
		!isFabricEndpointStatus(input.Status),
		!isFabricEntryPointType(input.EndpointType):
		return none, ErrInvalidPayload
	}

	if endpoint := input.PublicEndpoint; endpoint != nil {
		if cleaned := strings.TrimSpace(*endpoint); cleaned != "" {
			input.PublicEndpoint = &cleaned
		} else {
			input.PublicEndpoint = nil
		}
	}

	input.Policy = defaultJSON(input.Policy, `{}`)
	input.Metadata = defaultJSON(input.Metadata, `{}`)
	if !(json.Valid(input.Policy) && json.Valid(input.Metadata)) {
		return none, errors.New("entry point policy and metadata must be valid json")
	}

	created, err := s.store.CreateFabricEntryPoint(ctx, input)
	if err != nil {
		return none, err
	}

	// Auditing is best-effort: a failed audit write does not fail the creation.
	audit := ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "fabric.entry_point.created",
		TargetType:  "fabric_entry_point",
		TargetID:    &created.ID,
		Payload:     json.RawMessage(`{"runtime_routing_enabled":false}`),
		CreatedAt:   s.now(),
	}
	_ = s.store.RecordAudit(ctx, audit)
	return created, nil
}
// SetFabricEntryPointNode attaches a node to a fabric entry point or updates
// the attachment.
//
// The actor must be a platform admin and the cluster must be mutable. Status
// is trimmed and defaults to "active"; a non-positive Priority becomes 100.
// Cluster, entry-point and node IDs are required and the status must pass
// isFabricEndpointStatus. Metadata is passed through defaultJSON with `{}` as
// the fallback and must be valid JSON.
func (s *Service) SetFabricEntryPointNode(ctx context.Context, input SetFabricEntryPointNodeInput) (FabricEntryPointNode, error) {
	var none FabricEntryPointNode

	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return none, err
	}

	if input.Status = strings.TrimSpace(input.Status); input.Status == "" {
		input.Status = "active"
	}
	if input.Priority <= 0 {
		input.Priority = 100
	}

	switch {
	case input.ClusterID == "",
		input.EntryPointID == "",
		input.NodeID == "",
		!isFabricEndpointStatus(input.Status):
		return none, ErrInvalidPayload
	}

	input.Metadata = defaultJSON(input.Metadata, `{}`)
	if valid := json.Valid(input.Metadata); !valid {
		return none, errors.New("entry point node metadata must be valid json")
	}
	return s.store.SetFabricEntryPointNode(ctx, input)
}
// ListFabricEntryPointNodes returns the nodes attached to a fabric entry
// point. The actor must pass the platform-admin check; cluster and entry-point
// IDs are required.
func (s *Service) ListFabricEntryPointNodes(ctx context.Context, actorUserID, clusterID, entryPointID string) ([]FabricEntryPointNode, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	switch "" {
	case clusterID, entryPointID:
		return nil, ErrInvalidPayload
	}
	return s.store.ListFabricEntryPointNodes(ctx, clusterID, entryPointID)
}
// ListFabricEgressPools returns the fabric egress pools of a cluster.
// The actor must pass the platform-admin check.
func (s *Service) ListFabricEgressPools(ctx context.Context, actorUserID, clusterID string) ([]FabricEgressPool, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	return s.store.ListFabricEgressPools(ctx, clusterID)
}
// CreateFabricEgressPool creates a fabric egress pool.
//
// The actor must be a platform admin and the cluster must be mutable. Name and
// Status are trimmed; Status defaults to "active". Cluster ID and name are
// required and the status must pass isFabricEndpointStatus. A blank
// Description is normalised to nil, otherwise it is trimmed. RouteScope,
// Policy and Metadata are passed through defaultJSON with `{}` as the fallback
// and must be valid JSON. An audit event is recorded on a best-effort basis.
func (s *Service) CreateFabricEgressPool(ctx context.Context, input CreateFabricEgressPoolInput) (FabricEgressPool, error) {
	var none FabricEgressPool

	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return none, err
	}

	input.Name = strings.TrimSpace(input.Name)
	if input.Status = strings.TrimSpace(input.Status); input.Status == "" {
		input.Status = "active"
	}

	switch {
	case input.ClusterID == "",
		input.Name == "",
		!isFabricEndpointStatus(input.Status):
		return none, ErrInvalidPayload
	}

	if description := input.Description; description != nil {
		if cleaned := strings.TrimSpace(*description); cleaned != "" {
			input.Description = &cleaned
		} else {
			input.Description = nil
		}
	}

	input.RouteScope = defaultJSON(input.RouteScope, `{}`)
	input.Policy = defaultJSON(input.Policy, `{}`)
	input.Metadata = defaultJSON(input.Metadata, `{}`)
	if !(json.Valid(input.RouteScope) && json.Valid(input.Policy) && json.Valid(input.Metadata)) {
		return none, errors.New("egress pool route_scope, policy, and metadata must be valid json")
	}

	created, err := s.store.CreateFabricEgressPool(ctx, input)
	if err != nil {
		return none, err
	}

	// Auditing is best-effort: a failed audit write does not fail the creation.
	audit := ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "fabric.egress_pool.created",
		TargetType:  "fabric_egress_pool",
		TargetID:    &created.ID,
		Payload:     json.RawMessage(`{"runtime_routing_enabled":false}`),
		CreatedAt:   s.now(),
	}
	_ = s.store.RecordAudit(ctx, audit)
	return created, nil
}
// SetFabricEgressPoolNode attaches a node to a fabric egress pool or updates
// the attachment.
//
// The actor must be a platform admin and the cluster must be mutable. Status
// is trimmed and defaults to "active"; a non-positive Priority becomes 100.
// Cluster, egress-pool and node IDs are required and the status must pass
// isFabricEndpointStatus. Metadata is passed through defaultJSON with `{}` as
// the fallback and must be valid JSON.
func (s *Service) SetFabricEgressPoolNode(ctx context.Context, input SetFabricEgressPoolNodeInput) (FabricEgressPoolNode, error) {
	var none FabricEgressPoolNode

	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return none, err
	}

	if input.Status = strings.TrimSpace(input.Status); input.Status == "" {
		input.Status = "active"
	}
	if input.Priority <= 0 {
		input.Priority = 100
	}

	switch {
	case input.ClusterID == "",
		input.EgressPoolID == "",
		input.NodeID == "",
		!isFabricEndpointStatus(input.Status):
		return none, ErrInvalidPayload
	}

	input.Metadata = defaultJSON(input.Metadata, `{}`)
	if valid := json.Valid(input.Metadata); !valid {
		return none, errors.New("egress pool node metadata must be valid json")
	}
	return s.store.SetFabricEgressPoolNode(ctx, input)
}
// ListFabricEgressPoolNodes returns the nodes attached to a fabric egress
// pool. The actor must pass the platform-admin check; cluster and egress-pool
// IDs are required.
func (s *Service) ListFabricEgressPoolNodes(ctx context.Context, actorUserID, clusterID, egressPoolID string) ([]FabricEgressPoolNode, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	switch "" {
	case clusterID, egressPoolID:
		return nil, ErrInvalidPayload
	}
	return s.store.ListFabricEgressPoolNodes(ctx, clusterID, egressPoolID)
}
// GetClusterAuthorityState returns the authority state of a cluster.
// The actor must pass the platform-admin check.
func (s *Service) GetClusterAuthorityState(ctx context.Context, actorUserID, clusterID string) (ClusterAuthorityState, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return ClusterAuthorityState{}, authErr
	}
	return s.store.GetClusterAuthorityState(ctx, clusterID)
}
// UpdateClusterAuthorityState changes the authority state and mutation mode of
// a cluster and records a "cluster_authority.updated" audit event.
//
// The actor's platform role is loaded from the store (using the trimmed actor
// ID) and must satisfy isPlatformAdminRole. Selecting the "recovery_override"
// mutation mode additionally requires PlatformRoleRecoveryAdmin. Blank
// AuthorityState and MutationMode default to "authoritative" and "normal".
//
// Returns ErrAccessDenied when a role check fails, or the store error when the
// role lookup or the update itself fails.
func (s *Service) UpdateClusterAuthorityState(ctx context.Context, input UpdateClusterAuthorityInput) (ClusterAuthorityState, error) {
	role, err := s.store.GetPlatformRole(ctx, strings.TrimSpace(input.ActorUserID))
	if err != nil {
		return ClusterAuthorityState{}, err
	}
	if !isPlatformAdminRole(role) {
		return ClusterAuthorityState{}, ErrAccessDenied
	}

	// Normalise before the privilege gate: comparing the raw value would let a
	// whitespace-padded "recovery_override" slip past the recovery-admin check.
	input.MutationMode = strings.TrimSpace(input.MutationMode)
	input.AuthorityState = strings.TrimSpace(input.AuthorityState)

	if input.MutationMode == "recovery_override" && role != PlatformRoleRecoveryAdmin {
		return ClusterAuthorityState{}, ErrAccessDenied
	}
	if input.AuthorityState == "" {
		input.AuthorityState = "authoritative"
	}
	if input.MutationMode == "" {
		input.MutationMode = "normal"
	}

	item, err := s.store.UpdateClusterAuthorityState(ctx, input)
	if err != nil {
		return ClusterAuthorityState{}, err
	}

	// Audit recording is best-effort: its error is deliberately discarded so a
	// failed audit write does not fail an update that already succeeded.
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "cluster_authority.updated",
		TargetType:  "cluster",
		TargetID:    &input.ClusterID,
		Payload:     json.RawMessage(`{"split_brain_guard":true}`),
		CreatedAt:   s.now(),
	})
	return item, nil
}
// ListClusterAdminSummaries returns the store's cluster admin summaries once
// ensurePlatformAdmin has accepted the actor.
func (s *Service) ListClusterAdminSummaries(ctx context.Context, actorUserID string) ([]ClusterAdminSummary, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}

	summaries, err := s.store.ListClusterAdminSummaries(ctx)
	return summaries, err
}
// CreateVPNConnection validates a VPN connection definition, stores it, and
// records a "vpn_connection.created" audit event.
//
// Checks run in this order: ensurePlatformAdmin, ensureClusterMutable, field
// normalisation, then validation. Blank ProtocolFamily, Mode, and DesiredState
// default to "generic", VPNConnectionModeSingleActive, and
// VPNConnectionDesiredDisabled. Every JSON field is passed through defaultJSON
// and must then be valid JSON.
//
// Returns ErrInvalidPayload when the cluster ID, organization ID, or name is
// empty, and a descriptive error for an unsupported mode, desired state, or
// malformed JSON field.
func (s *Service) CreateVPNConnection(ctx context.Context, input CreateVPNConnectionInput) (VPNConnection, error) {
	var none VPNConnection

	if authErr := s.ensurePlatformAdmin(ctx, input.ActorUserID); authErr != nil {
		return none, authErr
	}
	if mutErr := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); mutErr != nil {
		return none, mutErr
	}

	// Trim the free-text fields and fill in defaults for the optional ones.
	input.Name = strings.TrimSpace(input.Name)
	if input.ProtocolFamily = strings.TrimSpace(input.ProtocolFamily); input.ProtocolFamily == "" {
		input.ProtocolFamily = "generic"
	}
	if input.Mode = strings.TrimSpace(input.Mode); input.Mode == "" {
		input.Mode = VPNConnectionModeSingleActive
	}
	if input.DesiredState = strings.TrimSpace(input.DesiredState); input.DesiredState == "" {
		input.DesiredState = VPNConnectionDesiredDisabled
	}

	switch {
	case input.ClusterID == "", input.OrganizationID == "", input.Name == "":
		return none, ErrInvalidPayload
	case input.Mode != VPNConnectionModeSingleActive:
		return none, errors.New("vpn connection mode must be single_active")
	case !isAllowedVPNDesiredState(input.DesiredState):
		return none, errors.New("vpn connection desired_state must be enabled or disabled")
	}

	input.TargetEndpoint = defaultJSON(input.TargetEndpoint, `{}`)
	input.AllowedNodePolicy = defaultJSON(input.AllowedNodePolicy, `{"mode":"explicit","node_ids":[]}`)
	input.RoutingUsage = defaultJSON(input.RoutingUsage, `[]`)
	input.RoutePolicy = defaultJSON(input.RoutePolicy, `{}`)
	input.QoSPolicy = defaultJSON(input.QoSPolicy, `{}`)
	input.PlacementPolicy = defaultJSON(input.PlacementPolicy, `{}`)
	input.Metadata = defaultJSON(input.Metadata, `{}`)

	documents := [][]byte{
		input.TargetEndpoint,
		input.AllowedNodePolicy,
		input.RoutingUsage,
		input.RoutePolicy,
		input.QoSPolicy,
		input.PlacementPolicy,
		input.Metadata,
	}
	for _, document := range documents {
		if !json.Valid(document) {
			return none, errors.New("vpn connection json fields must be valid json")
		}
	}

	created, err := s.store.CreateVPNConnection(ctx, input)
	if err != nil {
		return none, err
	}

	// Best-effort audit: the error is intentionally ignored.
	event := ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "vpn_connection.created",
		TargetType:  "vpn_connection",
		TargetID:    &created.ID,
		Payload:     json.RawMessage(`{"runtime_created":false}`),
		CreatedAt:   s.now(),
	}
	_ = s.store.RecordAudit(ctx, event)
	return created, nil
}
// ListVPNConnections returns the store's VPN connections for clusterID once
// ensurePlatformAdmin has accepted the actor. clusterID is forwarded to the
// store without validation.
func (s *Service) ListVPNConnections(ctx context.Context, actorUserID, clusterID string) ([]VPNConnection, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}

	connections, err := s.store.ListVPNConnections(ctx, clusterID)
	return connections, err
}
// GetVPNConnection loads one VPN connection from the store once
// ensurePlatformAdmin has accepted the actor. A pgx.ErrNoRows result is
// translated to ErrInvalidVPNConnection; any other store error is returned
// unchanged alongside whatever value the store produced.
func (s *Service) GetVPNConnection(ctx context.Context, actorUserID, clusterID, vpnConnectionID string) (VPNConnection, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return VPNConnection{}, authErr
	}

	connection, lookupErr := s.store.GetVPNConnection(ctx, clusterID, vpnConnectionID)
	if !errors.Is(lookupErr, pgx.ErrNoRows) {
		return connection, lookupErr
	}
	return VPNConnection{}, ErrInvalidVPNConnection
}
// UpdateVPNConnectionDesiredState stores a new desired state for a VPN
// connection and records a "vpn_connection.desired_state_changed" audit event.
//
// The actor is checked by ensurePlatformAdmin and the cluster by
// ensureClusterMutable. The trimmed desired state must be accepted by
// isAllowedVPNDesiredState. A pgx.ErrNoRows result from the store is reported
// as ErrInvalidVPNConnection.
func (s *Service) UpdateVPNConnectionDesiredState(ctx context.Context, input UpdateVPNConnectionDesiredStateInput) (VPNConnection, error) {
	var none VPNConnection

	if authErr := s.ensurePlatformAdmin(ctx, input.ActorUserID); authErr != nil {
		return none, authErr
	}
	if mutErr := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); mutErr != nil {
		return none, mutErr
	}

	if input.DesiredState = strings.TrimSpace(input.DesiredState); !isAllowedVPNDesiredState(input.DesiredState) {
		return none, errors.New("vpn connection desired_state must be enabled or disabled")
	}

	updated, err := s.store.UpdateVPNConnectionDesiredState(ctx, input)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return none, ErrInvalidVPNConnection
	case err != nil:
		return none, err
	}

	// Best-effort audit: the error is intentionally ignored.
	event := ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "vpn_connection.desired_state_changed",
		TargetType:  "vpn_connection",
		TargetID:    &input.VPNConnectionID,
		Payload:     json.RawMessage(`{"runtime_executed":false}`),
		CreatedAt:   s.now(),
	}
	_ = s.store.RecordAudit(ctx, event)
	return updated, nil
}
// UpsertVPNConnectionRoutePolicy validates and stores a route policy for a VPN
// connection, then records a "vpn_connection.route_policy_changed" audit
// event.
//
// The actor is checked by ensurePlatformAdmin and the cluster by
// ensureClusterMutable. Blank Action and Status default to "allow" and
// "active"; a zero Priority defaults to 100 (negative values are kept as
// given). ErrInvalidPayload is returned for missing identifiers, route type,
// or destination, and for values rejected by the isAllowedVPN* helpers. A
// pgx.ErrNoRows result from the store is reported as ErrInvalidVPNConnection.
func (s *Service) UpsertVPNConnectionRoutePolicy(ctx context.Context, input UpsertVPNConnectionRoutePolicyInput) (VPNConnectionRoutePolicy, error) {
	var none VPNConnectionRoutePolicy

	if authErr := s.ensurePlatformAdmin(ctx, input.ActorUserID); authErr != nil {
		return none, authErr
	}
	if mutErr := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); mutErr != nil {
		return none, mutErr
	}

	input.RouteType = strings.TrimSpace(input.RouteType)
	input.Destination = strings.TrimSpace(input.Destination)
	if input.Action = strings.TrimSpace(input.Action); input.Action == "" {
		input.Action = "allow"
	}
	if input.Status = strings.TrimSpace(input.Status); input.Status == "" {
		input.Status = "active"
	}
	if input.Priority == 0 {
		input.Priority = 100
	}

	switch {
	case input.ClusterID == "",
		input.VPNConnectionID == "",
		input.RouteType == "",
		input.Destination == "":
		return none, ErrInvalidPayload
	case !isAllowedVPNRouteType(input.RouteType),
		!isAllowedVPNRouteAction(input.Action),
		!isAllowedVPNPolicyStatus(input.Status):
		return none, ErrInvalidPayload
	}

	if input.Policy = defaultJSON(input.Policy, `{}`); !json.Valid(input.Policy) {
		return none, errors.New("vpn route policy json must be valid json")
	}

	stored, err := s.store.UpsertVPNConnectionRoutePolicy(ctx, input)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return none, ErrInvalidVPNConnection
	case err != nil:
		return none, err
	}

	// Best-effort audit: the error is intentionally ignored.
	event := ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "vpn_connection.route_policy_changed",
		TargetType:  "vpn_connection",
		TargetID:    &input.VPNConnectionID,
		Payload:     json.RawMessage(`{"routing_runtime_changed":false}`),
		CreatedAt:   s.now(),
	}
	_ = s.store.RecordAudit(ctx, event)
	return stored, nil
}
// ListVPNConnectionRoutePolicies returns the store's route policies for one
// VPN connection once ensurePlatformAdmin has accepted the actor. The
// identifiers are forwarded to the store without validation.
func (s *Service) ListVPNConnectionRoutePolicies(ctx context.Context, actorUserID, clusterID, vpnConnectionID string) ([]VPNConnectionRoutePolicy, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}

	policies, err := s.store.ListVPNConnectionRoutePolicies(ctx, clusterID, vpnConnectionID)
	return policies, err
}
// SetVPNConnectionAllowedNodes replaces the allowed-node list of a VPN
// connection and records a "vpn_connection.allowed_nodes_changed" audit event.
//
// The actor is checked by ensurePlatformAdmin and the cluster by
// ensureClusterMutable. A blank RolePreference defaults to "candidate". Node
// IDs are trimmed, blanks are dropped, and duplicates are removed while
// keeping first-seen order; the list handed to the store is always non-nil.
// A pgx.ErrNoRows result from the store is reported as
// ErrInvalidVPNConnection.
func (s *Service) SetVPNConnectionAllowedNodes(ctx context.Context, input SetVPNConnectionAllowedNodesInput) ([]VPNConnectionAllowedNode, error) {
	if authErr := s.ensurePlatformAdmin(ctx, input.ActorUserID); authErr != nil {
		return nil, authErr
	}
	if mutErr := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); mutErr != nil {
		return nil, mutErr
	}

	if input.RolePreference = strings.TrimSpace(input.RolePreference); input.RolePreference == "" {
		input.RolePreference = "candidate"
	}

	switch {
	case input.ClusterID == "", input.VPNConnectionID == "":
		return nil, ErrInvalidPayload
	case !isAllowedVPNNodePreference(input.RolePreference):
		return nil, ErrInvalidPayload
	}

	if input.Metadata = defaultJSON(input.Metadata, `{}`); !json.Valid(input.Metadata) {
		return nil, errors.New("allowed node metadata must be valid json")
	}

	// Normalise the node list: trimmed, non-empty, unique, original order.
	unique := make([]string, 0, len(input.NodeIDs))
	known := make(map[string]struct{}, len(input.NodeIDs))
	for _, rawID := range input.NodeIDs {
		id := strings.TrimSpace(rawID)
		if _, dup := known[id]; dup || id == "" {
			continue
		}
		known[id] = struct{}{}
		unique = append(unique, id)
	}
	input.NodeIDs = unique

	stored, err := s.store.SetVPNConnectionAllowedNodes(ctx, input)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return nil, ErrInvalidVPNConnection
	case err != nil:
		return nil, err
	}

	// Best-effort audit: the error is intentionally ignored.
	event := ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "vpn_connection.allowed_nodes_changed",
		TargetType:  "vpn_connection",
		TargetID:    &input.VPNConnectionID,
		Payload:     json.RawMessage(`{"node_runtime_changed":false}`),
		CreatedAt:   s.now(),
	}
	_ = s.store.RecordAudit(ctx, event)
	return stored, nil
}
// ListVPNConnectionAllowedNodes returns the store's allowed nodes for one VPN
// connection once ensurePlatformAdmin has accepted the actor. The identifiers
// are forwarded to the store without validation.
func (s *Service) ListVPNConnectionAllowedNodes(ctx context.Context, actorUserID, clusterID, vpnConnectionID string) ([]VPNConnectionAllowedNode, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}

	nodes, err := s.store.ListVPNConnectionAllowedNodes(ctx, clusterID, vpnConnectionID)
	return nodes, err
}
// AcquireVPNConnectionLease acquires an ownership lease on a VPN connection
// for input.OwnerNodeID and records a "vpn_connection.lease_acquired" audit
// event.
//
// Preconditions checked here, in order: ensurePlatformAdmin accepts the actor;
// cluster, connection, and owner node IDs are non-empty; the connection exists
// and is in VPNConnectionModeSingleActive with VPNConnectionDesiredEnabled;
// ensureVPNLeaseOwnerEligible accepts the owner node. A non-positive TTL
// defaults to 30 seconds. A fresh fencing token from generateFencingToken is
// passed to the store together with the expiry (s.now() + TTL).
//
// Errors: ErrInvalidPayload, ErrInvalidVPNConnection (connection lookup found
// no row), ErrInvalidVPNLease (acquire found no row), ErrVPNLeaseAlreadyActive,
// or the underlying error.
func (s *Service) AcquireVPNConnectionLease(ctx context.Context, input AcquireVPNConnectionLeaseInput) (VPNConnectionLease, error) {
	var none VPNConnectionLease

	if authErr := s.ensurePlatformAdmin(ctx, input.ActorUserID); authErr != nil {
		return none, authErr
	}
	switch "" {
	case input.ClusterID, input.VPNConnectionID, input.OwnerNodeID:
		return none, ErrInvalidPayload
	}

	connection, lookupErr := s.store.GetVPNConnection(ctx, input.ClusterID, input.VPNConnectionID)
	switch {
	case errors.Is(lookupErr, pgx.ErrNoRows):
		return none, ErrInvalidVPNConnection
	case lookupErr != nil:
		return none, lookupErr
	}

	leasable := connection.Mode == VPNConnectionModeSingleActive && connection.DesiredState == VPNConnectionDesiredEnabled
	if !leasable {
		return none, errors.New("vpn connection must be enabled single_active before lease acquisition")
	}
	if ownerErr := s.ensureVPNLeaseOwnerEligible(ctx, input.ClusterID, input.VPNConnectionID, input.OwnerNodeID); ownerErr != nil {
		return none, ownerErr
	}

	if input.TTL <= 0 {
		input.TTL = 30 * time.Second
	}
	if input.Metadata = defaultJSON(input.Metadata, `{}`); !json.Valid(input.Metadata) {
		return none, errors.New("lease metadata must be valid json")
	}

	fencingToken, tokenErr := generateFencingToken()
	if tokenErr != nil {
		return none, tokenErr
	}

	expiresAt := s.now().Add(input.TTL)
	lease, err := s.store.AcquireVPNConnectionLease(ctx, input, expiresAt, fencingToken)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return none, ErrInvalidVPNLease
	case errors.Is(err, ErrVPNLeaseAlreadyActive):
		// Return the bare sentinel, dropping any wrapping the store added.
		return none, ErrVPNLeaseAlreadyActive
	case err != nil:
		return none, err
	}

	// Best-effort audit: the error is intentionally ignored.
	event := ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "vpn_connection.lease_acquired",
		TargetType:  "vpn_connection",
		TargetID:    &input.VPNConnectionID,
		Payload:     json.RawMessage(`{"vpn_runtime_started":false}`),
		CreatedAt:   s.now(),
	}
	_ = s.store.RecordAudit(ctx, event)
	return lease, nil
}
// RenewVPNConnectionLease extends an existing VPN connection lease on behalf
// of a platform administrator and records a "vpn_connection.lease_renewed"
// audit event.
//
// The actor is checked by ensurePlatformAdmin. Cluster, connection, lease, and
// owner node IDs plus the fencing token must all be non-empty, otherwise
// ErrInvalidPayload is returned. A non-positive TTL defaults to 30 seconds and
// the owner node must still pass ensureVPNLeaseOwnerEligible. The new expiry
// handed to the store is s.now() + TTL. A pgx.ErrNoRows result from the store
// is reported as ErrInvalidVPNLease.
func (s *Service) RenewVPNConnectionLease(ctx context.Context, input RenewVPNConnectionLeaseInput) (VPNConnectionLease, error) {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return VPNConnectionLease{}, err
	}
	if input.ClusterID == "" || input.VPNConnectionID == "" || input.LeaseID == "" || input.OwnerNodeID == "" || input.FencingToken == "" {
		return VPNConnectionLease{}, ErrInvalidPayload
	}
	if input.TTL <= 0 {
		input.TTL = 30 * time.Second
	}
	if err := s.ensureVPNLeaseOwnerEligible(ctx, input.ClusterID, input.VPNConnectionID, input.OwnerNodeID); err != nil {
		return VPNConnectionLease{}, err
	}
	item, err := s.store.RenewVPNConnectionLease(ctx, input, s.now().Add(input.TTL))
	if errors.Is(err, pgx.ErrNoRows) {
		return VPNConnectionLease{}, ErrInvalidVPNLease
	}
	if err != nil {
		return VPNConnectionLease{}, err
	}
	// Audit recording is best-effort: its error is deliberately discarded so a
	// failed audit write does not fail a renewal that already succeeded.
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "vpn_connection.lease_renewed",
		TargetType:  "vpn_connection",
		TargetID:    &input.VPNConnectionID,
		Payload:     json.RawMessage(`{"vpn_runtime_changed":false}`),
		CreatedAt:   s.now(),
	})
	// err is known to be nil here; return a literal nil like the sibling
	// methods so the success path cannot leak a stale error variable.
	return item, nil
}
// RenewNodeVPNAssignmentLease extends a VPN connection lease on behalf of the
// owning node and records a "vpn_connection.lease_renewed_by_node" audit event
// (without an actor user ID).
//
// This method performs no platform-admin check itself. After trimming, the
// cluster, connection, lease, and owner node IDs must be non-empty. A
// non-positive TTL defaults to two minutes. The node must pass
// ensureVPNLeaseOwnerEligible, and the assignments listed for the node must
// contain an "active_owner" entry for this connection whose active lease
// matches both the lease ID and the owner node; otherwise
// ErrVPNLeaseOwnerNotAllowed is returned. A pgx.ErrNoRows result from the
// renewal is reported as ErrInvalidVPNLease.
func (s *Service) RenewNodeVPNAssignmentLease(ctx context.Context, input RenewNodeVPNAssignmentLeaseInput) (VPNConnectionLease, error) {
	var none VPNConnectionLease

	input.ClusterID = strings.TrimSpace(input.ClusterID)
	input.VPNConnectionID = strings.TrimSpace(input.VPNConnectionID)
	input.LeaseID = strings.TrimSpace(input.LeaseID)
	input.OwnerNodeID = strings.TrimSpace(input.OwnerNodeID)
	switch "" {
	case input.ClusterID, input.VPNConnectionID, input.LeaseID, input.OwnerNodeID:
		return none, ErrInvalidPayload
	}
	if input.TTL <= 0 {
		input.TTL = 2 * time.Minute
	}

	if ownerErr := s.ensureVPNLeaseOwnerEligible(ctx, input.ClusterID, input.VPNConnectionID, input.OwnerNodeID); ownerErr != nil {
		return none, ownerErr
	}

	assignments, listErr := s.store.ListNodeVPNAssignments(ctx, input.ClusterID, input.OwnerNodeID)
	if listErr != nil {
		return none, listErr
	}

	// The node may only renew a lease it can currently see as its own.
	holdsLease := false
	for _, candidate := range assignments {
		if candidate.VPNConnectionID != input.VPNConnectionID || candidate.AssignmentReason != "active_owner" {
			continue
		}
		active := candidate.ActiveLease
		if active == nil || active.LeaseID != input.LeaseID || active.OwnerNodeID != input.OwnerNodeID {
			continue
		}
		holdsLease = true
		break
	}
	if !holdsLease {
		return none, ErrVPNLeaseOwnerNotAllowed
	}

	expiresAt := s.now().Add(input.TTL)
	renewed, err := s.store.RenewNodeVPNAssignmentLease(ctx, input, expiresAt)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return none, ErrInvalidVPNLease
	case err != nil:
		return none, err
	}

	// Best-effort audit: the error is intentionally ignored.
	event := ClusterAuditEvent{
		ClusterID:  &input.ClusterID,
		EventType:  "vpn_connection.lease_renewed_by_node",
		TargetType: "vpn_connection",
		TargetID:   &input.VPNConnectionID,
		Payload:    json.RawMessage(`{"node_agent_runtime_executed":true}`),
		CreatedAt:  s.now(),
	}
	_ = s.store.RecordAudit(ctx, event)
	return renewed, nil
}
// ReleaseVPNConnectionLease releases a VPN connection lease and records a
// "vpn_connection.lease_released" audit event.
//
// The actor is checked by ensurePlatformAdmin. Cluster, connection, lease, and
// owner node IDs plus the fencing token must all be non-empty, otherwise
// ErrInvalidPayload is returned. A pgx.ErrNoRows result from the store is
// reported as ErrInvalidVPNLease.
func (s *Service) ReleaseVPNConnectionLease(ctx context.Context, input ReleaseVPNConnectionLeaseInput) (VPNConnectionLease, error) {
	var none VPNConnectionLease

	if authErr := s.ensurePlatformAdmin(ctx, input.ActorUserID); authErr != nil {
		return none, authErr
	}
	switch "" {
	case input.ClusterID, input.VPNConnectionID, input.LeaseID, input.OwnerNodeID, input.FencingToken:
		return none, ErrInvalidPayload
	}

	released, err := s.store.ReleaseVPNConnectionLease(ctx, input)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return none, ErrInvalidVPNLease
	case err != nil:
		return none, err
	}

	// Best-effort audit: the error is intentionally ignored.
	event := ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "vpn_connection.lease_released",
		TargetType:  "vpn_connection",
		TargetID:    &input.VPNConnectionID,
		Payload:     json.RawMessage(`{"vpn_runtime_stopped":false}`),
		CreatedAt:   s.now(),
	}
	_ = s.store.RecordAudit(ctx, event)
	return released, nil
}
// FenceVPNConnectionLease fences the owner of a VPN connection lease and
// records a "vpn_connection.owner_fenced" audit event.
//
// Unlike the other lease operations in this file, the actor is checked by
// ensurePlatformRecoveryAdmin. A blank reason is replaced with a default text.
// Cluster, connection, and lease IDs must be non-empty, otherwise
// ErrInvalidPayload is returned. A pgx.ErrNoRows result from the store is
// reported as ErrInvalidVPNLease.
func (s *Service) FenceVPNConnectionLease(ctx context.Context, input FenceVPNConnectionLeaseInput) (VPNConnectionLease, error) {
	var none VPNConnectionLease

	if authErr := s.ensurePlatformRecoveryAdmin(ctx, input.ActorUserID); authErr != nil {
		return none, authErr
	}

	if input.Reason = strings.TrimSpace(input.Reason); input.Reason == "" {
		input.Reason = "fenced by platform recovery administrator"
	}
	switch "" {
	case input.ClusterID, input.VPNConnectionID, input.LeaseID:
		return none, ErrInvalidPayload
	}

	fenced, err := s.store.FenceVPNConnectionLease(ctx, input)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return none, ErrInvalidVPNLease
	case err != nil:
		return none, err
	}

	// Best-effort audit: the error is intentionally ignored.
	event := ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "vpn_connection.owner_fenced",
		TargetType:  "vpn_connection",
		TargetID:    &input.VPNConnectionID,
		Payload:     json.RawMessage(`{"split_brain_guard":true}`),
		CreatedAt:   s.now(),
	}
	_ = s.store.RecordAudit(ctx, event)
	return fenced, nil
}
// GetActiveVPNConnectionLease returns the store's active lease for a VPN
// connection once ensurePlatformAdmin has accepted the actor. A pgx.ErrNoRows
// result is translated to ErrInvalidVPNLease; any other store error is
// returned unchanged alongside whatever value the store produced.
func (s *Service) GetActiveVPNConnectionLease(ctx context.Context, actorUserID, clusterID, vpnConnectionID string) (VPNConnectionLease, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return VPNConnectionLease{}, authErr
	}

	lease, lookupErr := s.store.GetActiveVPNConnectionLease(ctx, clusterID, vpnConnectionID)
	if !errors.Is(lookupErr, pgx.ErrNoRows) {
		return lease, lookupErr
	}
	return VPNConnectionLease{}, ErrInvalidVPNLease
}
// ExpireStaleVPNConnectionLeases asks the store to expire the cluster's stale
// leases as of s.now() and records one "vpn_connection.lease_expired" audit
// event per lease the store returns.
//
// The actor is checked by ensurePlatformAdmin; an empty cluster ID yields
// ErrInvalidPayload.
func (s *Service) ExpireStaleVPNConnectionLeases(ctx context.Context, input ExpireStaleVPNConnectionLeasesInput) ([]VPNConnectionLease, error) {
	if authErr := s.ensurePlatformAdmin(ctx, input.ActorUserID); authErr != nil {
		return nil, authErr
	}
	if input.ClusterID == "" {
		return nil, ErrInvalidPayload
	}

	expired, err := s.store.ExpireStaleVPNConnectionLeases(ctx, input.ClusterID, s.now())
	if err != nil {
		return nil, err
	}

	for idx := range expired {
		// Copy the ID so each event points at its own string.
		targetID := expired[idx].VPNConnectionID
		event := ClusterAuditEvent{
			ClusterID:   &input.ClusterID,
			ActorUserID: &input.ActorUserID,
			EventType:   "vpn_connection.lease_expired",
			TargetType:  "vpn_connection",
			TargetID:    &targetID,
			Payload:     json.RawMessage(`{"stale_reclamation":true,"vpn_runtime_changed":false}`),
			CreatedAt:   s.now(),
		}
		// Best-effort audit: the error is intentionally ignored.
		_ = s.store.RecordAudit(ctx, event)
	}
	return expired, nil
}
// ListNodeVPNAssignments returns the store's VPN assignments for a node. Both
// identifiers are trimmed and must be non-empty, otherwise ErrInvalidPayload
// is returned. This method performs no actor authorization itself.
func (s *Service) ListNodeVPNAssignments(ctx context.Context, clusterID, nodeID string) ([]NodeVPNAssignment, error) {
	cluster, node := strings.TrimSpace(clusterID), strings.TrimSpace(nodeID)
	switch "" {
	case cluster, node:
		return nil, ErrInvalidPayload
	}
	return s.store.ListNodeVPNAssignments(ctx, cluster, node)
}
// ReportNodeVPNAssignmentStatus stores a node's observed status for one of its
// VPN assignments and records a "vpn_connection.assignment_status_reported"
// audit event (without an actor user ID).
//
// This method performs no platform-admin check itself. After trimming, the
// cluster, node, and connection IDs must be non-empty. A blank observed status
// becomes VPNAssignmentStatusUnknown and must then be accepted by
// isAllowedVPNAssignmentStatus. The status payload is passed through
// defaultJSON and must be valid JSON; a zero ObservedAt is replaced with
// s.now(). The connection must appear among the assignments the store lists
// for the node, otherwise ErrVPNLeaseOwnerNotAllowed is returned.
func (s *Service) ReportNodeVPNAssignmentStatus(ctx context.Context, input ReportNodeVPNAssignmentStatusInput) (NodeVPNAssignmentStatus, error) {
	var none NodeVPNAssignmentStatus

	input.ClusterID = strings.TrimSpace(input.ClusterID)
	input.NodeID = strings.TrimSpace(input.NodeID)
	input.VPNConnectionID = strings.TrimSpace(input.VPNConnectionID)
	input.ObservedStatus = strings.TrimSpace(input.ObservedStatus)

	switch "" {
	case input.ClusterID, input.NodeID, input.VPNConnectionID:
		return none, ErrInvalidPayload
	}
	if input.ObservedStatus == "" {
		input.ObservedStatus = VPNAssignmentStatusUnknown
	}
	if !isAllowedVPNAssignmentStatus(input.ObservedStatus) {
		return none, ErrInvalidPayload
	}

	if input.StatusPayload = defaultJSON(input.StatusPayload, `{}`); !json.Valid(input.StatusPayload) {
		return none, errors.New("status_payload must be valid json")
	}
	if input.ObservedAt.IsZero() {
		input.ObservedAt = s.now()
	}

	assignments, listErr := s.store.ListNodeVPNAssignments(ctx, input.ClusterID, input.NodeID)
	if listErr != nil {
		return none, listErr
	}

	// A node may only report on connections that are assigned to it.
	assigned := false
	for idx := 0; idx < len(assignments) && !assigned; idx++ {
		assigned = assignments[idx].VPNConnectionID == input.VPNConnectionID
	}
	if !assigned {
		return none, ErrVPNLeaseOwnerNotAllowed
	}

	reported, err := s.store.ReportNodeVPNAssignmentStatus(ctx, input)
	if err != nil {
		return none, err
	}

	// Best-effort audit: the error is intentionally ignored.
	event := ClusterAuditEvent{
		ClusterID:  &input.ClusterID,
		EventType:  "vpn_connection.assignment_status_reported",
		TargetType: "vpn_connection",
		TargetID:   &input.VPNConnectionID,
		Payload:    json.RawMessage(`{"node_agent_runtime_executed":false}`),
		CreatedAt:  s.now(),
	}
	_ = s.store.RecordAudit(ctx, event)
	return reported, nil
}
// GetVPNClientProfile builds the VPN client profile for a user.
//
// clusterID, organizationID, and userID are trimmed and must be non-empty,
// otherwise ErrInvalidPayload is returned. The variadic preferredEntryNodeID
// carries up to two optional hints: element 0 is the preferred entry node and
// element 1 the preferred exit node; further elements are ignored.
//
// After loading the profile from the store, any identifier the store left
// blank is filled from the arguments. The profile is then enriched with
// dataplane sessions, ensureVPNFabricRouteIntents is run for it (its error
// aborts the call), and fabric service channel leases are attached.
func (s *Service) GetVPNClientProfile(
	ctx context.Context,
	clusterID, organizationID, userID string,
	preferredEntryNodeID ...string,
) (VPNClientProfile, error) {
	clusterID = strings.TrimSpace(clusterID)
	organizationID = strings.TrimSpace(organizationID)
	userID = strings.TrimSpace(userID)
	switch "" {
	case clusterID, organizationID, userID:
		return VPNClientProfile{}, ErrInvalidPayload
	}

	var entryHint, exitHint string
	if hints := len(preferredEntryNodeID); hints > 0 {
		entryHint = strings.TrimSpace(preferredEntryNodeID[0])
		if hints > 1 {
			exitHint = strings.TrimSpace(preferredEntryNodeID[1])
		}
	}

	profile, err := s.store.GetVPNClientProfile(ctx, clusterID, organizationID, userID, entryHint, exitHint, s.now().UTC())
	if err != nil {
		return VPNClientProfile{}, err
	}

	// Backfill identifiers the store did not populate.
	if profile.ClusterID == "" {
		profile.ClusterID = clusterID
	}
	if profile.OrganizationID == "" {
		profile.OrganizationID = organizationID
	}
	if profile.UserID == "" {
		profile.UserID = userID
	}

	profile = attachVPNDataplaneSessions(profile, s.now().UTC())
	if intentErr := s.ensureVPNFabricRouteIntents(ctx, clusterID, profile); intentErr != nil {
		return VPNClientProfile{}, intentErr
	}
	return s.attachVPNFabricServiceChannelLeases(ctx, profile), nil
}
// attachVPNFabricServiceChannelLeases issues a fabric service channel lease
// for every connection whose fabric route is "planned" with both an entry and
// an exit node selected, and embeds the result in that connection's client
// config.
//
// The entry and exit pools come from the route when it lists any; otherwise
// they are assembled from the selected node plus the connection's own node
// lists. A failure to issue a lease does not abort the profile: the error is
// written into the client config and processing continues with the next
// connection. The elements of profile.Connections are updated in place.
func (s *Service) attachVPNFabricServiceChannelLeases(ctx context.Context, profile VPNClientProfile) VPNClientProfile {
	for idx := range profile.Connections {
		current := profile.Connections[idx]
		route := vpnFabricRouteFromClientConfig(current.ClientConfig)

		routable := route.Status == "planned" && route.SelectedEntryNodeID != "" && route.SelectedExitNodeID != ""
		if !routable {
			continue
		}

		entryNodes := dedupeStrings(append([]string{}, route.EntryPoolNodeIDs...))
		if len(entryNodes) == 0 {
			entryNodes = dedupeStrings(append([]string{route.SelectedEntryNodeID}, current.EntryNodeIDs...))
		}
		exitNodes := dedupeStrings(append([]string{}, route.ExitPoolNodeIDs...))
		if len(exitNodes) == 0 {
			exitNodes = dedupeStrings(append([]string{route.SelectedExitNodeID, current.ExitNodeID}, current.AllowedNodeIDs...))
		}

		request := IssueFabricServiceChannelLeaseInput{
			ClusterID:            profile.ClusterID,
			OrganizationID:       profile.OrganizationID,
			UserID:               profile.UserID,
			ResourceID:           current.ID,
			ServiceClass:         FabricServiceClassVPNPackets,
			EntryNodeIDs:         entryNodes,
			ExitNodeIDs:          exitNodes,
			PreferredEntryNodeID: route.SelectedEntryNodeID,
			PreferredExitNodeID:  route.SelectedExitNodeID,
			AllowedChannels:      []string{"vpn_packet", "fabric_control", FabricChannelBulk, FabricChannelControl},
			TTL:                  time.Minute,
		}
		lease, issueErr := s.IssueFabricServiceChannelLease(ctx, request)

		updated := current.ClientConfig
		if issueErr != nil {
			updated = attachVPNFabricServiceChannelError(updated, issueErr)
		} else {
			updated = attachVPNFabricServiceChannelLease(updated, lease)
		}
		profile.Connections[idx].ClientConfig = updated
	}
	return profile
}
// attachVPNFabricServiceChannelLease returns raw with the lease stored under
// "fabric_service_channel_lease" and its status under
// "fabric_service_channel_status". When raw does not decode to a JSON object,
// the result starts from an empty object. If the result cannot be encoded, raw
// is returned unchanged.
func attachVPNFabricServiceChannelLease(raw json.RawMessage, lease FabricServiceChannelLease) json.RawMessage {
	var document map[string]any
	if decodeErr := json.Unmarshal(raw, &document); decodeErr != nil || document == nil {
		document = make(map[string]any)
	}

	document["fabric_service_channel_lease"] = lease
	document["fabric_service_channel_status"] = lease.Status

	if encoded, encodeErr := json.Marshal(document); encodeErr == nil {
		return encoded
	}
	return raw
}
// attachVPNFabricServiceChannelError returns raw with
// "fabric_service_channel_status" set to "error" and
// "fabric_service_channel_error" set to err's message. When raw does not
// decode to a JSON object, the result starts from an empty object. If the
// result cannot be encoded, raw is returned unchanged. err must be non-nil.
func attachVPNFabricServiceChannelError(raw json.RawMessage, err error) json.RawMessage {
	var document map[string]any
	if decodeErr := json.Unmarshal(raw, &document); decodeErr != nil || document == nil {
		document = make(map[string]any)
	}

	document["fabric_service_channel_status"] = "error"
	document["fabric_service_channel_error"] = err.Error()

	if encoded, encodeErr := json.Marshal(document); encodeErr == nil {
		return encoded
	}
	return raw
}
// attachVPNDataplaneSessions replaces each connection's client config with the
// version produced by enrichVPNDataplaneSession for the given time. The
// elements of profile.Connections are updated in place.
func attachVPNDataplaneSessions(profile VPNClientProfile, now time.Time) VPNClientProfile {
	for idx := 0; idx < len(profile.Connections); idx++ {
		enriched := enrichVPNDataplaneSession(profile, profile.Connections[idx], now)
		profile.Connections[idx].ClientConfig = enriched
	}
	return profile
}
// enrichVPNDataplaneSession returns the connection's client config with a
// freshly generated "vpn_dataplane_session" object added.
//
// The session is valid for one minute from now. Its ID comes from
// uuidLikeRandom, falling back to a timestamp-derived ID when that returns an
// empty string. Status is "ready_for_entry_listener" when the fabric route is
// "planned" with both entry and exit nodes selected, and
// "waiting_for_entry_endpoint" otherwise. When the client config does not
// decode to a JSON object the result starts from an empty object; if the
// result cannot be encoded the original client config is returned.
//
// NOTE(review): the bearer token is the fixed prefix "rap_vpn_dps_" followed
// by the session ID, which is published in the same object. Confirm that the
// introspection endpoint does not treat the token as a secret.
func enrichVPNDataplaneSession(profile VPNClientProfile, connection VPNClientConnection, now time.Time) json.RawMessage {
	var document map[string]any
	if decodeErr := json.Unmarshal(connection.ClientConfig, &document); decodeErr != nil || document == nil {
		document = make(map[string]any)
	}

	route := vpnFabricRouteFromClientConfig(connection.ClientConfig)
	expiresAt := now.Add(time.Minute)

	sessionID := uuidLikeRandom()
	if sessionID == "" {
		// Random source unavailable: fall back to a time-based identifier.
		sessionID = "vpn-session-" + now.UTC().Format("20060102T150405.000000000Z")
	}

	// Entry candidates are built first because the transport candidates share
	// (and may annotate) the same maps.
	entryCandidates := vpnDataplaneEntryCandidates(route, connection, document)
	transportCandidates := vpnDataplaneTransportCandidates(route, entryCandidates)

	routeReady := route.Status == "planned" && route.SelectedEntryNodeID != "" && route.SelectedExitNodeID != ""
	status := "waiting_for_entry_endpoint"
	if routeReady {
		status = "ready_for_entry_listener"
	}

	packetContract := map[string]any{
		"tunnel_type":                   "universal_ip_packet",
		"application_protocol_agnostic": true,
		"all_ip_traffic":                true,
		"protocol_specific_routing":     false,
	}
	auth := map[string]any{
		"type":               "control_plane_issued_bearer",
		"token":              "rap_vpn_dps_" + sessionID,
		"token_ttl_seconds":  int(expiresAt.Sub(now).Seconds()),
		"node_validation":    "entry_node_calls_control_plane_introspection",
		"introspection_path": "/api/v1/clusters/{cluster_id}/vpn/dataplane-sessions/{session_id}/introspect",
	}
	document["vpn_dataplane_session"] = map[string]any{
		"schema_version":       "rap.vpn_dataplane_session.v1",
		"session_id":           sessionID,
		"status":               status,
		"issued_at":            now,
		"expires_at":           expiresAt,
		"cluster_id":           profile.ClusterID,
		"organization_id":      profile.OrganizationID,
		"user_id":              profile.UserID,
		"vpn_connection_id":    connection.ID,
		"entry_node_id":        route.SelectedEntryNodeID,
		"exit_node_id":         route.SelectedExitNodeID,
		"preferred_transport":  "fabric_packet_quic_v1",
		"fallback_transport":   "backend_http_packet_relay",
		"packet_contract":      packetContract,
		"auth":                 auth,
		"entry_candidates":     entryCandidates,
		"transport_candidates": transportCandidates,
	}

	if encoded, encodeErr := json.Marshal(document); encodeErr == nil {
		return encoded
	}
	return connection.ClientConfig
}
// vpnDataplaneEntryCandidates builds the entry candidate list for a dataplane
// session.
//
// Candidates listed under the client config's "vpn_entry_endpoint_candidates"
// come first: each is shallow-copied and annotated with a status, an endpoint
// source, and the supported transports. Entries without a string "node_id" are
// dropped. The route's selected entry node and the connection's entry nodes
// are then appended as "pending" placeholders unless a listed candidate
// already exists for that node.
func vpnDataplaneEntryCandidates(route vpnClientFabricRoute, connection VPNClientConnection, cfg map[string]any) []map[string]any {
	reported := vpnConcreteEntryCandidatesFromClientConfig(cfg)
	knownIDs := dedupeStrings(append([]string{route.SelectedEntryNodeID}, connection.EntryNodeIDs...))

	result := make([]map[string]any, 0, len(reported)+len(knownIDs))
	covered := make(map[string]struct{})

	for _, source := range reported {
		nodeID, _ := source["node_id"].(string)
		if nodeID == "" {
			continue
		}
		covered[nodeID] = struct{}{}

		annotated := make(map[string]any, len(source)+4)
		for key, value := range source {
			annotated[key] = value
		}

		isSelected := nodeID == route.SelectedEntryNodeID
		reachability, _ := annotated["reachability"].(string)
		status := "endpoint_reported"
		switch {
		case isSelected && strings.EqualFold(reachability, "public"):
			status = "selected_endpoint_public"
		case isSelected:
			status = "selected_endpoint_reported"
		}

		annotated["status"] = status
		annotated["endpoint_source"] = "node_latest_heartbeat.mesh_endpoint_report"
		annotated["transports"] = []string{"entry_direct_http_v1", "fabric_packet_quic_v1", "fabric_packet_tcp_v1"}
		result = append(result, annotated)
	}

	for _, nodeID := range knownIDs {
		if _, done := covered[nodeID]; done || nodeID == "" {
			continue
		}
		status := "endpoint_pending"
		if nodeID == route.SelectedEntryNodeID {
			status = "selected_endpoint_pending"
		}
		result = append(result, map[string]any{
			"node_id":         nodeID,
			"status":          status,
			"transports":      []string{"fabric_packet_quic_v1", "fabric_packet_tcp_v1"},
			"endpoint_source": "node_mesh_advertisement_pending",
		})
	}
	return result
}
// vpnConcreteEntryCandidatesFromClientConfig extracts the
// "vpn_entry_endpoint_candidates" value from cfg as a list of objects.
//
// The value is round-tripped through JSON, so the returned maps are
// independent copies. It returns nil when the key is absent or when the value
// is not a JSON array of objects.
func vpnConcreteEntryCandidatesFromClientConfig(cfg map[string]any) []map[string]any {
	value, present := cfg["vpn_entry_endpoint_candidates"]
	if !present {
		return nil
	}

	encoded, encodeErr := json.Marshal(value)
	if encodeErr != nil {
		return nil
	}

	var candidates []map[string]any
	if json.Unmarshal(encoded, &candidates) != nil {
		return nil
	}
	return candidates
}
// vpnDataplaneTransportCandidates lists the transports offered for a dataplane
// session, in order: the fabric QUIC transport, the direct HTTP entry
// transport when vpnDirectHTTPEntryTransportCandidate yields one, and the
// backend HTTP packet relay as the final entry.
func vpnDataplaneTransportCandidates(route vpnClientFabricRoute, entryCandidates []map[string]any) []map[string]any {
	quic := map[string]any{
		"type":                  "fabric_packet_quic_v1",
		"status":                "contract_ready_listener_pending",
		"entry_node_id":         route.SelectedEntryNodeID,
		"exit_node_id":          route.SelectedExitNodeID,
		"entry_candidates":      entryCandidates,
		"application_protocols": []string{"ip"},
	}
	relay := map[string]any{
		"type":        "backend_http_packet_relay",
		"status":      "active_fallback",
		"description": "current safe dataplane until entry listener is available",
	}

	transports := []map[string]any{quic}
	direct := vpnDirectHTTPEntryTransportCandidate(route, entryCandidates)
	if direct != nil {
		transports = append(transports, direct)
	}
	return append(transports, relay)
}
// vpnDirectHTTPEntryTransportCandidate describes the "entry_direct_http_v1"
// transport, or returns nil when no usable entry candidate has an HTTP API
// base URL.
//
// Only candidates for the selected entry node are considered (all candidates
// when no entry node is selected). A candidate without "api_base_url" whose
// "address" starts with http:// or https:// gets one derived from the address;
// the derived value is written back into the candidate map, so the caller's
// entryCandidates are modified.
//
// Status and safe_client_switch:
//   - entry and exit are the same selected node: the local gateway shortcut
//     status is used, and the switch is safe only when a public candidate and
//     a candidate flagged "local_gateway_shortcut" were both seen;
//   - otherwise "available" with a safe switch when a public candidate exists,
//     else "http_endpoint_reported_unverified" with no safe switch.
func vpnDirectHTTPEntryTransportCandidate(route vpnClientFabricRoute, entryCandidates []map[string]any) map[string]any {
	var (
		usable      []map[string]any
		sawPublic   bool
		sawHTTP     bool
		sawShortcut bool
	)
	selectedID := route.SelectedEntryNodeID

	for _, entry := range entryCandidates {
		if nodeID, _ := entry["node_id"].(string); selectedID != "" && nodeID != selectedID {
			continue
		}

		baseURL, _ := entry["api_base_url"].(string)
		if baseURL == "" {
			address, _ := entry["address"].(string)
			if strings.HasPrefix(address, "http://") || strings.HasPrefix(address, "https://") {
				baseURL = strings.TrimRight(address, "/") + "/api/v1"
				entry["api_base_url"] = baseURL
			}
		}
		if baseURL == "" {
			continue
		}

		sawHTTP = true
		if reachability, _ := entry["reachability"].(string); strings.EqualFold(reachability, "public") {
			sawPublic = true
		}
		if shortcut, _ := entry["local_gateway_shortcut"].(bool); shortcut {
			sawShortcut = true
		}
		usable = append(usable, entry)
	}
	if len(usable) == 0 {
		return nil
	}

	sameNode := selectedID != "" && selectedID == route.SelectedExitNodeID
	safeSwitch := sawPublic
	var status string
	switch {
	case sameNode && sawPublic && sawShortcut:
		status = "available_local_gateway_shortcut"
	case sameNode:
		status = "available_local_gateway_shortcut_pending"
		safeSwitch = false
	case sawPublic:
		status = "available"
	case sawHTTP:
		status = "http_endpoint_reported_unverified"
	default:
		// sawHTTP is set for every usable candidate, so this value is only a
		// defensive fallback.
		status = "reported_private_or_unverified"
	}

	return map[string]any{
		"type":                  "entry_direct_http_v1",
		"status":                status,
		"entry_node_id":         route.SelectedEntryNodeID,
		"exit_node_id":          route.SelectedExitNodeID,
		"entry_candidates":      usable,
		"application_protocols": []string{"ip"},
		"safe_client_switch":    safeSwitch,
	}
}
// uuidLikeRandom returns 16 random bytes from crypto/rand formatted as a
// lowercase 8-4-4-4-12 hex string, with the version nibble forced to 4 and the
// variant bits forced to binary 10. It returns "" when the random source
// fails.
func uuidLikeRandom() string {
	var raw [16]byte
	if _, err := rand.Read(raw[:]); err != nil {
		return ""
	}
	raw[6] = (raw[6] & 0x0f) | 0x40 // version 4
	raw[8] = (raw[8] & 0x3f) | 0x80 // variant 10xxxxxx

	// Encode each group straight into its slot of the 36-byte layout.
	var text [36]byte
	hex.Encode(text[0:8], raw[0:4])
	text[8] = '-'
	hex.Encode(text[9:13], raw[4:6])
	text[13] = '-'
	hex.Encode(text[14:18], raw[6:8])
	text[18] = '-'
	hex.Encode(text[19:23], raw[8:10])
	text[23] = '-'
	hex.Encode(text[24:36], raw[10:16])
	return string(text[:])
}
// ensureVPNFabricRouteIntents makes sure a "vpn_packets" route intent exists
// in both directions between the selected entry and exit node of every
// connection in the profile.
//
// Existing intents are loaded once and indexed by "source->destination" for
// those accepted by activeVPNPacketRouteIntent. Connections are skipped unless
// their fabric route is "planned" with distinct, non-empty entry and exit
// nodes. Missing directions are created with priority 10 and a policy that
// expires 30 days after s.now(). The first store error aborts the call.
func (s *Service) ensureVPNFabricRouteIntents(ctx context.Context, clusterID string, profile VPNClientProfile) error {
	intents, err := s.store.ListRouteIntents(ctx, clusterID)
	if err != nil {
		return err
	}

	routeKey := func(from, to string) string { return from + "->" + to }

	covered := make(map[string]bool)
	for _, intent := range intents {
		if from, to, active := activeVPNPacketRouteIntent(intent, s.now()); active {
			covered[routeKey(from, to)] = true
		}
	}

	for _, connection := range profile.Connections {
		route := vpnFabricRouteFromClientConfig(connection.ClientConfig)
		entryID, exitID := route.SelectedEntryNodeID, route.SelectedExitNodeID
		if route.Status != "planned" || entryID == "" || exitID == "" || entryID == exitID {
			continue
		}

		// Packets flow both ways, so each direction needs its own intent.
		directions := [2][2]string{{entryID, exitID}, {exitID, entryID}}
		for _, direction := range directions {
			from, to := direction[0], direction[1]
			key := routeKey(from, to)
			if covered[key] {
				continue
			}

			request := CreateRouteIntentInput{
				ClusterID:           clusterID,
				SourceSelector:      mustJSONRaw(map[string]any{"node_id": from}),
				DestinationSelector: mustJSONRaw(map[string]any{"node_id": to}),
				ServiceClass:        "vpn_packets",
				Priority:            10,
				Policy:              mustJSONRaw(vpnFabricRouteIntentPolicy(from, to, s.now().UTC().Add(30*24*time.Hour))),
			}
			if _, createErr := s.store.CreateRouteIntent(ctx, request); createErr != nil {
				return createErr
			}
			covered[key] = true
		}
	}
	return nil
}
// vpnClientFabricRoute is the part of a client config's "vpn_fabric_route"
// object that this file reads when building dataplane sessions, service
// channel leases, and route intents.
type vpnClientFabricRoute struct {
	// Status is the route state; callers in this file act on the route only
	// when it equals "planned".
	Status string `json:"status"`
	// SelectedEntryNodeID and SelectedExitNodeID identify the chosen entry and
	// exit nodes; either may be empty.
	SelectedEntryNodeID string `json:"selected_entry_node_id"`
	SelectedExitNodeID  string `json:"selected_exit_node_id"`
	// EntryPoolNodeIDs and ExitPoolNodeIDs, when non-empty, are used as the
	// entry and exit node pools for service channel leases.
	EntryPoolNodeIDs []string `json:"entry_pool_node_ids"`
	ExitPoolNodeIDs  []string `json:"exit_pool_node_ids"`
}
// vpnFabricRouteFromClientConfig decodes the "vpn_fabric_route" object from a
// client config. Empty input yields the zero route. Decode errors are ignored
// on purpose: the caller receives whatever fields were decoded, which is the
// zero route when nothing was.
func vpnFabricRouteFromClientConfig(raw json.RawMessage) vpnClientFabricRoute {
	var envelope struct {
		Route vpnClientFabricRoute `json:"vpn_fabric_route"`
	}
	if len(raw) > 0 {
		// Lenient by design: a malformed config is treated as "no route".
		_ = json.Unmarshal(raw, &envelope)
	}
	return envelope.Route
}
// activeVPNPacketRouteIntent reports whether intent is a usable VPN packet
// route at time now and, if so, which node pair it connects.
//
// An intent qualifies when its status is "active", its service class is
// "vpn_packets", its policy decodes and allows the "vpn_packet" channel, the
// policy has not expired (an absent expiry never expires), and both selectors
// resolve to a node ID via firstNodeID. Otherwise the result is ("", "", false).
func activeVPNPacketRouteIntent(intent MeshRouteIntent, now time.Time) (string, string, bool) {
	if !(intent.Status == "active" && intent.ServiceClass == "vpn_packets") {
		return "", "", false
	}

	var policy syntheticRoutePolicy
	if json.Unmarshal(intent.Policy, &policy) != nil {
		return "", "", false
	}
	if !containsString(policy.AllowedChannels, "vpn_packet") {
		return "", "", false
	}
	if expiry := policy.ExpiresAt; expiry != nil && !expiry.After(now.UTC()) {
		return "", "", false
	}

	// Selector decode errors are tolerated: an undecodable selector is expected
	// to yield no node ID and is rejected by the emptiness check below.
	var from, to nodeSelector
	_ = json.Unmarshal(intent.SourceSelector, &from)
	_ = json.Unmarshal(intent.DestinationSelector, &to)

	fromID, toID := firstNodeID(from), firstNodeID(to)
	if fromID == "" || toID == "" {
		return "", "", false
	}
	return fromID, toID, true
}
// vpnFabricRouteIntentPolicy builds the policy document stored on a VPN fabric
// route intent between sourceNodeID and destinationNodeID.
//
// The route, policy and peer-directory versions all share one identifier
// derived from the UTC expiry ("vpn-fabric-<yyyymmddThhmmssZ>"), and
// "expires_at" carries the same instant in RFC 3339 form.
func vpnFabricRouteIntentPolicy(sourceNodeID, destinationNodeID string, expiresAt time.Time) map[string]any {
	expiry := expiresAt.UTC()
	stamp := "vpn-fabric-" + expiry.Format("20060102T150405Z")

	policy := make(map[string]any, 14)
	policy["synthetic_enabled"] = true
	policy["hops"] = []string{sourceNodeID, destinationNodeID}
	policy["allowed_channels"] = []string{"vpn_packet", "fabric_control"}
	policy["max_ttl"] = 8
	policy["max_hops"] = 8
	policy["expires_at"] = expiry.Format(time.RFC3339)
	policy["route_version"] = stamp
	policy["policy_version"] = stamp
	policy["peer_directory_version"] = stamp
	policy["backend_relay_fallback"] = true
	policy["data_plane_preference"] = "fabric_mesh"
	policy["route_owner"] = "vpn_client_profile"
	policy["route_refresh_required"] = true
	policy["route_refresh_threshold"] = "24h"
	return policy
}
// mustJSONRaw encodes value as JSON. Despite the "must" prefix it never panics:
// a value that cannot be marshalled is replaced by the empty object `{}`.
func mustJSONRaw(value any) json.RawMessage {
	if encoded, err := json.Marshal(value); err == nil {
		return encoded
	}
	return json.RawMessage(`{}`)
}
// ListAuditEvents returns the cluster audit events selected by input.
//
// The caller must pass ensurePlatformAdmin. Filter strings are trimmed before
// the store is queried. When input.Correlation is "fabric_diagnostics" each
// event is additionally annotated with fabric diagnostics correlation hints.
func (s *Service) ListAuditEvents(ctx context.Context, actorUserID string, input ListAuditEventsInput) ([]ClusterAuditEvent, error) {
	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return nil, err
	}

	filter := input
	filter.ClusterID = strings.TrimSpace(filter.ClusterID)
	filter.EventTypes = trimStringSlice(filter.EventTypes)
	filter.TargetTypes = trimStringSlice(filter.TargetTypes)
	filter.Correlation = strings.TrimSpace(filter.Correlation)

	events, err := s.store.ListAuditEvents(ctx, filter)
	if err != nil {
		return nil, err
	}
	if filter.Correlation != "fabric_diagnostics" {
		return events, nil
	}
	return s.withFabricDiagnosticsAuditCorrelation(ctx, actorUserID, filter.ClusterID, events), nil
}
// ListFabricServiceChannelRebuildInvestigationBreadcrumbs lists the
// "investigation opened" audit events of a cluster and classifies each one as
// current, stale or expired by age.
//
// The caller must be a platform admin and input.ClusterID must be non-empty
// (ErrInvalidPayload otherwise); an unknown cluster yields ErrInvalidCluster.
// A limit outside 1..100 falls back to 20. Non-positive windows fall back to
// the cluster's breadcrumb window policy, and the history window is never
// shorter than the current window.
func (s *Service) ListFabricServiceChannelRebuildInvestigationBreadcrumbs(ctx context.Context, actorUserID string, input ListFabricServiceChannelRebuildInvestigationBreadcrumbsInput) (FabricServiceChannelRebuildInvestigationBreadcrumbs, error) {
	var none FabricServiceChannelRebuildInvestigationBreadcrumbs

	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return none, err
	}
	clusterID := strings.TrimSpace(input.ClusterID)
	if clusterID == "" {
		return none, ErrInvalidPayload
	}
	limit := input.Limit
	if limit <= 0 || limit > 100 {
		limit = 20
	}

	cluster, err := s.store.GetCluster(ctx, clusterID)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return none, ErrInvalidCluster
	case err != nil:
		return none, err
	}

	// Resolve the effective freshness windows.
	defaults := fabricServiceChannelBreadcrumbWindowPolicyFromCluster(cluster)
	currentWindow, historyWindow := input.CurrentWindowSeconds, input.HistoryWindowSeconds
	if currentWindow <= 0 {
		currentWindow = defaults.CurrentWindowSeconds
	}
	if historyWindow <= 0 {
		historyWindow = defaults.HistoryWindowSeconds
	}
	if historyWindow < currentWindow {
		historyWindow = currentWindow
	}

	query := ListAuditEventsInput{
		ClusterID: clusterID,
		EventTypes: []string{
			"fabric.service_channel_rebuild_feedback_breakdown.investigation_opened",
			"fabric.service_channel_rebuild_incident.investigation_opened",
		},
		Correlation: "fabric_diagnostics",
		Limit:       limit,
	}
	events, err := s.ListAuditEvents(ctx, actorUserID, query)
	if err != nil {
		return none, err
	}

	events = withFabricDiagnosticsBreadcrumbFreshness(events, s.now(), currentWindow, historyWindow)
	summary := summarizeClusterAuditEvents(events)
	byStatus := summary.CountsByBreadcrumbStatus

	return FabricServiceChannelRebuildInvestigationBreadcrumbs{
		ClusterID:            clusterID,
		Events:               events,
		Summary:              summary,
		CurrentWindowSeconds: currentWindow,
		HistoryWindowSeconds: historyWindow,
		CurrentCount:         byStatus["current"],
		StaleCount:           byStatus["stale"],
		ExpiredCount:         byStatus["expired"],
	}, nil
}
// withFabricDiagnosticsBreadcrumbFreshness stamps every event's correlation
// hints with its age relative to now and a freshness status:
// "current" up to currentWindowSeconds, "stale" up to historyWindowSeconds and
// "expired" beyond that.
//
// Events are updated in place and the same slice is returned. Missing hints are
// created with scope "fabric_diagnostics". A zero now falls back to the wall
// clock; a zero or future CreatedAt counts as age 0.
func withFabricDiagnosticsBreadcrumbFreshness(events []ClusterAuditEvent, now time.Time, currentWindowSeconds, historyWindowSeconds int64) []ClusterAuditEvent {
	if len(events) == 0 {
		return events
	}
	reference := now
	if reference.IsZero() {
		reference = time.Now().UTC()
	}

	for i := range events {
		event := &events[i]
		if event.CorrelationHints == nil {
			event.CorrelationHints = &ClusterAuditCorrelationHints{Scope: "fabric_diagnostics"}
		}

		var age int64
		if created := event.CreatedAt; !created.IsZero() {
			// Clamp negative ages (event timestamps ahead of the reference clock).
			if elapsed := int64(reference.Sub(created).Seconds()); elapsed > 0 {
				age = elapsed
			}
		}

		hints := event.CorrelationHints
		switch {
		case age > historyWindowSeconds:
			hints.BreadcrumbStatus = "expired"
		case age > currentWindowSeconds:
			hints.BreadcrumbStatus = "stale"
		default:
			hints.BreadcrumbStatus = "current"
		}
		hints.BreadcrumbAgeSeconds = age
		hints.BreadcrumbCurrentWindow = currentWindowSeconds
		hints.BreadcrumbHistoryWindow = historyWindowSeconds
	}
	return events
}
// withFabricDiagnosticsAuditCorrelation replaces each event's correlation hints
// with hints derived from the cluster's current rebuild health summary and
// rebuild incidents.
//
// A matching feedback breakdown takes precedence over a matching incident;
// events matching neither stay "not_visible". A failure to load either data
// source is not reported: that source is simply skipped. Events are updated in
// place and the same slice is returned.
func (s *Service) withFabricDiagnosticsAuditCorrelation(ctx context.Context, actorUserID, clusterID string, events []ClusterAuditEvent) []ClusterAuditEvent {
	if len(events) == 0 {
		return events
	}

	healthQuery := GetFabricServiceChannelRouteRebuildHealthSummaryInput{ClusterID: clusterID, Limit: 5}
	health, healthErr := s.GetFabricServiceChannelRouteRebuildHealthSummary(ctx, actorUserID, healthQuery)

	incidentQuery := ListFabricServiceChannelRouteRebuildIncidentsInput{ClusterID: clusterID, Limit: 20}
	incidents, incidentsErr := s.ListFabricServiceChannelRouteRebuildIncidents(ctx, actorUserID, incidentQuery)

	for i := range events {
		hints := &ClusterAuditCorrelationHints{
			Scope:                   "fabric_diagnostics",
			CurrentDiagnosticStatus: "not_visible",
		}
		if healthErr == nil {
			if match := fabricAuditMatchingFeedbackBreakdown(events[i], health.FeedbackBreakdowns); match != nil {
				hints.CurrentDiagnosticStatus = "breakdown_active"
				hints.FeedbackBreakdown = match
				hints.RecommendedAction = "open_filtered_rebuild_ledger"
			}
		}
		// Incidents are only consulted when no breakdown matched.
		if hints.FeedbackBreakdown == nil && incidentsErr == nil {
			if match := fabricAuditMatchingRebuildIncident(events[i], incidents); match != nil {
				hints.CurrentDiagnosticStatus = "incident_visible"
				hints.RebuildIncident = match
				hints.RecommendedAction = "open_deep_rebuild_ledger"
			}
		}
		events[i].CorrelationHints = hints
	}
	return events
}
// fabricAuditMatchingFeedbackBreakdown returns a copy of the first breakdown
// that agrees with every feedback criterion present in the event payload, or
// nil when none does.
//
// The payload must carry at least one of feedback_source, feedback_channel_id
// or feedback_violation_status; reporter_node_id and route_id only narrow the
// match further. Criteria absent from the payload are ignored.
func fabricAuditMatchingFeedbackBreakdown(event ClusterAuditEvent, breakdowns []FabricServiceChannelRouteRebuildFeedbackHealthBreakdown) *FabricServiceChannelRouteRebuildFeedbackHealthBreakdown {
	payload := jsonObject(event.Payload)
	var (
		source    = jsonString(payload, "feedback_source")
		channelID = jsonString(payload, "feedback_channel_id")
		violation = jsonString(payload, "feedback_violation_status")
		reporter  = jsonString(payload, "reporter_node_id")
		route     = jsonString(payload, "route_id")
	)
	if !(source != "" || channelID != "" || violation != "") {
		return nil
	}

	for i := range breakdowns {
		candidate := breakdowns[i]
		switch {
		case source != "" && candidate.FeedbackSource != source:
		case channelID != "" && candidate.FeedbackChannelID != channelID:
		case violation != "" && candidate.FeedbackViolationStatus != violation:
		case reporter != "" && !containsString(candidate.AffectedReporterNodeIDs, reporter):
		case route != "" && !containsString(candidate.AffectedRouteIDs, route):
		default:
			// No criterion rejected the candidate; hand back the copy.
			return &candidate
		}
	}
	return nil
}
// fabricAuditMatchingRebuildIncident returns a copy of the first incident that
// agrees with every criterion present in the event (reporter node, route,
// service class, generation, guard status), or nil when none does.
//
// The route ID is taken from the payload; when absent and the event targets a
// "fabric_service_channel_route_rebuild_incident", the target ID is used
// instead. An event that provides no criterion at all never matches.
func fabricAuditMatchingRebuildIncident(event ClusterAuditEvent, incidents []FabricServiceChannelRouteRebuildIncident) *FabricServiceChannelRouteRebuildIncident {
	payload := jsonObject(event.Payload)
	reporter := jsonString(payload, "reporter_node_id")
	route := jsonString(payload, "route_id")
	if route == "" && event.TargetID != nil && event.TargetType == "fabric_service_channel_route_rebuild_incident" {
		route = *event.TargetID
	}
	class := jsonString(payload, "service_class")
	generation := jsonString(payload, "generation")
	guard := jsonString(payload, "guard_status")

	// With nothing to compare against, no incident can be considered a match.
	if reporter == "" && route == "" && class == "" && generation == "" && guard == "" {
		return nil
	}

	for i := range incidents {
		candidate := incidents[i]
		switch {
		case reporter != "" && candidate.ReporterNodeID != reporter:
		case route != "" && candidate.RouteID != route:
		case class != "" && candidate.ServiceClass != class:
		case generation != "" && candidate.Generation != generation:
		case guard != "" && candidate.GuardStatus != guard:
		default:
			return &candidate
		}
	}
	return nil
}
// summarizeClusterAuditEvents aggregates events into per-dimension counters
// (event type, target type, feedback source, feedback violation status,
// breadcrumb status, current diagnostic status) plus the latest creation time.
//
// Events without correlation hints contribute to neither the diagnostic-status
// counters nor the correlated / not-visible totals. Events whose hints carry no
// diagnostic status are counted as "unknown" and, like every status other than
// "not_visible", add to CorrelatedCount.
func summarizeClusterAuditEvents(events []ClusterAuditEvent) ClusterAuditSummary {
	summary := ClusterAuditSummary{
		TotalCount:                      len(events),
		CountsByEventType:               map[string]int{},
		CountsByTargetType:              map[string]int{},
		CountsByCurrentDiagnosticStatus: map[string]int{},
		CountsByFeedbackSource:          map[string]int{},
		CountsByFeedbackViolationStatus: map[string]int{},
		CountsByBreadcrumbStatus:        map[string]int{},
	}
	// tally increments counts[key], skipping empty keys.
	tally := func(counts map[string]int, key string) {
		if key != "" {
			counts[key]++
		}
	}

	for i := range events {
		event := &events[i]
		tally(summary.CountsByEventType, event.EventType)
		tally(summary.CountsByTargetType, event.TargetType)
		if created := event.CreatedAt; created.After(summary.LatestAt) {
			summary.LatestAt = created.UTC()
		}

		payload := jsonObject(event.Payload)
		tally(summary.CountsByFeedbackSource, jsonString(payload, "feedback_source"))
		tally(summary.CountsByFeedbackViolationStatus, jsonString(payload, "feedback_violation_status"))

		hints := event.CorrelationHints
		if hints == nil {
			continue
		}
		tally(summary.CountsByBreadcrumbStatus, strings.TrimSpace(hints.BreadcrumbStatus))

		diagnostic := firstNonEmptyString(hints.CurrentDiagnosticStatus, "unknown")
		summary.CountsByCurrentDiagnosticStatus[diagnostic]++
		if diagnostic != "not_visible" {
			summary.CorrelatedCount++
		} else {
			summary.NotVisibleCount++
		}
	}
	return summary
}
// ensurePlatformAdmin verifies that userID holds a platform admin role.
// It returns ErrAccessDenied for a blank user ID or a role rejected by
// isPlatformAdminRole, and passes store errors through unchanged.
func (s *Service) ensurePlatformAdmin(ctx context.Context, userID string) error {
	id := strings.TrimSpace(userID)
	if id == "" {
		return ErrAccessDenied
	}
	role, err := s.store.GetPlatformRole(ctx, id)
	switch {
	case err != nil:
		return err
	case isPlatformAdminRole(role):
		return nil
	default:
		return ErrAccessDenied
	}
}
// ensurePlatformRecoveryAdmin verifies that userID holds exactly the
// PlatformRoleRecoveryAdmin role. It returns ErrAccessDenied for a blank user
// ID or any other role, and passes store errors through unchanged.
func (s *Service) ensurePlatformRecoveryAdmin(ctx context.Context, userID string) error {
	id := strings.TrimSpace(userID)
	if id == "" {
		return ErrAccessDenied
	}
	role, err := s.store.GetPlatformRole(ctx, id)
	switch {
	case err != nil:
		return err
	case role == PlatformRoleRecoveryAdmin:
		return nil
	default:
		return ErrAccessDenied
	}
}
// ensureClusterMutable decides whether actorUserID may mutate policy on
// clusterID.
//
// Recovery admins are always allowed. For everyone else the cluster must either
// have no authority state row at all or be "authoritative" in "normal" mutation
// mode; otherwise ErrClusterReadOnly is returned. Store errors other than
// pgx.ErrNoRows are passed through.
func (s *Service) ensureClusterMutable(ctx context.Context, actorUserID, clusterID string) error {
	role, err := s.store.GetPlatformRole(ctx, strings.TrimSpace(actorUserID))
	if err != nil {
		return err
	}
	if role == PlatformRoleRecoveryAdmin {
		return nil
	}

	state, err := s.store.GetClusterAuthorityState(ctx, clusterID)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		// No recorded authority state: the cluster is treated as mutable.
		return nil
	case err != nil:
		return err
	}
	if state.AuthorityState == "authoritative" && state.MutationMode == "normal" {
		return nil
	}
	return ErrClusterReadOnly
}
// ensureVPNLeaseOwnerEligible checks that ownerNodeID may hold the lease of
// vpnConnectionID in clusterID.
//
// Errors, in order of precedence: ErrInvalidVPNConnection when the store finds
// no row, the store error itself, ErrVPNLeaseOwnerNotAllowed when the node is
// not an active registered member or is excluded by policy, and
// ErrVPNLeaseOwnerRoleRequired when it lacks an authorized role.
func (s *Service) ensureVPNLeaseOwnerEligible(ctx context.Context, clusterID, vpnConnectionID, ownerNodeID string) error {
	eligibility, err := s.store.CheckVPNLeaseOwnerEligibility(ctx, clusterID, vpnConnectionID, ownerNodeID)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		return ErrInvalidVPNConnection
	case err != nil:
		return err
	}

	activeMember := eligibility.MembershipStatus == "active" && eligibility.NodeRegistrationStatus == NodeRegistrationActive
	switch {
	case !activeMember, !eligibility.AllowedByPolicy:
		return ErrVPNLeaseOwnerNotAllowed
	case !eligibility.HasAuthorizedRole:
		return ErrVPNLeaseOwnerRoleRequired
	default:
		return nil
	}
}
// defaultJSON returns raw unless it is empty, in which case fallback is
// returned as a raw JSON message. Neither value is validated.
func defaultJSON(raw json.RawMessage, fallback string) json.RawMessage {
	if len(raw) > 0 {
		return raw
	}
	return json.RawMessage(fallback)
}
// isAllowedVPNDesiredState reports whether state is one of the two supported
// desired states for a VPN connection (enabled or disabled).
func isAllowedVPNDesiredState(state string) bool {
	switch state {
	case VPNConnectionDesiredEnabled, VPNConnectionDesiredDisabled:
		return true
	default:
		return false
	}
}
// isAllowedVPNRouteType reports whether routeType is a supported VPN route
// type: "cidr", "dns_suffix", "service" or "resource" (exact match).
func isAllowedVPNRouteType(routeType string) bool {
	for _, allowed := range [...]string{"cidr", "dns_suffix", "service", "resource"} {
		if routeType == allowed {
			return true
		}
	}
	return false
}
// isAllowedVPNRouteAction reports whether action is "allow" or "deny".
func isAllowedVPNRouteAction(action string) bool {
	switch action {
	case "allow", "deny":
		return true
	default:
		return false
	}
}
// isAllowedVPNPolicyStatus reports whether status is "active" or "disabled".
func isAllowedVPNPolicyStatus(status string) bool {
	switch status {
	case "active", "disabled":
		return true
	default:
		return false
	}
}
// isFabricEndpointStatus reports whether status is a recognised fabric
// endpoint status: "active", "disabled" or "maintenance".
func isFabricEndpointStatus(status string) bool {
	return status == "active" || status == "disabled" || status == "maintenance"
}
// isFabricEntryPointType reports whether endpointType is a recognised fabric
// entry point type: "client_access", "admin", "api" or "other".
func isFabricEntryPointType(endpointType string) bool {
	for _, known := range [...]string{"client_access", "admin", "api", "other"} {
		if endpointType == known {
			return true
		}
	}
	return false
}
// isAllowedVPNNodePreference reports whether preference is a supported VPN
// node preference: "candidate", "standby" or "preferred".
func isAllowedVPNNodePreference(preference string) bool {
	return preference == "candidate" || preference == "standby" || preference == "preferred"
}
// isAllowedVPNAssignmentStatus reports whether status equals one of the
// VPNAssignmentStatus* constants (not started, assigned, lease required,
// blocked or unknown).
func isAllowedVPNAssignmentStatus(status string) bool {
	return status == VPNAssignmentStatusNotStarted ||
		status == VPNAssignmentStatusAssigned ||
		status == VPNAssignmentStatusLeaseRequired ||
		status == VPNAssignmentStatusBlocked ||
		status == VPNAssignmentStatusUnknown
}
// syntheticRoutePolicy is the decoded form of a mesh route intent's policy
// JSON. activeVPNPacketRouteIntent decodes intent.Policy into it and reads
// AllowedChannels and ExpiresAt; vpnFabricRouteIntentPolicy emits a document
// using several of the same keys (synthetic_enabled, hops, allowed_channels,
// max_ttl, max_hops, expires_at and the three *_version fields).
type syntheticRoutePolicy struct {
SyntheticEnabled bool `json:"synthetic_enabled"`
PeerEndpoints map[string]string `json:"peer_endpoints"`
PeerEndpointCandidates map[string][]PeerEndpointCandidate `json:"peer_endpoint_candidates"`
RecoverySeeds []PeerRecoverySeed `json:"recovery_seeds"`
RendezvousLeases []PeerRendezvousLease `json:"rendezvous_leases"`
Hops []string `json:"hops"`
// AllowedChannels lists the channel names the route may carry
// (for example "vpn_packet").
AllowedChannels []string `json:"allowed_channels"`
MaxTTL int `json:"max_ttl"`
MaxHops int `json:"max_hops"`
// ExpiresAt is optional; a nil value is treated as "never expires" by
// activeVPNPacketRouteIntent.
ExpiresAt *time.Time `json:"expires_at"`
RouteVersion string `json:"route_version"`
PolicyVersion string `json:"policy_version"`
PeerDirectoryVersion string `json:"peer_directory_version"`
Metadata map[string]any `json:"metadata"`
}
// dockerInstallProfileScope is the JSON scope document from which install
// profiles are derived. Despite its name it is shared by
// dockerInstallProfileFromScope, windowsInstallProfileFromScope and
// linuxInstallProfileFromScope; each of them reads only the fields relevant to
// its platform and fills in its own defaults for absent values.
type dockerInstallProfileScope struct {
BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
ArtifactEndpoints []string `json:"artifact_endpoints"`
// Docker image artifact description (read by dockerImageArtifactFromScope).
DockerImageArtifactURLs []string `json:"docker_image_artifact_urls"`
DockerImageArtifactSHA256 string `json:"docker_image_artifact_sha256"`
DockerImageArtifactSizeBytes int64 `json:"docker_image_artifact_size_bytes"`
// Node agent binary description (read by the linux/windows artifact helpers).
NodeAgentArtifactURLs []string `json:"node_agent_artifact_urls"`
NodeAgentArtifactSHA256 string `json:"node_agent_artifact_sha256"`
NodeAgentArtifactSizeBytes int64 `json:"node_agent_artifact_size_bytes"`
Roles []string `json:"roles"`
NodeName string `json:"node_name"`
NodeGroupID string `json:"node_group_id"`
Image string `json:"image"`
ContainerName string `json:"container_name"`
StateDir string `json:"state_dir"`
InstallDir string `json:"install_dir"`
StartupMode string `json:"startup_mode"`
Network string `json:"network"`
RestartPolicy string `json:"restart_policy"`
// The *bool fields are resolved through boolPtrValue with a per-profile
// default; presumably nil means "not specified" — confirm against boolPtrValue.
PullImage *bool `json:"pull_image"`
Replace *bool `json:"replace"`
DockerVPNGatewayEnabled *bool `json:"docker_vpn_gateway_enabled"`
WorkloadSupervisionEnabled *bool `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled *bool `json:"mesh_synthetic_runtime_enabled"`
MeshProductionForwardingEnabled *bool `json:"mesh_production_forwarding_enabled"`
MeshListenAddr string `json:"mesh_listen_addr"`
MeshListenPortMode string `json:"mesh_listen_port_mode"`
MeshListenAutoPortStart int `json:"mesh_listen_auto_port_start"`
MeshListenAutoPortEnd int `json:"mesh_listen_auto_port_end"`
MeshAdvertiseEndpoint string `json:"mesh_advertise_endpoint"`
// MeshAdvertiseEndpointsJSON is kept as raw JSON and copied into the profile
// without being interpreted here.
MeshAdvertiseEndpointsJSON json.RawMessage `json:"mesh_advertise_endpoints_json"`
MeshAdvertiseTransport string `json:"mesh_advertise_transport"`
MeshConnectivityMode string `json:"mesh_connectivity_mode"`
MeshNATType string `json:"mesh_nat_type"`
MeshRegion string `json:"mesh_region"`
HeartbeatIntervalSeconds int `json:"heartbeat_interval_seconds"`
EnrollmentPollIntervalSeconds int `json:"enrollment_poll_interval_seconds"`
EnrollmentPollTimeoutSeconds int `json:"enrollment_poll_timeout_seconds"`
ProductionObservationSinkCapacity int `json:"production_observation_sink_capacity"`
}
// dockerInstallProfileFromScope builds the Docker install profile for a node
// from the request and the raw JSON scope.
//
// The node name comes from input.NodeName (trimmed), then the scope, then
// "docker-node". Absent scope values are replaced by the defaults visible in
// the literal below (image "rap-node-agent:latest", host network,
// "unless-stopped" restart policy, mesh listen ":19131", auto port range
// 19131-19231, 15s heartbeat, 5s enrollment poll, ...).
//
// ErrInvalidPayload is returned when the scope is not valid JSON or does not
// decode, when no backend URL can be determined, when the advertise-endpoints
// JSON is invalid, when the listen port mode is not manual/auto/disabled, or
// when the auto port range is inverted.
func dockerInstallProfileFromScope(input DockerInstallProfileRequest, scopeRaw json.RawMessage) (DockerInstallProfile, error) {
var scope dockerInstallProfileScope
// An empty scope is allowed and leaves every field at its zero value.
if len(scopeRaw) > 0 {
if !json.Valid(scopeRaw) {
return DockerInstallProfile{}, ErrInvalidPayload
}
if err := json.Unmarshal(scopeRaw, &scope); err != nil {
return DockerInstallProfile{}, ErrInvalidPayload
}
}
nodeName := firstNonEmptyString(strings.TrimSpace(input.NodeName), scope.NodeName)
if nodeName == "" {
nodeName = "docker-node"
}
// Container name and state dir defaults are derived from a slug of the node name.
containerName := firstNonEmptyString(scope.ContainerName, "rap-node-agent-"+safeInstallProfileSlug(nodeName))
roles := trimStringSlice(scope.Roles)
profile := DockerInstallProfile{
SchemaVersion: "rap.docker_install_profile.v1",
BackendURL: strings.TrimRight(strings.TrimSpace(scope.BackendURL), "/"),
ControlPlaneEndpoints: trimStringSlice(scope.ControlPlaneEndpoints),
ArtifactEndpoints: trimEndpointSlice(scope.ArtifactEndpoints),
Roles: roles,
NodeName: nodeName,
Image: firstNonEmptyString(scope.Image, "rap-node-agent:latest"),
ContainerName: containerName,
StateDir: firstNonEmptyString(scope.StateDir, "/var/lib/rap/nodes/"+safeInstallProfileSlug(nodeName)),
Network: firstNonEmptyString(scope.Network, "host"),
RestartPolicy: firstNonEmptyString(scope.RestartPolicy, "unless-stopped"),
PullImage: boolPtrValue(scope.PullImage, false),
Replace: boolPtrValue(scope.Replace, true),
// The VPN gateway default follows the presence of the "vpn-exit" role.
DockerVPNGatewayEnabled: boolPtrValue(scope.DockerVPNGatewayEnabled, containsString(roles, "vpn-exit")),
WorkloadSupervisionEnabled: boolPtrValue(scope.WorkloadSupervisionEnabled, false),
MeshSyntheticRuntimeEnabled: boolPtrValue(scope.MeshSyntheticRuntimeEnabled, true),
MeshProductionForwardingEnabled: boolPtrValue(scope.MeshProductionForwardingEnabled, false),
MeshListenAddr: firstNonEmptyString(scope.MeshListenAddr, ":19131"),
MeshListenPortMode: firstNonEmptyString(strings.ToLower(strings.TrimSpace(scope.MeshListenPortMode)), "auto"),
MeshListenAutoPortStart: positiveOrDefault(scope.MeshListenAutoPortStart, 19131),
MeshListenAutoPortEnd: positiveOrDefault(scope.MeshListenAutoPortEnd, 19231),
MeshAdvertiseEndpoint: strings.TrimRight(strings.TrimSpace(scope.MeshAdvertiseEndpoint), "/"),
MeshAdvertiseEndpointsJSON: scope.MeshAdvertiseEndpointsJSON,
MeshAdvertiseTransport: strings.TrimSpace(scope.MeshAdvertiseTransport),
MeshConnectivityMode: strings.TrimSpace(scope.MeshConnectivityMode),
MeshNATType: strings.TrimSpace(scope.MeshNATType),
MeshRegion: strings.TrimSpace(scope.MeshRegion),
HeartbeatIntervalSeconds: positiveOrDefault(scope.HeartbeatIntervalSeconds, 15),
EnrollmentPollIntervalSeconds: positiveOrDefault(scope.EnrollmentPollIntervalSeconds, 5),
EnrollmentPollTimeoutSeconds: nonNegativeOrDefault(scope.EnrollmentPollTimeoutSeconds, 0),
ProductionObservationSinkCapacity: scope.ProductionObservationSinkCapacity,
}
profile.DockerImageArtifact = dockerImageArtifactFromScope(profile.Image, profile.ArtifactEndpoints, scope)
// Fall back to the first control plane endpoint when no backend URL was given.
if profile.BackendURL == "" && len(profile.ControlPlaneEndpoints) > 0 {
profile.BackendURL = profile.ControlPlaneEndpoints[0]
}
if profile.BackendURL == "" {
return DockerInstallProfile{}, ErrInvalidPayload
}
// Without explicit artifact endpoints, derive one from the backend URL and
// recompute the image artifact so its URLs use that endpoint.
if len(profile.ArtifactEndpoints) == 0 {
if endpoint := defaultArtifactEndpointFromBackendURL(profile.BackendURL); endpoint != "" {
profile.ArtifactEndpoints = []string{endpoint}
profile.DockerImageArtifact = dockerImageArtifactFromScope(profile.Image, profile.ArtifactEndpoints, scope)
}
}
if len(profile.MeshAdvertiseEndpointsJSON) > 0 && !json.Valid(profile.MeshAdvertiseEndpointsJSON) {
return DockerInstallProfile{}, ErrInvalidPayload
}
switch profile.MeshListenPortMode {
case "manual", "auto", "disabled":
default:
return DockerInstallProfile{}, ErrInvalidPayload
}
if profile.MeshListenAutoPortStart > profile.MeshListenAutoPortEnd {
return DockerInstallProfile{}, ErrInvalidPayload
}
return profile, nil
}
// windowsInstallProfileFromScope builds the Windows install profile for a node
// from the request and the raw JSON scope.
//
// The node name comes from input.NodeName (trimmed), then the scope, then
// "windows-node". State and install directories default to paths under
// C:\ProgramData\RAP\nodes and C:\Program Files\RAP named after a slug of the
// node name. Mesh connectivity defaults to "outbound_only", NAT type to
// "unknown" and region to "windows".
//
// ErrInvalidPayload is returned when the scope is not valid JSON or does not
// decode, when no backend URL can be determined, when the advertise-endpoints
// JSON is invalid, when the listen port mode is not manual/auto/disabled, when
// the startup mode is not auto/system-task/user-task/none, or when the auto
// port range is inverted.
func windowsInstallProfileFromScope(input DockerInstallProfileRequest, scopeRaw json.RawMessage) (WindowsInstallProfile, error) {
var scope dockerInstallProfileScope
// An empty scope is allowed and leaves every field at its zero value.
if len(scopeRaw) > 0 {
if !json.Valid(scopeRaw) {
return WindowsInstallProfile{}, ErrInvalidPayload
}
if err := json.Unmarshal(scopeRaw, &scope); err != nil {
return WindowsInstallProfile{}, ErrInvalidPayload
}
}
nodeName := firstNonEmptyString(strings.TrimSpace(input.NodeName), scope.NodeName)
if nodeName == "" {
nodeName = "windows-node"
}
profile := WindowsInstallProfile{
SchemaVersion: "rap.windows_install_profile.v1",
BackendURL: strings.TrimRight(strings.TrimSpace(scope.BackendURL), "/"),
ControlPlaneEndpoints: trimStringSlice(scope.ControlPlaneEndpoints),
ArtifactEndpoints: trimEndpointSlice(scope.ArtifactEndpoints),
Roles: trimStringSlice(scope.Roles),
NodeName: nodeName,
StateDir: firstNonEmptyString(scope.StateDir, `C:\ProgramData\RAP\nodes\`+safeInstallProfileSlug(nodeName)),
InstallDir: firstNonEmptyString(scope.InstallDir, `C:\Program Files\RAP\`+safeInstallProfileSlug(nodeName)),
StartupMode: firstNonEmptyString(strings.ToLower(strings.TrimSpace(scope.StartupMode)), "auto"),
WorkloadSupervisionEnabled: boolPtrValue(scope.WorkloadSupervisionEnabled, false),
MeshSyntheticRuntimeEnabled: boolPtrValue(scope.MeshSyntheticRuntimeEnabled, true),
MeshProductionForwardingEnabled: boolPtrValue(scope.MeshProductionForwardingEnabled, false),
MeshListenAddr: firstNonEmptyString(scope.MeshListenAddr, ":19131"),
MeshListenPortMode: firstNonEmptyString(strings.ToLower(strings.TrimSpace(scope.MeshListenPortMode)), "auto"),
MeshListenAutoPortStart: positiveOrDefault(scope.MeshListenAutoPortStart, 19131),
MeshListenAutoPortEnd: positiveOrDefault(scope.MeshListenAutoPortEnd, 19231),
MeshAdvertiseEndpoint: strings.TrimRight(strings.TrimSpace(scope.MeshAdvertiseEndpoint), "/"),
MeshAdvertiseEndpointsJSON: scope.MeshAdvertiseEndpointsJSON,
MeshAdvertiseTransport: strings.TrimSpace(scope.MeshAdvertiseTransport),
MeshConnectivityMode: firstNonEmptyString(strings.TrimSpace(scope.MeshConnectivityMode), "outbound_only"),
MeshNATType: firstNonEmptyString(strings.TrimSpace(scope.MeshNATType), "unknown"),
MeshRegion: firstNonEmptyString(strings.TrimSpace(scope.MeshRegion), "windows"),
HeartbeatIntervalSeconds: positiveOrDefault(scope.HeartbeatIntervalSeconds, 15),
EnrollmentPollIntervalSeconds: positiveOrDefault(scope.EnrollmentPollIntervalSeconds, 5),
EnrollmentPollTimeoutSeconds: nonNegativeOrDefault(scope.EnrollmentPollTimeoutSeconds, 0),
ProductionObservationSinkCapacity: scope.ProductionObservationSinkCapacity,
}
profile.NodeAgentArtifact = windowsNodeAgentArtifactFromScope(profile.ArtifactEndpoints, scope)
// Fall back to the first control plane endpoint when no backend URL was given.
if profile.BackendURL == "" && len(profile.ControlPlaneEndpoints) > 0 {
profile.BackendURL = profile.ControlPlaneEndpoints[0]
}
if profile.BackendURL == "" {
return WindowsInstallProfile{}, ErrInvalidPayload
}
// Without explicit artifact endpoints, derive one from the backend URL and
// recompute the agent artifact so its URLs use that endpoint.
if len(profile.ArtifactEndpoints) == 0 {
if endpoint := defaultArtifactEndpointFromBackendURL(profile.BackendURL); endpoint != "" {
profile.ArtifactEndpoints = []string{endpoint}
profile.NodeAgentArtifact = windowsNodeAgentArtifactFromScope(profile.ArtifactEndpoints, scope)
}
}
if len(profile.MeshAdvertiseEndpointsJSON) > 0 && !json.Valid(profile.MeshAdvertiseEndpointsJSON) {
return WindowsInstallProfile{}, ErrInvalidPayload
}
switch profile.MeshListenPortMode {
case "manual", "auto", "disabled":
default:
return WindowsInstallProfile{}, ErrInvalidPayload
}
switch profile.StartupMode {
case "auto", "system-task", "user-task", "none":
default:
return WindowsInstallProfile{}, ErrInvalidPayload
}
if profile.MeshListenAutoPortStart > profile.MeshListenAutoPortEnd {
return WindowsInstallProfile{}, ErrInvalidPayload
}
return profile, nil
}
// linuxInstallProfileFromScope builds the Linux install profile for a node
// from the request and the raw JSON scope.
//
// The node name comes from input.NodeName (trimmed), then the scope, then
// "linux-node". State and install directories default to
// /var/lib/rap/nodes/<slug> and /opt/rap/<slug>. Startup mode defaults to
// "systemd", advertise transport to "direct_http", connectivity mode to
// "outbound_only", NAT type to "unknown" and region to "linux".
//
// ErrInvalidPayload is returned when the scope is not valid JSON or does not
// decode, when no backend URL can be determined, when the advertise-endpoints
// JSON is invalid, when the listen port mode is not manual/auto/disabled, when
// the startup mode is not auto/systemd/none, or when the auto port range is
// inverted.
func linuxInstallProfileFromScope(input DockerInstallProfileRequest, scopeRaw json.RawMessage) (LinuxInstallProfile, error) {
var scope dockerInstallProfileScope
// An empty scope is allowed and leaves every field at its zero value.
if len(scopeRaw) > 0 {
if !json.Valid(scopeRaw) {
return LinuxInstallProfile{}, ErrInvalidPayload
}
if err := json.Unmarshal(scopeRaw, &scope); err != nil {
return LinuxInstallProfile{}, ErrInvalidPayload
}
}
nodeName := firstNonEmptyString(strings.TrimSpace(input.NodeName), scope.NodeName)
if nodeName == "" {
nodeName = "linux-node"
}
// slug is reused for both default directories.
slug := safeInstallProfileSlug(nodeName)
profile := LinuxInstallProfile{
SchemaVersion: "rap.linux_install_profile.v1",
BackendURL: strings.TrimRight(strings.TrimSpace(scope.BackendURL), "/"),
ControlPlaneEndpoints: trimStringSlice(scope.ControlPlaneEndpoints),
ArtifactEndpoints: trimEndpointSlice(scope.ArtifactEndpoints),
Roles: trimStringSlice(scope.Roles),
NodeName: nodeName,
StateDir: firstNonEmptyString(scope.StateDir, "/var/lib/rap/nodes/"+slug),
InstallDir: firstNonEmptyString(scope.InstallDir, "/opt/rap/"+slug),
StartupMode: firstNonEmptyString(strings.ToLower(strings.TrimSpace(scope.StartupMode)), "systemd"),
WorkloadSupervisionEnabled: boolPtrValue(scope.WorkloadSupervisionEnabled, false),
MeshSyntheticRuntimeEnabled: boolPtrValue(scope.MeshSyntheticRuntimeEnabled, true),
MeshProductionForwardingEnabled: boolPtrValue(scope.MeshProductionForwardingEnabled, false),
MeshListenAddr: firstNonEmptyString(scope.MeshListenAddr, ":19131"),
MeshListenPortMode: firstNonEmptyString(strings.ToLower(strings.TrimSpace(scope.MeshListenPortMode)), "auto"),
MeshListenAutoPortStart: positiveOrDefault(scope.MeshListenAutoPortStart, 19131),
MeshListenAutoPortEnd: positiveOrDefault(scope.MeshListenAutoPortEnd, 19231),
MeshAdvertiseEndpoint: strings.TrimRight(strings.TrimSpace(scope.MeshAdvertiseEndpoint), "/"),
MeshAdvertiseEndpointsJSON: scope.MeshAdvertiseEndpointsJSON,
MeshAdvertiseTransport: firstNonEmptyString(strings.TrimSpace(scope.MeshAdvertiseTransport), "direct_http"),
MeshConnectivityMode: firstNonEmptyString(strings.TrimSpace(scope.MeshConnectivityMode), "outbound_only"),
MeshNATType: firstNonEmptyString(strings.TrimSpace(scope.MeshNATType), "unknown"),
MeshRegion: firstNonEmptyString(strings.TrimSpace(scope.MeshRegion), "linux"),
HeartbeatIntervalSeconds: positiveOrDefault(scope.HeartbeatIntervalSeconds, 15),
EnrollmentPollIntervalSeconds: positiveOrDefault(scope.EnrollmentPollIntervalSeconds, 5),
EnrollmentPollTimeoutSeconds: nonNegativeOrDefault(scope.EnrollmentPollTimeoutSeconds, 0),
ProductionObservationSinkCapacity: scope.ProductionObservationSinkCapacity,
}
profile.NodeAgentArtifact = linuxNodeAgentArtifactFromScope(profile.ArtifactEndpoints, scope)
// Fall back to the first control plane endpoint when no backend URL was given.
if profile.BackendURL == "" && len(profile.ControlPlaneEndpoints) > 0 {
profile.BackendURL = profile.ControlPlaneEndpoints[0]
}
if profile.BackendURL == "" {
return LinuxInstallProfile{}, ErrInvalidPayload
}
// Without explicit artifact endpoints, derive one from the backend URL and
// recompute the agent artifact so its URLs use that endpoint.
if len(profile.ArtifactEndpoints) == 0 {
if endpoint := defaultArtifactEndpointFromBackendURL(profile.BackendURL); endpoint != "" {
profile.ArtifactEndpoints = []string{endpoint}
profile.NodeAgentArtifact = linuxNodeAgentArtifactFromScope(profile.ArtifactEndpoints, scope)
}
}
if len(profile.MeshAdvertiseEndpointsJSON) > 0 && !json.Valid(profile.MeshAdvertiseEndpointsJSON) {
return LinuxInstallProfile{}, ErrInvalidPayload
}
switch profile.MeshListenPortMode {
case "manual", "auto", "disabled":
default:
return LinuxInstallProfile{}, ErrInvalidPayload
}
switch profile.StartupMode {
case "auto", "systemd", "none":
default:
return LinuxInstallProfile{}, ErrInvalidPayload
}
if profile.MeshListenAutoPortStart > profile.MeshListenAutoPortEnd {
return LinuxInstallProfile{}, ErrInvalidPayload
}
return profile, nil
}
// linuxNodeAgentArtifactFromScope describes the Linux node agent binary.
//
// Explicit download URLs from the scope win; otherwise one URL per artifact
// endpoint is generated for "rap-node-agent-linux-amd64". It returns nil when
// there is neither a URL nor a checksum to publish.
func linuxNodeAgentArtifactFromScope(artifactEndpoints []string, scope dockerInstallProfileScope) *DockerArtifact {
	downloadURLs := trimEndpointSlice(scope.NodeAgentArtifactURLs)
	if len(downloadURLs) == 0 {
		for i := range artifactEndpoints {
			base := strings.TrimRight(artifactEndpoints[i], "/")
			downloadURLs = append(downloadURLs, base+"/rap-node-agent-linux-amd64")
		}
	}

	digest := strings.TrimSpace(scope.NodeAgentArtifactSHA256)
	if digest == "" && len(downloadURLs) == 0 {
		return nil
	}

	artifact := new(DockerArtifact)
	artifact.Kind = "linux_binary"
	artifact.MediaType = "application/octet-stream"
	artifact.FileName = "rap-node-agent-linux-amd64"
	artifact.URLs = downloadURLs
	artifact.SHA256 = digest
	artifact.SizeBytes = scope.NodeAgentArtifactSizeBytes
	return artifact
}
// windowsNodeAgentArtifactFromScope describes the Windows node agent
// executable.
//
// Explicit download URLs from the scope win; otherwise one URL per artifact
// endpoint is generated for "rap-node-agent-windows-amd64.exe". It returns nil
// when there is neither a URL nor a checksum to publish.
func windowsNodeAgentArtifactFromScope(artifactEndpoints []string, scope dockerInstallProfileScope) *DockerArtifact {
	downloadURLs := trimEndpointSlice(scope.NodeAgentArtifactURLs)
	if len(downloadURLs) == 0 {
		for i := range artifactEndpoints {
			base := strings.TrimRight(artifactEndpoints[i], "/")
			downloadURLs = append(downloadURLs, base+"/rap-node-agent-windows-amd64.exe")
		}
	}

	digest := strings.TrimSpace(scope.NodeAgentArtifactSHA256)
	if digest == "" && len(downloadURLs) == 0 {
		return nil
	}

	artifact := new(DockerArtifact)
	artifact.Kind = "windows_exe"
	artifact.MediaType = "application/vnd.microsoft.portable-executable"
	artifact.FileName = "rap-node-agent-windows-amd64.exe"
	artifact.URLs = downloadURLs
	artifact.SHA256 = digest
	artifact.SizeBytes = scope.NodeAgentArtifactSizeBytes
	return artifact
}
// dockerImageArtifactFromScope describes the downloadable tarball of a Docker
// image.
//
// The file name is safeArtifactFileName(image) + ".tar". Explicit download URLs
// from the scope win; otherwise one URL per artifact endpoint is generated for
// that file name. It returns nil for a blank image, or when there is neither a
// URL nor a checksum to publish.
func dockerImageArtifactFromScope(image string, artifactEndpoints []string, scope dockerInstallProfileScope) *DockerArtifact {
	imageRef := strings.TrimSpace(image)
	if imageRef == "" {
		return nil
	}
	tarName := safeArtifactFileName(imageRef) + ".tar"

	downloadURLs := trimEndpointSlice(scope.DockerImageArtifactURLs)
	if len(downloadURLs) == 0 {
		for i := range artifactEndpoints {
			base := strings.TrimRight(artifactEndpoints[i], "/")
			downloadURLs = append(downloadURLs, base+"/"+tarName)
		}
	}

	digest := strings.TrimSpace(scope.DockerImageArtifactSHA256)
	if digest == "" && len(downloadURLs) == 0 {
		return nil
	}

	artifact := new(DockerArtifact)
	artifact.Kind = "docker_image_tar"
	artifact.Image = imageRef
	artifact.MediaType = "application/vnd.docker.image.rootfs.diff.tar"
	artifact.FileName = tarName
	artifact.URLs = downloadURLs
	artifact.SHA256 = digest
	artifact.SizeBytes = scope.DockerImageArtifactSizeBytes
	return artifact
}
// defaultArtifactEndpointFromBackendURL derives the default artifact download
// endpoint from a backend URL: surrounding whitespace and trailing slashes are
// dropped, one trailing "/api/v1" or "/api" segment is removed, and
// "/downloads" is appended. A blank backend URL yields "".
func defaultArtifactEndpointFromBackendURL(backendURL string) string {
	base := strings.TrimRight(strings.TrimSpace(backendURL), "/")
	if base == "" {
		return ""
	}
	// Strip at most one API suffix; the longer one is tried first.
	for _, suffix := range [...]string{"/api/v1", "/api"} {
		if stripped := strings.TrimSuffix(base, suffix); stripped != base {
			base = stripped
			break
		}
	}
	return strings.TrimRight(base, "/") + "/downloads"
}
// heartbeatMeshEndpointReport is the JSON shape of a mesh endpoint report,
// identified by cluster and node, carrying one peer endpoint plus a list of
// endpoint candidates. Its producer and consumer are outside this part of the
// file — presumably it arrives in node heartbeats; confirm against the caller.
type heartbeatMeshEndpointReport struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
NodeID string `json:"node_id"`
PeerEndpoint string `json:"peer_endpoint"`
Transport string `json:"transport"`
ConnectivityMode string `json:"connectivity_mode"`
NATType string `json:"nat_type"`
Region string `json:"region"`
EndpointCandidates []PeerEndpointCandidate `json:"endpoint_candidates"`
// ObservedAt is a pointer, so an absent value decodes to nil.
ObservedAt *time.Time `json:"observed_at"`
}
// heartbeatRendezvousLeaseReport is the JSON shape of a rendezvous lease
// report: an envelope (schema, cluster, node, observation time) around a list
// of per-lease details.
type heartbeatRendezvousLeaseReport struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
NodeID string `json:"node_id"`
// ObservedAt is kept as a string here (unlike heartbeatMeshEndpointReport,
// which uses *time.Time); any parsing happens outside this chunk.
ObservedAt string `json:"observed_at"`
Leases []heartbeatRendezvousLeaseDetails `json:"leases"`
}
// heartbeatRendezvousLeaseDetails is one entry of
// heartbeatRendezvousLeaseReport.Leases: the lease, the peer and relay nodes
// involved, the affected routes, and the reported relay condition flags.
type heartbeatRendezvousLeaseDetails struct {
LeaseID string `json:"lease_id"`
PeerNodeID string `json:"peer_node_id"`
RelayNodeID string `json:"relay_node_id"`
RouteIDs []string `json:"route_ids"`
StaleRelay bool `json:"stale_relay"`
WithdrawalNeeded bool `json:"withdrawal_needed"`
ReselectionNeeded bool `json:"reselection_needed"`
ConnectionState string `json:"connection_state"`
Reason string `json:"reason"`
}
// meshRouteHealthObservationMetadata is the JSON metadata decoded from a
// MeshLinkObservation by routeHealthMetadataFromLink. The relay feedback logic
// only accepts observations whose ObservationType is "synthetic_route_health",
// whose route path decision was applied, and whose production/service payload
// forwarding flags are all false.
type meshRouteHealthObservationMetadata struct {
ObservationType string `json:"observation_type"`
RouteID string `json:"route_id"`
RoutePathDecisionApplied bool `json:"route_path_decision_applied"`
RoutePathDecisionSelectedRelayID string `json:"route_path_decision_selected_relay_id"`
RoutePathDecisionStaleRelayNodeID string `json:"route_path_decision_stale_relay_node_id"`
RoutePathDecisionRendezvousPeerNodeID string `json:"route_path_decision_rendezvous_peer_node_id"`
RoutePathDecisionRendezvousLeaseID string `json:"route_path_decision_rendezvous_lease_id"`
RoutePathDecisionRendezvousLeaseReason string `json:"route_path_decision_rendezvous_lease_reason"`
RoutePathDecisionSource string `json:"route_path_decision_source"`
ExpectedEffectiveHops []string `json:"expected_effective_hops"`
ObservedAckPath []string `json:"observed_ack_path"`
RoutePathDriftDetected bool `json:"route_path_drift_detected"`
FailureReason string `json:"failure_reason"`
ControlPlaneOnly bool `json:"control_plane_only"`
ProductionForwarding bool `json:"production_forwarding"`
ProductionPayloadForwarding bool `json:"production_payload_forwarding"`
RouteHealthProductionPayloadForwarding bool `json:"route_health_production_payload_forwarding"`
RouteHealthServicePayloadForwarding bool `json:"route_health_service_payload_forwarding"`
}
// rendezvousRelayFeedbackEntry is a normalized report that a relay used to reach
// a peer is considered stale. Entries are produced from heartbeat lease reports,
// from replacement hints, and from synthetic route health link observations, and
// are matched against leases by the rendezvousRelayPolicy methods.
type rendezvousRelayFeedbackEntry struct {
// ReporterNodeID is the node whose heartbeat, hint or link observation
// produced this entry.
ReporterNodeID string
RouteIDs []string
// LeaseID may be empty; matching then falls back to the peer/relay pair.
LeaseID string
PeerNodeID string
RelayNodeID string
ConnectionState string
Reason string
WithdrawalNeeded bool
ReselectionNeeded bool
ObservedAt time.Time
}
// rendezvousRelaySelection describes a relay chosen as a candidate for a peer:
// the relay node, its endpoint, and the score with the reasons behind it.
// NOTE(review): the scoring code that fills this in is outside this section.
type rendezvousRelaySelection struct {
RelayNodeID string
Endpoint string
Score int
Reasons []string
}
// rendezvousRelayPolicy accumulates stale-relay feedback for one local node and
// records the withdrawal and replacement decisions made from it. Instances are
// created by newRendezvousRelayPolicy; the methods tolerate a nil receiver.
type rendezvousRelayPolicy struct {
localNodeID string
// now is the reference time, normalized to UTC by the constructor.
now time.Time
// links is a private copy of the observations passed to the constructor.
links []MeshLinkObservation
feedback []rendezvousRelayFeedbackEntry
// withdrawn is keyed by route ID, lease ID and relay node ID joined with NUL
// bytes (see recordWithdrawal).
withdrawn map[string]RendezvousRelayPolicyDecision
replacements map[string]RendezvousRelayPolicyDecision
}
// Limits and defaults used when building scoped mesh configuration and relay
// feedback.
const (
// NOTE(review): presumably caps the number of recovery seeds handed to a
// node; the consumer is outside this section — confirm.
maxScopedRecoverySeeds = 20
// NOTE(review): presumably caps the number of rendezvous leases handed to a
// node; the consumer is outside this section — confirm.
maxScopedRendezvousLeases = 20
// NOTE(review): presumably the default number of bootstrap peers for the
// core mesh; the consumer is outside this section — confirm.
defaultCoreMeshBootstrapPeerTarget = 3
// rendezvousRelayFeedbackMaxAge is the maximum age of a heartbeat or link
// observation that may still contribute relay feedback; older data is
// ignored.
rendezvousRelayFeedbackMaxAge = 2 * time.Minute
)
// nodeSelector is the JSON shape decoded from a route intent's source and
// destination selectors. It may name a single node, a list of nodes, or both.
type nodeSelector struct {
NodeID string `json:"node_id"`
NodeIDs []string `json:"node_ids"`
}
// syntheticRouteFromIntent turns a mesh route intent into the synthetic route
// configuration delivered to input.NodeID.
//
// It returns, in order: the route config, the peer endpoints and endpoint
// candidates scoped to the route's hops, the policy's recovery seeds, the
// normalized rendezvous leases, and an ok flag. ok is false (with zero values
// for everything else) when the intent is not active, its policy cannot be
// decoded or has synthetic routing disabled, the hop list has fewer than two
// entries or does not contain input.NodeID, any of the policy's endpoint
// candidates / recovery seeds / rendezvous leases fail validation, or the route
// has already expired.
func (s *Service) syntheticRouteFromIntent(input GetNodeSyntheticMeshConfigInput, intent MeshRouteIntent) (SyntheticMeshRouteConfig, map[string]string, map[string][]PeerEndpointCandidate, []PeerRecoverySeed, []PeerRendezvousLease, bool) {
if intent.Status != "active" {
return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
}
var policy syntheticRoutePolicy
if err := json.Unmarshal(intent.Policy, &policy); err != nil {
return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
}
if !policy.SyntheticEnabled {
return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
}
var source nodeSelector
var destination nodeSelector
// Selector decoding is best-effort: a malformed selector leaves the zero
// value, and the endpoints are then taken from the hop list further down.
_ = json.Unmarshal(intent.SourceSelector, &source)
_ = json.Unmarshal(intent.DestinationSelector, &destination)
sourceNodeID := firstNodeID(source)
destinationNodeID := firstNodeID(destination)
// Copy the policy hops so later use never aliases the decoded policy slice.
hops := append([]string{}, policy.Hops...)
// Without explicit hops, fall back to a direct source -> destination path.
if len(hops) == 0 && sourceNodeID != "" && destinationNodeID != "" {
hops = []string{sourceNodeID, destinationNodeID}
}
// The route is only relevant to nodes that are on its path.
if len(hops) < 2 || !containsString(hops, input.NodeID) {
return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
}
if err := validatePeerEndpointCandidates(policy.PeerEndpointCandidates, hops); err != nil {
return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
}
if err := validatePeerRecoverySeeds(policy.RecoverySeeds); err != nil {
return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
}
if err := validatePeerRendezvousLeases(policy.RendezvousLeases, hops, s.now()); err != nil {
return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
}
// Selectors that named no node default to the first and last hop.
if sourceNodeID == "" {
sourceNodeID = hops[0]
}
if destinationNodeID == "" {
destinationNodeID = hops[len(hops)-1]
}
// Routes without an explicit expiry live for five minutes from now.
expiresAt := s.now().UTC().Add(5 * time.Minute)
if policy.ExpiresAt != nil {
expiresAt = policy.ExpiresAt.UTC()
}
if !expiresAt.After(s.now().UTC()) {
return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
}
allowedChannels := policy.AllowedChannels
if len(allowedChannels) == 0 {
allowedChannels = []string{"fabric_control", "route_control"}
}
maxTTL := policy.MaxTTL
if maxTTL <= 0 {
maxTTL = 8
}
maxHops := policy.MaxHops
if maxHops <= 0 {
maxHops = 8
}
// Version fields cascade: the route version defaults to the intent's
// update timestamp, and the policy / peer directory versions default to the
// route version.
routeVersion := policy.RouteVersion
if routeVersion == "" {
routeVersion = intent.UpdatedAt.UTC().Format(time.RFC3339)
}
policyVersion := policy.PolicyVersion
if policyVersion == "" {
policyVersion = routeVersion
}
peerDirectoryVersion := policy.PeerDirectoryVersion
if peerDirectoryVersion == "" {
peerDirectoryVersion = routeVersion
}
route := SyntheticMeshRouteConfig{
RouteID: intent.ID,
ClusterID: input.ClusterID,
SourceNodeID: sourceNodeID,
DestinationNodeID: destinationNodeID,
Hops: hops,
AllowedChannels: allowedChannels,
ExpiresAt: expiresAt,
MaxTTL: maxTTL,
MaxHops: maxHops,
RouteVersion: routeVersion,
PolicyVersion: policyVersion,
PeerDirectoryVersion: peerDirectoryVersion,
}
// Recovery seeds are returned exactly as stored in the policy; endpoints and
// candidates are narrowed to the hops, and leases are normalized for the route.
return route,
scopedPeerEndpoints(policy.PeerEndpoints, hops),
scopedPeerEndpointCandidates(policy.PeerEndpointCandidates, hops),
policy.RecoverySeeds,
normalizeRendezvousLeases(policy.RendezvousLeases, route, s.now()),
true
}
// reportedEndpointConfig builds, for every remote node on routePath, the peer
// endpoint and endpoint candidates the local node should use. The desired mesh
// listener configuration takes precedence for the primary endpoint; the latest
// heartbeat's endpoint report fills it in when no desired endpoint exists and
// always contributes its candidates. Results are then scoped to the local
// node's perspective. Blank node IDs and the local node itself are skipped, as
// are nodes with neither a heartbeat nor a desired configuration. Store or
// lookup errors abort the whole call.
func (s *Service) reportedEndpointConfig(ctx context.Context, clusterID string, localNodeID string, routePath []string, localPerspective endpointPerspective) (map[string]string, map[string][]PeerEndpointCandidate, error) {
	endpointsByNode := map[string]string{}
	candidatesByNode := map[string][]PeerEndpointCandidate{}
	for _, rawNodeID := range routePath {
		peerID := strings.TrimSpace(rawNodeID)
		if peerID == "" || peerID == localNodeID {
			continue
		}

		wantedEndpoint, wantedCandidates, err := s.desiredMeshListenerEndpointConfig(ctx, clusterID, peerID, 0)
		if err != nil {
			return nil, nil, err
		}
		latest, err := s.store.ListNodeHeartbeats(ctx, clusterID, peerID, 1)
		if err != nil {
			return nil, nil, err
		}
		hasHeartbeat := len(latest) > 0
		if !hasHeartbeat && wantedEndpoint == "" && len(wantedCandidates) == 0 {
			continue
		}

		endpoint := wantedEndpoint
		merged := append([]PeerEndpointCandidate{}, wantedCandidates...)
		if hasHeartbeat {
			if reported, reportedCandidates, ok := endpointReportFromHeartbeat(latest[0]); ok {
				// The reported endpoint is only a fallback for a missing desired one.
				if endpoint == "" {
					endpoint = reported
				}
				merged = append(merged, reportedCandidates...)
			}
		}

		endpoint, merged = scopeEndpointReportForLocal(localPerspective, endpoint, merged)
		if endpoint != "" {
			endpointsByNode[peerID] = endpoint
		}
		if len(merged) > 0 {
			candidatesByNode[peerID] = append(candidatesByNode[peerID], merged...)
		}
	}
	return endpointsByNode, candidatesByNode, nil
}
// endpointPerspective captures how the local node reaches the rest of the mesh,
// as derived from its latest heartbeat by endpointPerspectiveFromHeartbeat.
type endpointPerspective struct {
// OutboundOnly is true when the heartbeat reports "outbound_only"
// connectivity or inbound reachability, or one-way connectivity.
OutboundOnly bool
Region string
// ControlPlaneURL is the trimmed URL from the outbound session report.
ControlPlaneURL string
// ControlPlaneRelayEndpoint is ControlPlaneURL reduced by
// controlPlaneRelayEndpointFromURL (API suffix, query and fragment removed).
ControlPlaneRelayEndpoint string
}
// localEndpointPerspective loads the most recent heartbeat of localNodeID and
// converts it into an endpointPerspective. A node without heartbeats yields the
// zero perspective and no error; store errors are returned unchanged.
func (s *Service) localEndpointPerspective(ctx context.Context, clusterID, localNodeID string) (endpointPerspective, error) {
	latest, err := s.store.ListNodeHeartbeats(ctx, clusterID, localNodeID, 1)
	switch {
	case err != nil:
		return endpointPerspective{}, err
	case len(latest) == 0:
		return endpointPerspective{}, nil
	}
	return endpointPerspectiveFromHeartbeat(latest[0]), nil
}
// endpointPerspectiveFromHeartbeat extracts the local connectivity perspective
// from heartbeat metadata. Missing, invalid or undecodable metadata yields the
// zero perspective. The node counts as outbound-only when its endpoint report
// or listener report says "outbound_only" (case-insensitively, ignoring
// surrounding whitespace) or when the listener reports one-way connectivity.
func endpointPerspectiveFromHeartbeat(heartbeat NodeHeartbeat) endpointPerspective {
	if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) {
		return endpointPerspective{}
	}
	var decoded struct {
		MeshEndpointReport heartbeatMeshEndpointReport `json:"mesh_endpoint_report"`
		MeshListenerReport struct {
			InboundReachability string `json:"inbound_reachability"`
			OneWayConnectivity  bool   `json:"one_way_connectivity"`
		} `json:"mesh_listener_report"`
		MeshOutboundSessionReport struct {
			ControlPlaneURL string `json:"control_plane_url"`
			Status          string `json:"status"`
		} `json:"mesh_outbound_session_report"`
	}
	if json.Unmarshal(heartbeat.Metadata, &decoded) != nil {
		return endpointPerspective{}
	}

	normalize := func(value string) string {
		return strings.ToLower(strings.TrimSpace(value))
	}
	outboundOnly := decoded.MeshListenerReport.OneWayConnectivity
	if normalize(decoded.MeshEndpointReport.ConnectivityMode) == "outbound_only" {
		outboundOnly = true
	}
	if normalize(decoded.MeshListenerReport.InboundReachability) == "outbound_only" {
		outboundOnly = true
	}

	controlPlaneURL := decoded.MeshOutboundSessionReport.ControlPlaneURL
	return endpointPerspective{
		OutboundOnly:              outboundOnly,
		Region:                    strings.TrimSpace(decoded.MeshEndpointReport.Region),
		ControlPlaneURL:           strings.TrimSpace(controlPlaneURL),
		ControlPlaneRelayEndpoint: controlPlaneRelayEndpointFromURL(controlPlaneURL),
	}
}
// controlPlaneRelayEndpointFromURL reduces a control-plane URL to the base
// endpoint used for relaying: one trailing "/api/v1" or "/api" path segment,
// the query string and the fragment are removed, and trailing slashes are
// trimmed. It returns "" for blank input, unparsable URLs, and URLs lacking a
// scheme or host.
func controlPlaneRelayEndpointFromURL(raw string) string {
	trimmed := strings.TrimRight(strings.TrimSpace(raw), "/")
	if trimmed == "" {
		return ""
	}
	u, err := url.Parse(trimmed)
	if err != nil {
		return ""
	}
	if u.Scheme == "" || u.Host == "" {
		return ""
	}

	basePath := strings.TrimRight(u.Path, "/")
	// Only one suffix is removed; the longer one is checked first.
	switch {
	case strings.HasSuffix(basePath, "/api/v1"):
		basePath = strings.TrimRight(strings.TrimSuffix(basePath, "/api/v1"), "/")
	case strings.HasSuffix(basePath, "/api"):
		basePath = strings.TrimRight(strings.TrimSuffix(basePath, "/api"), "/")
	}

	u.Path, u.RawPath = basePath, ""
	u.RawQuery, u.Fragment = "", ""
	return strings.TrimRight(u.String(), "/")
}
// controlPlaneBootstrapRendezvousLease issues a short-lived (five minute),
// control-plane-only rendezvous lease that lets an outbound-only local node
// reach peerNodeID through the control-plane relay endpoint.
//
// A lease is only issued when the local perspective is outbound-only, a
// control-plane relay endpoint is known, and at least one of the peer's
// candidates requires rendezvous; otherwise the zero lease and false are
// returned.
//
// The lease metadata is produced with json.Marshal so that a clusterID
// containing quotes, backslashes or control characters can neither break the
// JSON document nor inject additional keys into it.
func controlPlaneBootstrapRendezvousLease(clusterID, peerNodeID string, candidates []PeerEndpointCandidate, local endpointPerspective, now time.Time) (PeerRendezvousLease, bool) {
	if !local.OutboundOnly || local.ControlPlaneRelayEndpoint == "" {
		return PeerRendezvousLease{}, false
	}
	requiresRendezvous := false
	for _, candidate := range candidates {
		if endpointCandidateRequiresRendezvous(candidate) {
			requiresRendezvous = true
			break
		}
	}
	if !requiresRendezvous {
		return PeerRendezvousLease{}, false
	}

	// Field order mirrors the keys of the previously hand-written document.
	metadata, err := json.Marshal(struct {
		ClusterID              string `json:"cluster_id"`
		Source                 string `json:"source"`
		ServiceWorkloadTraffic bool   `json:"service_workload_traffic"`
		ProductionForwarding   bool   `json:"production_forwarding"`
	}{
		ClusterID: clusterID,
		Source:    "control_plane_bootstrap",
	})
	if err != nil {
		// Marshalling plain strings and booleans is not expected to fail; if it
		// does, issue no lease rather than one with broken metadata.
		return PeerRendezvousLease{}, false
	}

	issuedAt := now.UTC()
	return PeerRendezvousLease{
		LeaseID:          "core-mesh-bootstrap-rv-" + peerNodeID + "-via-control-plane",
		PeerNodeID:       peerNodeID,
		RelayNodeID:      "control-plane-relay",
		RelayEndpoint:    local.ControlPlaneRelayEndpoint,
		Transport:        "relay_control",
		ConnectivityMode: "relay_required",
		RouteIDs:         []string{"core-mesh-bootstrap"},
		AllowedChannels:  []string{"fabric_control", "route_control"},
		Priority:         90,
		ControlPlaneOnly: true,
		IssuedAt:         issuedAt,
		ExpiresAt:        issuedAt.Add(5 * time.Minute),
		Reason:           "control_plane_bootstrap_relay",
		Metadata:         json.RawMessage(metadata),
	}, true
}
// scopeEndpointReportForLocal adapts a peer's endpoint and candidates to what
// the local node can actually use. Nodes that are not outbound-only get the
// inputs back untouched. For outbound-only nodes every private candidate is
// rewritten as relay-required, nil candidate metadata is replaced by an empty
// JSON object, and a private primary endpoint is dropped unless at least one
// candidate is directly usable (neither private nor requiring rendezvous).
func scopeEndpointReportForLocal(local endpointPerspective, endpoint string, candidates []PeerEndpointCandidate) (string, []PeerEndpointCandidate) {
	if !local.OutboundOnly {
		return endpoint, candidates
	}

	scoped := make([]PeerEndpointCandidate, 0, len(candidates))
	hasDirect := false
	for i := range candidates {
		item := candidates[i]
		switch {
		case endpointCandidatePrivateForOffsite(item):
			item = relayRequiredCandidateForOffsite(item)
		case !endpointCandidateRequiresRendezvous(item):
			hasDirect = true
		}
		if item.Metadata == nil {
			item.Metadata = json.RawMessage(`{}`)
		}
		scoped = append(scoped, item)
	}

	if hasDirect {
		return endpoint, scoped
	}
	if endpointPrivateForOffsite(endpoint) {
		return "", scoped
	}
	return endpoint, scoped
}
// endpointCandidatePrivateForOffsite reports whether a candidate is only
// reachable from inside the peer's private network: its connectivity mode is
// "private_lan", its reachability is "private", or its address is a private
// endpoint according to endpointPrivateForOffsite.
func endpointCandidatePrivateForOffsite(candidate PeerEndpointCandidate) bool {
	if strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode)) == "private_lan" {
		return true
	}
	if strings.ToLower(strings.TrimSpace(candidate.Reachability)) == "private" {
		return true
	}
	return endpointPrivateForOffsite(candidate.Address)
}
// endpointPrivateForOffsite reports whether the endpoint's host is a literal IP
// address that is private, loopback, link-local unicast or unspecified. Hosts
// that are empty or not IP literals (for example DNS names) yield false.
func endpointPrivateForOffsite(endpoint string) bool {
	host := peerEndpointHost(endpoint)
	if host == "" {
		return false
	}
	addr := net.ParseIP(host)
	if addr == nil {
		return false
	}
	switch {
	case addr.IsPrivate(), addr.IsLoopback(), addr.IsLinkLocalUnicast(), addr.IsUnspecified():
		return true
	}
	return false
}
// relayRequiredCandidateForOffsite returns a copy of candidate marked as
// reachable only through a relay: transport, reachability and connectivity mode
// are overwritten, an empty NAT type becomes "unknown", the priority value is
// raised by 200, and the "offsite-private-lan-blocked" and "relay-required"
// policy tags are added if missing.
func relayRequiredCandidateForOffsite(candidate PeerEndpointCandidate) PeerEndpointCandidate {
	for _, tag := range []string{"offsite-private-lan-blocked", "relay-required"} {
		candidate.PolicyTags = appendMissingString(candidate.PolicyTags, tag)
	}
	candidate.Priority += 200
	candidate.NATType = firstNonEmptyString(candidate.NATType, "unknown")
	candidate.ConnectivityMode = "relay_required"
	candidate.Reachability = "relay"
	candidate.Transport = "relay"
	return candidate
}
// endpointReportFromHeartbeat extracts the peer endpoint and endpoint candidates
// a node reported about itself in its heartbeat metadata.
//
// It returns the usable primary endpoint (possibly ""), the normalized
// candidates, and ok. ok is false when the metadata is missing or invalid, when
// the report names a different node or cluster than the heartbeat, when any
// kept candidate belongs to another node, when the candidates fail validation,
// or when neither an endpoint nor a candidate remains.
func endpointReportFromHeartbeat(heartbeat NodeHeartbeat) (string, []PeerEndpointCandidate, bool) {
var metadata struct {
MeshEndpointReport heartbeatMeshEndpointReport `json:"mesh_endpoint_report"`
}
if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) {
return "", nil, false
}
if err := json.Unmarshal(heartbeat.Metadata, &metadata); err != nil {
return "", nil, false
}
report := metadata.MeshEndpointReport
// A report may omit its node/cluster IDs, but must not contradict the
// heartbeat that carried it.
if report.NodeID != "" && report.NodeID != heartbeat.NodeID {
return "", nil, false
}
if report.ClusterID != "" && report.ClusterID != heartbeat.ClusterID {
return "", nil, false
}
nodeID := heartbeat.NodeID
// rawPeerEndpoint keeps the reported value for candidate defaulting even
// when the primary endpoint itself is rejected as unusable.
rawPeerEndpoint := strings.TrimSpace(report.PeerEndpoint)
peerEndpoint := rawPeerEndpoint
if isUnusableLocalPeerEndpoint(peerEndpoint) {
peerEndpoint = ""
}
out := make([]PeerEndpointCandidate, 0, len(report.EndpointCandidates))
for _, candidate := range report.EndpointCandidates {
// Fill empty candidate fields from the heartbeat and report-level values.
if candidate.NodeID == "" {
candidate.NodeID = nodeID
}
if candidate.EndpointID == "" {
candidate.EndpointID = nodeID + "-reported"
}
if candidate.Address == "" {
candidate.Address = rawPeerEndpoint
}
// Unusable addresses are dropped silently rather than failing the report.
if isUnusableLocalPeerEndpoint(candidate.Address) {
continue
}
if candidate.Transport == "" {
candidate.Transport = report.Transport
}
if candidate.ConnectivityMode == "" {
candidate.ConnectivityMode = report.ConnectivityMode
}
if candidate.NATType == "" {
candidate.NATType = report.NATType
}
if candidate.Region == "" {
candidate.Region = report.Region
}
// Reachability is derived after the connectivity mode has been defaulted.
if candidate.Reachability == "" {
candidate.Reachability = reachabilityFromConnectivityMode(candidate.ConnectivityMode)
}
if candidate.Metadata == nil {
candidate.Metadata = json.RawMessage(`{}`)
}
// A node may only report candidates for itself; one foreign candidate
// invalidates the whole report.
if candidate.NodeID != nodeID {
return "", nil, false
}
out = append(out, candidate)
}
if len(out) > 0 {
if err := validatePeerEndpointCandidates(map[string][]PeerEndpointCandidate{nodeID: out}, []string{nodeID}); err != nil {
return "", nil, false
}
}
return peerEndpoint, out, peerEndpoint != "" || len(out) > 0
}
// hasActiveNodeRole reports whether roles contains an assignment of the given
// role whose status is "active".
func hasActiveNodeRole(roles []NodeRoleAssignment, role string) bool {
	for i := range roles {
		if roles[i].Status == "active" && roles[i].Role == role {
			return true
		}
	}
	return false
}
// nodeLastSeen returns the node's last-seen time in UTC, or the zero time when
// the node has never been seen.
func nodeLastSeen(node ClusterNode) time.Time {
	if seen := node.LastSeenAt; seen != nil {
		return seen.UTC()
	}
	return time.Time{}
}
// recoverySeedFromEndpointReport builds a core-mesh bootstrap recovery seed for
// nodeID. It starts from the given endpoint with "direct_http" transport and a
// priority of 10+index, then lets the first candidate that has a non-blank
// address override the endpoint, transport (when set), connectivity mode,
// region and last-verified time. The zero seed is returned when the node ID or
// the resulting endpoint is empty.
func recoverySeedFromEndpointReport(nodeID, endpoint string, candidates []PeerEndpointCandidate, index int) PeerRecoverySeed {
	result := PeerRecoverySeed{
		NodeID:    strings.TrimSpace(nodeID),
		Endpoint:  strings.TrimRight(strings.TrimSpace(endpoint), "/"),
		Transport: "direct_http",
		Priority:  10 + index,
		Metadata:  json.RawMessage(`{"source":"core_mesh_bootstrap"}`),
	}

	for i := range candidates {
		address := strings.TrimSpace(candidates[i].Address)
		if address == "" {
			continue
		}
		// Only the first addressable candidate is considered.
		picked := candidates[i]
		result.Endpoint = strings.TrimRight(address, "/")
		if strings.TrimSpace(picked.Transport) != "" {
			result.Transport = picked.Transport
		}
		result.ConnectivityMode = picked.ConnectivityMode
		result.Region = picked.Region
		if picked.LastVerifiedAt != nil {
			result.LastVerifiedAt = picked.LastVerifiedAt
		}
		break
	}

	if result.NodeID == "" || result.Endpoint == "" {
		return PeerRecoverySeed{}
	}
	return result
}
// firstNonEmptyString returns the first argument that is non-empty after
// trimming surrounding whitespace, in its trimmed form, or "" if there is none.
func firstNonEmptyString(values ...string) string {
	for i := range values {
		candidate := strings.TrimSpace(values[i])
		if candidate == "" {
			continue
		}
		return candidate
	}
	return ""
}
// trimStringSlice returns the whitespace-trimmed, non-empty entries of values
// in their original order, keeping only the first occurrence of each. The
// result is never nil.
func trimStringSlice(values []string) []string {
	cleaned := make([]string, 0)
	for i := range values {
		item := strings.TrimSpace(values[i])
		if item == "" || containsString(cleaned, item) {
			continue
		}
		cleaned = append(cleaned, item)
	}
	return cleaned
}
// trimEndpointSlice normalizes a list of endpoints: each entry has surrounding
// whitespace and then trailing slashes removed, empty results are dropped, and
// only the first occurrence of each endpoint is kept, in original order. The
// result is never nil.
func trimEndpointSlice(values []string) []string {
	cleaned := make([]string, 0)
	for i := range values {
		item := strings.TrimSpace(values[i])
		item = strings.TrimRight(item, "/")
		if item == "" || containsString(cleaned, item) {
			continue
		}
		cleaned = append(cleaned, item)
	}
	return cleaned
}
// normalizeUpdateToken canonicalizes an update selector token (OS, arch,
// install type, ...) by trimming surrounding whitespace and lower-casing it.
func normalizeUpdateToken(value string) string {
	trimmed := strings.TrimSpace(value)
	return strings.ToLower(trimmed)
}
// selectReleaseArtifact picks the first artifact, in release order, that
// belongs to an "active" release, matches the policy's target version (when one
// is set), and whose normalized OS, architecture and install type equal the
// corresponding input values. The returned artifact's URLs are replaced by the
// merged list from releaseArtifactURLs. The boolean is false when nothing
// matches.
func selectReleaseArtifact(releases []ReleaseVersion, input GetNodeUpdatePlanInput, policy NodeUpdatePolicy) (ReleaseVersion, ReleaseArtifact, bool) {
	var pinnedVersion string
	if policy.TargetVersion != nil {
		pinnedVersion = strings.TrimSpace(*policy.TargetVersion)
	}

	for _, candidate := range releases {
		if candidate.Status != "active" {
			continue
		}
		if pinnedVersion != "" && candidate.Version != pinnedVersion {
			continue
		}
		for _, item := range candidate.Artifacts {
			if normalizeUpdateToken(item.OS) != input.OS {
				continue
			}
			if normalizeUpdateToken(item.Arch) != input.Arch {
				continue
			}
			if normalizeUpdateToken(item.InstallType) != input.InstallType {
				continue
			}
			item.URLs = releaseArtifactURLs(item)
			return candidate, item, true
		}
	}
	return ReleaseVersion{}, ReleaseArtifact{}, false
}
// releaseArtifactURLs collects every download URL known for an artifact: its
// primary URL, its URL list and, when the artifact metadata is valid JSON, the
// "url", "urls", "mirror_urls" and "mirrors" metadata entries, in that order.
// The list is normalized and de-duplicated with trimEndpointSlice after each
// group is added.
func releaseArtifactURLs(artifact ReleaseArtifact) []string {
	merged := trimEndpointSlice(append([]string{artifact.URL}, artifact.URLs...))
	if len(artifact.Metadata) == 0 || !json.Valid(artifact.Metadata) {
		return merged
	}

	var extra struct {
		URL        string   `json:"url"`
		URLs       []string `json:"urls"`
		MirrorURLs []string `json:"mirror_urls"`
		Mirrors    []string `json:"mirrors"`
	}
	if err := json.Unmarshal(artifact.Metadata, &extra); err != nil {
		return merged
	}

	// Each group is normalized as it is added, exactly one pass per group.
	groups := [][]string{{extra.URL}, extra.URLs, extra.MirrorURLs, extra.Mirrors}
	for _, group := range groups {
		merged = trimEndpointSlice(append(merged, group...))
	}
	return merged
}
// normalizeArtifactOrigin reduces a URL to its origin ("scheme://host[:port]").
// It returns "" for blank input, unparsable URLs, and URLs without a scheme or
// host.
func normalizeArtifactOrigin(value string) string {
	candidate := strings.TrimRight(strings.TrimSpace(value), "/")
	if candidate == "" {
		return ""
	}
	u, err := url.Parse(candidate)
	if err != nil {
		return ""
	}
	if u.Scheme == "" || u.Host == "" {
		return ""
	}
	return strings.Join([]string{u.Scheme, u.Host}, "://")
}
// absolutizeReleaseArtifact returns a copy of artifact whose primary URL and
// URL list have been made absolute against origin (see absolutizeArtifactURL).
// With an empty origin the artifact is returned unchanged.
//
// The URL list is rewritten into a fresh slice: the artifact struct is passed
// by value, but its URLs slice still shares a backing array with the caller's
// copy, so writing through it in place would silently modify the caller's
// data.
func absolutizeReleaseArtifact(artifact ReleaseArtifact, origin string) ReleaseArtifact {
	if origin == "" {
		return artifact
	}
	artifact.URL = absolutizeArtifactURL(artifact.URL, origin)
	// Empty and nil lists are left exactly as they were.
	if len(artifact.URLs) > 0 {
		urls := make([]string, len(artifact.URLs))
		for i, raw := range artifact.URLs {
			urls[i] = absolutizeArtifactURL(raw, origin)
		}
		artifact.URLs = urls
	}
	return artifact
}
// absolutizeArtifactURL resolves a root-relative artifact URL against origin.
// The input is whitespace-trimmed first. URLs that already carry a scheme,
// values that do not start with "/", and calls with an empty URL or origin are
// returned as the trimmed input; otherwise origin is prepended.
func absolutizeArtifactURL(raw, origin string) string {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" || origin == "" {
		return trimmed
	}
	if u, err := url.Parse(trimmed); err == nil && u.IsAbs() {
		return trimmed
	}
	if !strings.HasPrefix(trimmed, "/") {
		return trimmed
	}
	return origin + trimmed
}
// hostAgentPlatformMismatch reports whether a non-Windows "rap-host-agent"
// update request comes from a node whose recent "rap-node-agent" update
// statuses (up to the 20 returned by the store) look like Windows. Requests for
// other products, and requests that already identify as Windows, never
// mismatch. Store errors are returned unchanged.
func (s *Service) hostAgentPlatformMismatch(ctx context.Context, input GetNodeUpdatePlanInput) (bool, error) {
	if input.Product != "rap-host-agent" || nodeUpdateRequestIsWindows(input) {
		return false, nil
	}
	recent, err := s.store.ListNodeUpdateStatuses(ctx, input.ClusterID, input.NodeID, 20)
	if err != nil {
		return false, err
	}
	for _, entry := range recent {
		if entry.Product == "rap-node-agent" && nodeUpdateStatusLooksWindows(entry) {
			return true, nil
		}
	}
	return false, nil
}
// nodeUpdateRequestIsWindows reports whether an update plan request targets
// Windows: its normalized OS is "windows" or its normalized install type
// contains "windows".
func nodeUpdateRequestIsWindows(input GetNodeUpdatePlanInput) bool {
	if normalizeUpdateToken(input.OS) == "windows" {
		return true
	}
	return strings.Contains(normalizeUpdateToken(input.InstallType), "windows")
}
// nodeUpdateStatusLooksWindows inspects an update status payload for signs that
// the reporting agent runs on Windows. It is true when "os", "runtime_os" or
// "goos" normalizes to "windows", or when "binary_path", "task" or
// "windows_task_name" contains a drive-letter separator (`:\`), ".exe", or the
// text "rap node agent " (compared in lower case). Empty or undecodable
// payloads yield false.
func nodeUpdateStatusLooksWindows(status NodeUpdateStatus) bool {
	if len(status.Payload) == 0 {
		return false
	}
	var fields map[string]any
	if err := json.Unmarshal(status.Payload, &fields); err != nil {
		return false
	}

	for _, key := range []string{"os", "runtime_os", "goos"} {
		if normalizeUpdateToken(stringFromAny(fields[key])) == "windows" {
			return true
		}
	}
	for _, key := range []string{"binary_path", "task", "windows_task_name"} {
		hint := strings.ToLower(strings.TrimSpace(stringFromAny(fields[key])))
		switch {
		case strings.Contains(hint, `:\`),
			strings.Contains(hint, `.exe`),
			strings.Contains(hint, "rap node agent "):
			return true
		}
	}
	return false
}
// stringFromAny returns value when its dynamic type is string, and "" for any
// other type (including nil).
func stringFromAny(value any) string {
	if text, isString := value.(string); isString {
		return text
	}
	return ""
}
// boolPtrValue dereferences value, returning fallback when value is nil.
func boolPtrValue(value *bool, fallback bool) bool {
	if value != nil {
		return *value
	}
	return fallback
}
// positiveOrDefault returns value when it is strictly positive, and fallback
// otherwise.
func positiveOrDefault(value, fallback int) int {
	if value <= 0 {
		return fallback
	}
	return value
}
// nonNegativeOrDefault returns value when it is zero or positive, and fallback
// otherwise.
func nonNegativeOrDefault(value, fallback int) int {
	if value < 0 {
		return fallback
	}
	return value
}
// safeInstallProfileSlug converts an arbitrary string into a slug made of
// lower-case ASCII letters, digits and single dashes. The input is trimmed and
// lower-cased; every run of other characters collapses into one dash, and
// leading/trailing dashes are removed. The result may be empty.
func safeInstallProfileSlug(value string) string {
	lowered := strings.ToLower(strings.TrimSpace(value))
	var slug strings.Builder
	prevWasDash := false
	for _, ch := range lowered {
		switch {
		case ('a' <= ch && ch <= 'z') || ('0' <= ch && ch <= '9'):
			slug.WriteRune(ch)
			prevWasDash = false
		case !prevWasDash:
			// First character of a disallowed run: emit a single separator.
			slug.WriteByte('-')
			prevWasDash = true
		}
	}
	return strings.Trim(slug.String(), "-")
}
// safeArtifactFileName converts an arbitrary string into a file name made of
// lower-case ASCII letters, digits, '.', '_' and '-'. The input is trimmed and
// lower-cased, every run of other characters collapses into a single dash, and
// leading/trailing dashes are removed.
//
// "rap-node-agent" is returned when nothing usable remains. That includes names
// consisting only of dots: "." and ".." are directory references rather than
// file names, so returning them from a function whose result is used as a path
// or URL segment would not be safe.
func safeArtifactFileName(value string) string {
	const fallbackName = "rap-node-agent"

	value = strings.ToLower(strings.TrimSpace(value))
	var b strings.Builder
	b.Grow(len(value))
	lastDash := false
	for _, r := range value {
		ok := (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '.' || r == '_' || r == '-'
		if ok {
			b.WriteRune(r)
			lastDash = false
			continue
		}
		if !lastDash {
			b.WriteByte('-')
			lastDash = true
		}
	}
	out := strings.Trim(b.String(), "-")
	// Covers the empty result as well as ".", "..", "...", and so on.
	if strings.Trim(out, ".") == "" {
		return fallbackName
	}
	return out
}
// rendezvousRelayFeedback gathers stale-relay feedback from the latest
// heartbeat of every distinct, non-blank node on routePath. Nodes without a
// heartbeat contribute nothing. The result is never nil on success; a store
// error aborts the call.
func (s *Service) rendezvousRelayFeedback(ctx context.Context, clusterID string, routePath []string, now time.Time) ([]rendezvousRelayFeedbackEntry, error) {
	entries := []rendezvousRelayFeedbackEntry{}
	visited := make(map[string]bool, len(routePath))
	for _, rawNodeID := range routePath {
		nodeID := strings.TrimSpace(rawNodeID)
		if nodeID == "" || visited[nodeID] {
			continue
		}
		visited[nodeID] = true

		latest, err := s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, 1)
		if err != nil {
			return nil, err
		}
		if len(latest) > 0 {
			entries = append(entries, rendezvousRelayFeedbackFromHeartbeat(latest[0], now)...)
		}
	}
	return entries, nil
}
// rendezvousRelayFeedbackFromHeartbeat converts the rendezvous lease report in
// a heartbeat's metadata into feedback entries.
//
// It returns nil when the metadata is missing or invalid, when the heartbeat
// has no observation time, is dated more than a minute in the future, or is
// older than rendezvousRelayFeedbackMaxAge, and when the report names another
// node or cluster. Otherwise it returns one entry (possibly none, but a non-nil
// slice) per lease that is flagged stale, needing withdrawal or needing
// reselection and that names both a peer and a relay. A zero now means the
// current time.
func rendezvousRelayFeedbackFromHeartbeat(heartbeat NodeHeartbeat, now time.Time) []rendezvousRelayFeedbackEntry {
	if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) {
		return nil
	}

	reference := now.UTC()
	if now.IsZero() {
		reference = time.Now().UTC()
	}
	if heartbeat.ObservedAt.IsZero() {
		return nil
	}
	if heartbeat.ObservedAt.After(reference.Add(time.Minute)) {
		return nil
	}
	if reference.Sub(heartbeat.ObservedAt.UTC()) > rendezvousRelayFeedbackMaxAge {
		return nil
	}

	var envelope struct {
		MeshRendezvousLeaseReport heartbeatRendezvousLeaseReport `json:"mesh_rendezvous_lease_report"`
	}
	if json.Unmarshal(heartbeat.Metadata, &envelope) != nil {
		return nil
	}
	report := envelope.MeshRendezvousLeaseReport
	if report.NodeID != "" && report.NodeID != heartbeat.NodeID {
		return nil
	}
	if report.ClusterID != "" && report.ClusterID != heartbeat.ClusterID {
		return nil
	}

	entries := []rendezvousRelayFeedbackEntry{}
	for _, lease := range report.Leases {
		flagged := lease.StaleRelay || lease.WithdrawalNeeded || lease.ReselectionNeeded
		if !flagged {
			continue
		}
		peerID := strings.TrimSpace(lease.PeerNodeID)
		relayID := strings.TrimSpace(lease.RelayNodeID)
		if peerID == "" || relayID == "" {
			continue
		}
		entries = append(entries, rendezvousRelayFeedbackEntry{
			ReporterNodeID:    heartbeat.NodeID,
			RouteIDs:          append([]string{}, lease.RouteIDs...),
			LeaseID:           strings.TrimSpace(lease.LeaseID),
			PeerNodeID:        peerID,
			RelayNodeID:       relayID,
			ConnectionState:   strings.TrimSpace(lease.ConnectionState),
			Reason:            strings.TrimSpace(lease.Reason),
			WithdrawalNeeded:  lease.WithdrawalNeeded,
			ReselectionNeeded: lease.ReselectionNeeded,
			ObservedAt:        heartbeat.ObservedAt.UTC(),
		})
	}
	return entries
}
// rendezvousRelayReplacementHints gathers stale-relay replacement hints from
// the latest heartbeat of every distinct, non-blank node on routePath. Nodes
// without a heartbeat contribute nothing. The result is never nil on success; a
// store error aborts the call.
func (s *Service) rendezvousRelayReplacementHints(ctx context.Context, clusterID string, routePath []string, now time.Time) ([]RendezvousRelayPolicyDecision, error) {
	hints := []RendezvousRelayPolicyDecision{}
	visited := make(map[string]bool, len(routePath))
	for _, rawNodeID := range routePath {
		nodeID := strings.TrimSpace(rawNodeID)
		if nodeID == "" || visited[nodeID] {
			continue
		}
		visited[nodeID] = true

		latest, err := s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, 1)
		if err != nil {
			return nil, err
		}
		if len(latest) > 0 {
			hints = append(hints, rendezvousRelayReplacementHintsFromHeartbeat(latest[0], now)...)
		}
	}
	return hints, nil
}
// rendezvousRelayReplacementHintsFromHeartbeat converts the route path decision
// report in a heartbeat's metadata into relay replacement hints.
//
// It returns nil under the same metadata, freshness and identity conditions as
// rendezvousRelayFeedbackFromHeartbeat. A decision becomes a hint only when it
// has a route ID, a selected relay and a stale relay, its source is
// "stale_relay_replacement", it is control-plane only without production
// forwarding, it has not expired, and a peer node can be determined (from the
// decision itself or via replacementPeerNodeIDFromDecision). A zero now means
// the current time.
func rendezvousRelayReplacementHintsFromHeartbeat(heartbeat NodeHeartbeat, now time.Time) []RendezvousRelayPolicyDecision {
	if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) {
		return nil
	}

	reference := now.UTC()
	if now.IsZero() {
		reference = time.Now().UTC()
	}
	if heartbeat.ObservedAt.IsZero() {
		return nil
	}
	if heartbeat.ObservedAt.After(reference.Add(time.Minute)) {
		return nil
	}
	if reference.Sub(heartbeat.ObservedAt.UTC()) > rendezvousRelayFeedbackMaxAge {
		return nil
	}

	var envelope struct {
		MeshRoutePathDecisionReport struct {
			ClusterID string              `json:"cluster_id"`
			NodeID    string              `json:"node_id"`
			Decisions []RoutePathDecision `json:"decisions"`
		} `json:"mesh_route_path_decision_report"`
	}
	if json.Unmarshal(heartbeat.Metadata, &envelope) != nil {
		return nil
	}
	report := envelope.MeshRoutePathDecisionReport
	if report.NodeID != "" && report.NodeID != heartbeat.NodeID {
		return nil
	}
	if report.ClusterID != "" && report.ClusterID != heartbeat.ClusterID {
		return nil
	}

	hints := []RendezvousRelayPolicyDecision{}
	for _, decision := range report.Decisions {
		routeID := strings.TrimSpace(decision.RouteID)
		selectedRelayID := strings.TrimSpace(decision.SelectedRelayID)
		staleRelayID := strings.TrimSpace(decision.StaleRelayNodeID)
		if routeID == "" || selectedRelayID == "" || staleRelayID == "" {
			continue
		}
		if decision.DecisionSource != "stale_relay_replacement" {
			continue
		}
		// Hints are only accepted from control-plane-only decisions.
		if decision.ProductionForwarding || !decision.ControlPlaneOnly {
			continue
		}
		// A zero expiry means the decision does not expire.
		if !decision.ExpiresAt.IsZero() && !decision.ExpiresAt.After(reference) {
			continue
		}

		peerID := strings.TrimSpace(decision.RendezvousPeerNodeID)
		if peerID == "" {
			peerID = replacementPeerNodeIDFromDecision(decision)
		}
		if peerID == "" {
			continue
		}

		hints = append(hints, RendezvousRelayPolicyDecision{
			RouteID:          routeID,
			PeerNodeID:       peerID,
			StaleRelayNodeID: staleRelayID,
			SelectedRelayID:  selectedRelayID,
			SelectedEndpoint: strings.TrimRight(strings.TrimSpace(decision.SelectedRelayEndpoint), "/"),
			Score:            decision.PathScore,
			Reason:           "stale_relay_replacement",
			ScoreReasons:     append([]string{}, decision.ScoreReasons...),
			ReporterNodeID:   heartbeat.NodeID,
		})
	}
	return hints
}
// replacementPeerNodeIDFromDecision infers the peer a replacement relay serves:
// the hop that directly follows the selected relay in the decision's cleaned
// effective hops, or, when the relay is absent or is the last hop, the trimmed
// destination node ID.
func replacementPeerNodeIDFromDecision(decision RoutePathDecision) string {
	hops := cleanRouteNodePath(decision.EffectiveHops)
	relayID := strings.TrimSpace(decision.SelectedRelayID)
	// Stop one short of the end: a relay in the last position has no successor.
	for i := 0; i+1 < len(hops); i++ {
		if hops[i] == relayID {
			return hops[i+1]
		}
	}
	return strings.TrimSpace(decision.DestinationNodeID)
}
// replacementHintFeedback turns relay replacement hints into feedback entries
// that mark the hinted stale relay for withdrawal and reselection. Hints
// lacking a route ID, peer, stale relay or selected relay are skipped. Entries
// are stamped with now (the current time when now is zero). An empty hint list
// yields nil.
func replacementHintFeedback(hints []RendezvousRelayPolicyDecision, now time.Time) []rendezvousRelayFeedbackEntry {
	if len(hints) == 0 {
		return nil
	}
	stamp := now.UTC()
	if now.IsZero() {
		stamp = time.Now().UTC()
	}

	entries := make([]rendezvousRelayFeedbackEntry, 0, len(hints))
	for i := range hints {
		routeID := strings.TrimSpace(hints[i].RouteID)
		peerID := strings.TrimSpace(hints[i].PeerNodeID)
		staleRelayID := strings.TrimSpace(hints[i].StaleRelayNodeID)
		if routeID == "" || peerID == "" || staleRelayID == "" {
			continue
		}
		if strings.TrimSpace(hints[i].SelectedRelayID) == "" {
			continue
		}
		entries = append(entries, rendezvousRelayFeedbackEntry{
			ReporterNodeID:    strings.TrimSpace(hints[i].ReporterNodeID),
			RouteIDs:          []string{routeID},
			PeerNodeID:        peerID,
			RelayNodeID:       staleRelayID,
			ConnectionState:   "replacement_hint",
			Reason:            "stale_relay_replacement_hint",
			WithdrawalNeeded:  true,
			ReselectionNeeded: true,
			ObservedAt:        stamp,
		})
	}
	return entries
}
// rendezvousRelayRouteHealthFeedback derives stale-relay feedback for route
// from the local node's link observations, keeping only the observations that
// rendezvousRelayRouteHealthFeedbackFromLink accepts. The result is never nil.
func rendezvousRelayRouteHealthFeedback(localNodeID string, route SyntheticMeshRouteConfig, links []MeshLinkObservation, now time.Time) []rendezvousRelayFeedbackEntry {
	entries := []rendezvousRelayFeedbackEntry{}
	for i := range links {
		if entry, accepted := rendezvousRelayRouteHealthFeedbackFromLink(localNodeID, route, links[i], now); accepted {
			entries = append(entries, entry)
		}
	}
	return entries
}
// rendezvousRelayRouteHealthFeedbackFromLink converts one link observation into
// stale-relay feedback for route.
//
// The observation must originate from the local node, be fresh, carry
// "synthetic_route_health" metadata for this route with an applied route path
// decision and a selected relay, and must not indicate any production or
// service payload forwarding. Feedback is produced only when the observation
// shows path drift, an "unreachable" link status, or a failure reason, and when
// a peer node can be determined. The boolean is false otherwise.
func rendezvousRelayRouteHealthFeedbackFromLink(localNodeID string, route SyntheticMeshRouteConfig, link MeshLinkObservation, now time.Time) (rendezvousRelayFeedbackEntry, bool) {
	var none rendezvousRelayFeedbackEntry

	localNodeID = strings.TrimSpace(localNodeID)
	if localNodeID == "" || strings.TrimSpace(route.RouteID) == "" {
		return none, false
	}
	if link.SourceNodeID != localNodeID {
		return none, false
	}
	if !meshLinkObservationFresh(link, now) {
		return none, false
	}

	metadata, ok := routeHealthMetadataFromLink(link)
	if !ok {
		return none, false
	}
	if metadata.ObservationType != "synthetic_route_health" || strings.TrimSpace(metadata.RouteID) != route.RouteID {
		return none, false
	}
	if !metadata.RoutePathDecisionApplied {
		return none, false
	}
	// Any sign of production or service payload forwarding disqualifies the
	// observation.
	forwarding := metadata.ProductionForwarding ||
		metadata.ProductionPayloadForwarding ||
		metadata.RouteHealthProductionPayloadForwarding ||
		metadata.RouteHealthServicePayloadForwarding
	if forwarding {
		return none, false
	}

	relayID := strings.TrimSpace(metadata.RoutePathDecisionSelectedRelayID)
	if relayID == "" {
		return none, false
	}

	// Drift takes precedence over unreachability, which takes precedence over
	// a bare failure reason; a healthy observation yields no feedback.
	var reason string
	switch {
	case metadata.RoutePathDriftDetected:
		reason = "synthetic_route_health_drift"
	case link.LinkStatus == "unreachable":
		reason = "synthetic_route_health_unreachable"
	case strings.TrimSpace(metadata.FailureReason) != "":
		reason = "synthetic_route_health_failure"
	default:
		return none, false
	}

	peerID := routeHealthPeerNodeID(metadata, route, link.TargetNodeID)
	if peerID == "" {
		return none, false
	}
	entry := rendezvousRelayFeedbackEntry{
		ReporterNodeID:    link.SourceNodeID,
		RouteIDs:          []string{route.RouteID},
		LeaseID:           strings.TrimSpace(metadata.RoutePathDecisionRendezvousLeaseID),
		PeerNodeID:        peerID,
		RelayNodeID:       relayID,
		ConnectionState:   reason,
		Reason:            reason,
		WithdrawalNeeded:  true,
		ReselectionNeeded: true,
		ObservedAt:        link.ObservedAt.UTC(),
	}
	return entry, true
}
// routeHealthMetadataFromLink decodes a link observation's metadata as route
// health metadata. The boolean is false (with a zero value) when the metadata
// is empty, is not valid JSON, or cannot be decoded into the expected shape.
func routeHealthMetadataFromLink(link MeshLinkObservation) (meshRouteHealthObservationMetadata, bool) {
	raw := link.Metadata
	if len(raw) == 0 || !json.Valid(raw) {
		return meshRouteHealthObservationMetadata{}, false
	}
	var decoded meshRouteHealthObservationMetadata
	if json.Unmarshal(raw, &decoded) != nil {
		return meshRouteHealthObservationMetadata{}, false
	}
	return decoded, true
}
// meshLinkObservationFresh reports whether a link observation is recent enough
// to act on: it must have an observation time that is at most one minute in the
// future and no older than rendezvousRelayFeedbackMaxAge relative to now (the
// current time when now is zero).
func meshLinkObservationFresh(link MeshLinkObservation, now time.Time) bool {
	reference := now.UTC()
	if now.IsZero() {
		reference = time.Now().UTC()
	}
	if link.ObservedAt.IsZero() {
		return false
	}
	if link.ObservedAt.After(reference.Add(time.Minute)) {
		return false
	}
	return reference.Sub(link.ObservedAt.UTC()) <= rendezvousRelayFeedbackMaxAge
}
// routeHealthPeerNodeID determines which peer a route health observation is
// about. Preference order: the rendezvous peer named in the metadata, the hop
// following the selected relay in the expected effective hops, the hop
// following it in the route's configured hops, the observation's target node,
// and finally the route's destination node.
func routeHealthPeerNodeID(metadata meshRouteHealthObservationMetadata, route SyntheticMeshRouteConfig, targetNodeID string) string {
	if explicit := strings.TrimSpace(metadata.RoutePathDecisionRendezvousPeerNodeID); explicit != "" {
		return explicit
	}

	relayID := strings.TrimSpace(metadata.RoutePathDecisionSelectedRelayID)
	for _, path := range [][]string{metadata.ExpectedEffectiveHops, route.Hops} {
		if next := nodeAfterInPath(cleanRouteNodePath(path), relayID); next != "" {
			return next
		}
	}

	if target := strings.TrimSpace(targetNodeID); target != "" {
		return target
	}
	return strings.TrimSpace(route.DestinationNodeID)
}
// nodeAfterInPath returns the element that directly follows the first
// occurrence of nodeID (whitespace-trimmed) that has a successor in path. It
// returns "" when nodeID is blank, is not in path, or only appears as the last
// element.
func nodeAfterInPath(path []string, nodeID string) string {
	wanted := strings.TrimSpace(nodeID)
	if wanted == "" {
		return ""
	}
	// The final element can never have a successor, so it is not examined.
	for i := 0; i+1 < len(path); i++ {
		if path[i] == wanted {
			return path[i+1]
		}
	}
	return ""
}
// newRendezvousRelayPolicy creates an empty relay policy for localNodeID. The
// link observations are copied, the node ID is whitespace-trimmed, and now is
// normalized to UTC (the current time is used when now is zero).
func newRendezvousRelayPolicy(localNodeID string, links []MeshLinkObservation, now time.Time) *rendezvousRelayPolicy {
	reference := now.UTC()
	if now.IsZero() {
		reference = time.Now().UTC()
	}
	policy := &rendezvousRelayPolicy{
		localNodeID:  strings.TrimSpace(localNodeID),
		now:          reference,
		withdrawn:    map[string]RendezvousRelayPolicyDecision{},
		replacements: map[string]RendezvousRelayPolicyDecision{},
	}
	policy.links = append([]MeshLinkObservation{}, links...)
	return policy
}
// addFeedback appends feedback entries to the policy. It is a no-op on a nil
// policy.
func (p *rendezvousRelayPolicy) addFeedback(items []rendezvousRelayFeedbackEntry) {
	if p != nil {
		p.feedback = append(p.feedback, items...)
	}
}
// staleForLease returns the first feedback entry that applies to routeID and
// matches lease, either by a shared non-empty lease ID or by the same
// (peer, relay) pair. The boolean is false when nothing matches or p is nil.
func (p *rendezvousRelayPolicy) staleForLease(routeID string, lease PeerRendezvousLease) (rendezvousRelayFeedbackEntry, bool) {
	if p == nil {
		return rendezvousRelayFeedbackEntry{}, false
	}
	for i := range p.feedback {
		entry := p.feedback[i]
		if !rendezvousFeedbackAppliesToRoute(entry, routeID) {
			continue
		}
		// Equal IDs with a non-empty entry ID imply the lease ID is non-empty too.
		sameLease := entry.LeaseID != "" && entry.LeaseID == lease.LeaseID
		samePair := entry.PeerNodeID == lease.PeerNodeID && entry.RelayNodeID == lease.RelayNodeID
		if sameLease || samePair {
			return entry, true
		}
	}
	return rendezvousRelayFeedbackEntry{}, false
}
// relayStale returns the first feedback entry recorded for the given
// (peer, relay) pair that applies to routeID. The boolean is false when no
// such entry exists or p is nil.
func (p *rendezvousRelayPolicy) relayStale(routeID string, peerNodeID string, relayNodeID string) (rendezvousRelayFeedbackEntry, bool) {
	if p == nil {
		return rendezvousRelayFeedbackEntry{}, false
	}
	for i := range p.feedback {
		entry := p.feedback[i]
		if entry.PeerNodeID != peerNodeID || entry.RelayNodeID != relayNodeID {
			continue
		}
		if rendezvousFeedbackAppliesToRoute(entry, routeID) {
			return entry, true
		}
	}
	return rendezvousRelayFeedbackEntry{}, false
}
// hasStalePeer returns the first feedback entry recorded for peerNodeID that
// applies to routeID, regardless of which relay it names. The boolean is
// false when no such entry exists or p is nil.
func (p *rendezvousRelayPolicy) hasStalePeer(routeID string, peerNodeID string) (rendezvousRelayFeedbackEntry, bool) {
	if p == nil {
		return rendezvousRelayFeedbackEntry{}, false
	}
	for i := range p.feedback {
		entry := p.feedback[i]
		if entry.PeerNodeID != peerNodeID {
			continue
		}
		if rendezvousFeedbackAppliesToRoute(entry, routeID) {
			return entry, true
		}
	}
	return rendezvousRelayFeedbackEntry{}, false
}
// recordWithdrawal stores a "stale_relay_withdrawn" decision for lease on
// route, keyed by route ID, lease ID and relay node ID, so repeated
// withdrawals of the same lease overwrite each other. It is a no-op on a nil
// policy.
func (p *rendezvousRelayPolicy) recordWithdrawal(route SyntheticMeshRouteConfig, lease PeerRendezvousLease, feedback rendezvousRelayFeedbackEntry) {
	if p == nil {
		return
	}
	var withdrawal RendezvousRelayPolicyDecision
	withdrawal.RouteID = route.RouteID
	withdrawal.PeerNodeID = lease.PeerNodeID
	withdrawal.WithdrawnLeaseID = lease.LeaseID
	withdrawal.StaleRelayNodeID = lease.RelayNodeID
	withdrawal.Reason = "stale_relay_withdrawn"
	withdrawal.ReporterNodeID = feedback.ReporterNodeID

	const sep = "\x00"
	p.withdrawn[route.RouteID+sep+lease.LeaseID+sep+lease.RelayNodeID] = withdrawal
}
// recordReplacement stores a "stale_relay_replacement" decision saying that
// selection replaces the relay named in feedback for peerNodeID on route.
// It does nothing on a nil policy or when selection names no relay.
func (p *rendezvousRelayPolicy) recordReplacement(route SyntheticMeshRouteConfig, peerNodeID string, feedback rendezvousRelayFeedbackEntry, selection rendezvousRelaySelection) {
	if p == nil {
		return
	}
	if selection.RelayNodeID == "" {
		return
	}

	var replacement RendezvousRelayPolicyDecision
	replacement.RouteID = route.RouteID
	replacement.PeerNodeID = peerNodeID
	replacement.StaleRelayNodeID = feedback.RelayNodeID
	replacement.SelectedRelayID = selection.RelayNodeID
	replacement.SelectedEndpoint = selection.Endpoint
	replacement.Score = selection.Score
	replacement.Reason = "stale_relay_replacement"
	// Copy the reasons so the stored decision does not alias the selection.
	replacement.ScoreReasons = append([]string{}, selection.Reasons...)
	replacement.ReporterNodeID = feedback.ReporterNodeID

	slot := rendezvousRelayReplacementKey(route.RouteID, peerNodeID, feedback.RelayNodeID, selection.RelayNodeID)
	p.replacements[slot] = replacement
}
// addReplacementHints merges externally supplied replacement decisions into
// the policy. Identifier fields are trimmed (the endpoint also loses trailing
// slashes); hints missing a route, peer, stale relay or selected relay are
// skipped. A missing Reason defaults to "stale_relay_replacement" and missing
// ScoreReasons default to "route_path_decision_hint". A hint replaces an
// existing entry for the same key only when its score is strictly higher.
func (p *rendezvousRelayPolicy) addReplacementHints(hints []RendezvousRelayPolicyDecision) {
	if p == nil {
		return
	}
	for i := range hints {
		candidate := hints[i]
		candidate.RouteID = strings.TrimSpace(candidate.RouteID)
		candidate.PeerNodeID = strings.TrimSpace(candidate.PeerNodeID)
		candidate.StaleRelayNodeID = strings.TrimSpace(candidate.StaleRelayNodeID)
		candidate.SelectedRelayID = strings.TrimSpace(candidate.SelectedRelayID)
		candidate.SelectedEndpoint = strings.TrimRight(strings.TrimSpace(candidate.SelectedEndpoint), "/")

		incomplete := candidate.RouteID == "" ||
			candidate.PeerNodeID == "" ||
			candidate.StaleRelayNodeID == "" ||
			candidate.SelectedRelayID == ""
		if incomplete {
			continue
		}

		if candidate.Reason == "" {
			candidate.Reason = "stale_relay_replacement"
		}
		if len(candidate.ScoreReasons) == 0 {
			candidate.ScoreReasons = []string{"route_path_decision_hint"}
		}

		slot := rendezvousRelayReplacementKey(candidate.RouteID, candidate.PeerNodeID, candidate.StaleRelayNodeID, candidate.SelectedRelayID)
		// Keep the stored decision unless the hint scores strictly better.
		if current, present := p.replacements[slot]; present && candidate.Score <= current.Score {
			continue
		}
		p.replacements[slot] = candidate
	}
}
// report summarises the policy state as a RendezvousRelayPolicyReport.
//
// It returns nil for a nil policy or when no feedback, withdrawals or
// replacements were recorded. Otherwise every withdrawal and replacement
// decision is included, ordered by RouteID, PeerNodeID, Reason,
// SelectedRelayID, StaleRelayNodeID and WithdrawnLeaseID.
//
// The last two sort keys exist because the decisions are collected from maps,
// whose iteration order is unspecified. Withdrawals always carry an empty
// SelectedRelayID, so without the extra keys two withdrawals for the same
// route and peer would be emitted in a random order from call to call.
func (p *rendezvousRelayPolicy) report() *RendezvousRelayPolicyReport {
	if p == nil || (len(p.feedback) == 0 && len(p.withdrawn) == 0 && len(p.replacements) == 0) {
		return nil
	}
	decisions := make([]RendezvousRelayPolicyDecision, 0, len(p.withdrawn)+len(p.replacements))
	for _, decision := range p.withdrawn {
		decisions = append(decisions, decision)
	}
	for _, decision := range p.replacements {
		decisions = append(decisions, decision)
	}
	sort.SliceStable(decisions, func(i, j int) bool {
		a, b := &decisions[i], &decisions[j]
		switch {
		case a.RouteID != b.RouteID:
			return a.RouteID < b.RouteID
		case a.PeerNodeID != b.PeerNodeID:
			return a.PeerNodeID < b.PeerNodeID
		case a.Reason != b.Reason:
			return a.Reason < b.Reason
		case a.SelectedRelayID != b.SelectedRelayID:
			return a.SelectedRelayID < b.SelectedRelayID
		case a.StaleRelayNodeID != b.StaleRelayNodeID:
			// Tie-breaker: distinguishes decisions that differ only by stale relay.
			return a.StaleRelayNodeID < b.StaleRelayNodeID
		default:
			// Tie-breaker: distinguishes withdrawals of different leases.
			return a.WithdrawnLeaseID < b.WithdrawnLeaseID
		}
	})
	return &RendezvousRelayPolicyReport{
		SchemaVersion:         "c17z15.rendezvous_relay_policy.v1",
		ScoringMode:           "route_adjacency_endpoint_priority_mesh_link_health_synthetic_route_health_feedback",
		FeedbackMaxAgeSeconds: int(rendezvousRelayFeedbackMaxAge / time.Second),
		StaleRelayCount:       len(p.feedback),
		WithdrawnLeaseCount:   len(p.withdrawn),
		ReplacementLeaseCount: len(p.replacements),
		Decisions:             decisions,
	}
}
// replacementDecision looks up the recorded replacement decision that selects
// selectedRelayID for peerNodeID on routeID. The boolean is false when no
// decision matches or p is nil.
//
// Replacements are keyed by (route, peer, stale relay, selected relay), so
// several entries can match the same (route, peer, selected relay) triple.
// Map iteration order is unspecified, so the match is chosen deterministically:
// the highest Score wins, and ties go to the smallest StaleRelayNodeID.
func (p *rendezvousRelayPolicy) replacementDecision(routeID string, peerNodeID string, selectedRelayID string) (RendezvousRelayPolicyDecision, bool) {
	if p == nil {
		return RendezvousRelayPolicyDecision{}, false
	}
	var best RendezvousRelayPolicyDecision
	found := false
	for _, decision := range p.replacements {
		if decision.RouteID != routeID ||
			decision.PeerNodeID != peerNodeID ||
			decision.SelectedRelayID != selectedRelayID {
			continue
		}
		better := !found ||
			decision.Score > best.Score ||
			(decision.Score == best.Score && decision.StaleRelayNodeID < best.StaleRelayNodeID)
		if better {
			best = decision
			found = true
		}
	}
	return best, found
}
// rendezvousRelayReplacementKey builds the map key for a replacement decision
// by trimming each identifier and joining them with NUL separators in the
// order route, peer, stale relay, selected relay.
func rendezvousRelayReplacementKey(routeID string, peerNodeID string, staleRelayNodeID string, selectedRelayID string) string {
	parts := [...]string{routeID, peerNodeID, staleRelayNodeID, selectedRelayID}
	for i := range parts {
		parts[i] = strings.TrimSpace(parts[i])
	}
	return strings.Join(parts[:], "\x00")
}
// routePathDecisionReport builds the route path decision report for
// generation using the default fabric service channel recovery policy.
func routePathDecisionReport(generation string, decisions []RoutePathDecision) *RoutePathDecisionReport {
	defaults := defaultFabricServiceChannelRecoveryPolicy()
	return routePathDecisionReportWithRecoveryPolicy(generation, decisions, defaults)
}
// routePathDecisionReportWithRecoveryPolicy aggregates decisions into a
// RoutePathDecisionReport. It returns nil when there are no decisions.
//
// The decisions are copied and sorted by RouteID then DecisionID, and the
// report counts:
//   - replacements: decisions whose source is one of the replacement sources,
//     or a remediation command that names a replacement route;
//   - recovery hysteresis / promoted / demoted: decisions carrying the
//     corresponding score reason;
//   - degraded: decisions with the "service_channel_feedback_no_alternate"
//     source or a "no_alternate" rebuild status;
//   - rebuild requests: any recognised rebuild status, with "applied"
//     additionally counted as a rebuild applied.
//
// policy is normalised against the default recovery policy before being
// referenced in the report.
func routePathDecisionReportWithRecoveryPolicy(generation string, decisions []RoutePathDecision, policy FabricServiceChannelRecoveryPolicy) *RoutePathDecisionReport {
	if len(decisions) == 0 {
		return nil
	}
	policy = normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy())

	ordered := make([]RoutePathDecision, len(decisions))
	copy(ordered, decisions)
	sort.SliceStable(ordered, func(i, j int) bool {
		if ordered[i].RouteID == ordered[j].RouteID {
			return ordered[i].DecisionID < ordered[j].DecisionID
		}
		return ordered[i].RouteID < ordered[j].RouteID
	})

	var (
		replacementCount int
		degradedCount    int
		requestCount     int
		appliedCount     int
		hysteresisCount  int
		promotedCount    int
		demotedCount     int
	)
	for i := range ordered {
		item := &ordered[i]

		switch item.DecisionSource {
		case "stale_relay_replacement",
			"service_channel_feedback_replacement",
			"service_channel_feedback_exit_pool_replacement",
			"service_channel_feedback_entry_pool_replacement",
			"service_channel_feedback_entry_exit_pool_replacement":
			replacementCount++
		case "service_channel_remediation_command":
			// A remediation command only counts when it names a replacement route.
			if strings.TrimSpace(item.ReplacementRouteID) != "" {
				replacementCount++
			}
		}

		if containsString(item.ScoreReasons, "service_channel_recovery_hysteresis") {
			hysteresisCount++
		}
		if containsString(item.ScoreReasons, "service_channel_recovery_promoted") {
			promotedCount++
		}
		if containsString(item.ScoreReasons, "service_channel_recovery_demoted") {
			demotedCount++
		}

		if item.DecisionSource == "service_channel_feedback_no_alternate" || item.RebuildStatus == "no_alternate" {
			degradedCount++
		}

		switch item.RebuildStatus {
		case "applied":
			requestCount++
			appliedCount++
		case "requested", "pending_degraded_fallback", "no_alternate", "deferred_by_policy", "expired":
			requestCount++
		}
	}

	return &RoutePathDecisionReport{
		SchemaVersion:            "c17z18.route_path_decisions.v1",
		DecisionMode:             "control_plane_effective_path_from_relay_policy_and_service_channel_feedback",
		Generation:               generation,
		RecoveryPolicy:           fabricServiceChannelRecoveryPolicyRef(policy),
		DecisionCount:            len(ordered),
		ReplacementDecisionCount: replacementCount,
		DegradedDecisionCount:    degradedCount,
		RebuildRequestCount:      requestCount,
		RebuildAppliedCount:      appliedCount,
		RecoveryHysteresisCount:  hysteresisCount,
		RecoveryPromotedCount:    promotedCount,
		RecoveryDemotedCount:     demotedCount,
		ControlPlaneOnly:         true,
		ProductionForwarding:     false,
		Decisions:                ordered,
	}
}
// serviceChannelFeedbackRequestsRebuild reports whether item asks for a route
// rebuild. Only feedback for a named, fenced route that is not in manual
// retry qualifies; it must also recommend a rebuild or degraded fallback,
// report at least two consecutive failures, or carry the
// "service_channel_route_rebuild_recommended" reason.
func serviceChannelFeedbackRequestsRebuild(item fabricServiceChannelRouteFeedback) bool {
	eligible := item.RouteID != "" && item.Fenced && !item.ManualRetry
	if !eligible {
		return false
	}
	switch {
	case item.RouteRebuildRecommended,
		item.DegradedFallbackRecommended,
		item.ConsecutiveFailures >= 2:
		return true
	}
	return containsString(item.Reasons, "service_channel_route_rebuild_recommended")
}
// serviceChannelRebuildRequestID composes a rebuild request identifier of the
// form "<route>[-<reporter>][-<generation>]-rebuild". Every part is trimmed;
// a blank route becomes "route" and blank reporter or generation parts are
// omitted.
func serviceChannelRebuildRequestID(routeID, reporterNodeID, generation string) string {
	var id strings.Builder
	if route := strings.TrimSpace(routeID); route != "" {
		id.WriteString(route)
	} else {
		id.WriteString("route")
	}
	for _, part := range [...]string{reporterNodeID, generation} {
		if trimmed := strings.TrimSpace(part); trimmed != "" {
			id.WriteByte('-')
			id.WriteString(trimmed)
		}
	}
	id.WriteString("-rebuild")
	return id.String()
}
// serviceChannelRouteReplacementDecision builds the route path decision for a
// route that service-channel feedback has fenced.
//
// The decision starts as "service_channel_feedback_no_alternate" with empty
// effective hops and a zero path score. If the feedback for the fenced route
// requests a rebuild, the decision is marked "pending_degraded_fallback".
// When selectServiceChannelRouteReplacement finds an alternate route, the
// decision is rewritten to point at it: the score reasons are reset, the
// decision source reflects whether the entry and/or exit node changed, a
// pending rebuild becomes "applied", and the expiry is capped at the
// replacement's expiry when that is earlier.
func (s *Service) serviceChannelRouteReplacementDecision(input GetNodeSyntheticMeshConfigInput, fencedRoute SyntheticMeshRouteConfig, intents []MeshRouteIntent, feedback map[string]fabricServiceChannelRouteFeedback, generation string) RoutePathDecision {
// A missing map entry yields the zero-value feedback (empty RouteID).
routeFeedback := feedback[fencedRoute.RouteID]
decision := RoutePathDecision{
DecisionID: fencedRoute.RouteID + "-path-" + input.NodeID + "-service-channel-feedback",
RouteID: fencedRoute.RouteID,
ClusterID: fencedRoute.ClusterID,
LocalNodeID: input.NodeID,
SourceNodeID: fencedRoute.SourceNodeID,
DestinationNodeID: fencedRoute.DestinationNodeID,
OriginalHops: append([]string{}, fencedRoute.Hops...),
EffectiveHops: []string{},
DecisionSource: "service_channel_feedback_no_alternate",
Generation: generation,
PathScore: 0,
ScoreReasons: []string{"service_channel_fenced_route", "no_unfenced_alternate_route"},
ControlPlaneOnly: true,
ProductionForwarding: false,
ExpiresAt: fencedRoute.ExpiresAt.UTC(),
}
applyServiceChannelFeedbackCorrelationToDecision(&decision, routeFeedback)
if serviceChannelFeedbackRequestsRebuild(routeFeedback) {
decision.RebuildRequestID = serviceChannelRebuildRequestID(fencedRoute.RouteID, input.NodeID, generation)
decision.RebuildStatus = "pending_degraded_fallback"
decision.RebuildReason = "service_channel_feedback_rebuild_requested"
decision.RebuildAttempt = routeFeedback.ConsecutiveFailures
decision.ScoreReasons = append(decision.ScoreReasons, "service_channel_rebuild_requested", "backend_relay_degraded_fallback_until_rebuild")
if routeFeedback.DegradedFallbackRecommended {
decision.ScoreReasons = append(decision.ScoreReasons, "service_channel_degraded_fallback_recommended")
}
}
replacement, replacementFeedback, ok := s.selectServiceChannelRouteReplacement(input, fencedRoute, intents, feedback)
if ok {
decision.ReplacementRouteID = replacement.RouteID
decision.EffectiveHops = append([]string{}, replacement.Hops...)
decision.DecisionSource = "service_channel_feedback_replacement"
decision.PathScore = serviceChannelReplacementRouteScore(replacement)
// The reasons accumulated above (including any rebuild-request reasons) are
// discarded here and rebuilt for the replacement.
decision.ScoreReasons = []string{"service_channel_fenced_route", "selected_unfenced_alternate_route"}
// The three checks below run in order, so the last matching one decides
// DecisionSource while every matching one contributes a score reason.
if replacement.SourceNodeID != fencedRoute.SourceNodeID {
decision.DecisionSource = "service_channel_feedback_entry_pool_replacement"
decision.ScoreReasons = append(decision.ScoreReasons, "selected_unfenced_entry_pool_route")
}
if replacement.DestinationNodeID != fencedRoute.DestinationNodeID {
decision.DecisionSource = "service_channel_feedback_exit_pool_replacement"
decision.ScoreReasons = append(decision.ScoreReasons, "selected_unfenced_exit_pool_route")
}
if replacement.SourceNodeID != fencedRoute.SourceNodeID && replacement.DestinationNodeID != fencedRoute.DestinationNodeID {
decision.DecisionSource = "service_channel_feedback_entry_exit_pool_replacement"
decision.ScoreReasons = append(decision.ScoreReasons, "selected_unfenced_entry_exit_pool_route")
}
// A rebuild requested above is considered satisfied by the alternate route.
if decision.RebuildRequestID != "" {
decision.RebuildStatus = "applied"
decision.RebuildReason = "service_channel_feedback_rebuild_applied_to_alternate"
decision.ScoreReasons = append(decision.ScoreReasons, "service_channel_rebuild_applied")
}
// Unfenced feedback recorded for the replacement boosts its score and its
// reasons are carried over.
if replacementFeedback.RouteID != "" && !replacementFeedback.Fenced {
decision.PathScore += 10000
decision.ScoreReasons = append(decision.ScoreReasons, "active_healthy_feedback_dampening_window")
decision.ScoreReasons = append(decision.ScoreReasons, replacementFeedback.Reasons...)
}
decision.ScoreReasons = dedupeStrings(decision.ScoreReasons)
// NOTE(review): a zero replacement.ExpiresAt is "before" any non-zero expiry
// and would reset the decision expiry to the zero time — confirm routes
// always carry an expiry here.
if replacement.ExpiresAt.Before(decision.ExpiresAt) {
decision.ExpiresAt = replacement.ExpiresAt.UTC()
}
}
// With no replacement EffectiveHops is empty, so the local node resolves to
// "not_on_effective_path".
decision.PreviousHopID, decision.NextHopID, decision.LocalRole = routePathLocalPosition(decision.EffectiveHops, input.NodeID, "", "")
return decision
}
// applyServiceChannelFeedbackCorrelationToDecision copies the identifying
// fields of feedback (observation ID, source, channel, resource, violation
// status and reason, and non-zero timestamps converted to UTC) onto decision.
// It does nothing when decision is nil or feedback has no RouteID.
func applyServiceChannelFeedbackCorrelationToDecision(decision *RoutePathDecision, feedback fabricServiceChannelRouteFeedback) {
	if decision == nil {
		return
	}
	if feedback.RouteID == "" {
		return
	}

	decision.FeedbackObservationID = feedback.ObservationID
	decision.FeedbackSource = feedback.Source
	decision.FeedbackChannelID = feedback.ChannelID
	decision.FeedbackResourceID = feedback.ResourceID
	decision.FeedbackViolationStatus = feedback.ViolationStatus
	decision.FeedbackViolationReason = feedback.ViolationReason

	// Timestamps are optional; each pointer gets its own UTC copy.
	if observed := feedback.ObservedAt; !observed.IsZero() {
		utc := observed.UTC()
		decision.FeedbackObservedAt = &utc
	}
	if expires := feedback.ExpiresAt; !expires.IsZero() {
		utc := expires.UTC()
		decision.FeedbackExpiresAt = &utc
	}
}
// selectServiceChannelRouteReplacement picks the best alternate route for
// fencedRoute among the routes derived from intents. The boolean is false
// when no candidate qualifies.
//
// A candidate must be a different route, share a replacement scope with the
// fenced route, allow at least one of the same channels, and not be fenced
// itself. Its score is the hop-based route score plus the intent priority,
// plus 10000 when any feedback exists for it, minus 5 for a different
// destination node and minus 10 for a different source node. The highest
// score wins; ties go to the lexicographically smaller RouteID.
func (s *Service) selectServiceChannelRouteReplacement(input GetNodeSyntheticMeshConfigInput, fencedRoute SyntheticMeshRouteConfig, intents []MeshRouteIntent, feedback map[string]fabricServiceChannelRouteFeedback) (SyntheticMeshRouteConfig, fabricServiceChannelRouteFeedback, bool) {
	var (
		best         SyntheticMeshRouteConfig
		bestFeedback fabricServiceChannelRouteFeedback
	)
	bestScore := -1
	scopes := fabricServiceChannelRouteIntentReplacementScopes(intents)

	for _, intent := range intents {
		candidate, _, _, _, _, ok := s.syntheticRouteFromIntent(input, intent)
		if !ok || candidate.RouteID == fencedRoute.RouteID {
			continue
		}
		if !fabricServiceChannelRoutesShareReplacementScope(fencedRoute, candidate, scopes) ||
			!fabricChannelsIntersect(candidate.AllowedChannels, fencedRoute.AllowedChannels) {
			continue
		}

		// A missing entry yields the zero-value feedback, which is not fenced.
		candidateFeedback, known := feedback[candidate.RouteID]
		if known && candidateFeedback.Fenced {
			continue
		}

		score := serviceChannelReplacementRouteScore(candidate) + intent.Priority
		if candidateFeedback.RouteID != "" {
			score += 10000
		}
		if candidate.DestinationNodeID != fencedRoute.DestinationNodeID {
			score -= 5
		}
		if candidate.SourceNodeID != fencedRoute.SourceNodeID {
			score -= 10
		}

		wins := score > bestScore || (score == bestScore && candidate.RouteID < best.RouteID)
		if !wins {
			continue
		}
		best, bestFeedback, bestScore = candidate, candidateFeedback, score
	}
	return best, bestFeedback, best.RouteID != ""
}
// serviceChannelReplacementRouteScore scores a replacement route as 1000
// minus 10 per hop, never dropping below 1.
func serviceChannelReplacementRouteScore(route SyntheticMeshRouteConfig) int {
	if score := 1000 - 10*len(route.Hops); score >= 1 {
		return score
	}
	return 1
}
// routePathDecisionForRoute builds the route path decision for route as seen
// from localNodeID.
//
// The decision defaults to the route's own hops ("route_intent", score 1000).
// Manual-retry service feedback adds its reasons and caps the expiry at the
// retry cooldown when that is earlier. The leases that reference the route
// are then scanned for a stale-relay replacement: a lease qualifies when
// relayPolicy holds a replacement decision for its (peer, relay) pair or when
// the lease itself carries the "stale_relay_replacement" reason. The
// qualifying lease with the highest relay decision score wins (the first one
// seen wins ties) and rewrites the effective hops, relay fields, score and
// score reasons of the decision.
func routePathDecisionForRoute(route SyntheticMeshRouteConfig, localNodeID string, leases []PeerRendezvousLease, relayPolicy *rendezvousRelayPolicy, generation string, serviceFeedback fabricServiceChannelRouteFeedback) RoutePathDecision {
decision := RoutePathDecision{
DecisionID: route.RouteID + "-path-" + localNodeID,
RouteID: route.RouteID,
ClusterID: route.ClusterID,
LocalNodeID: localNodeID,
SourceNodeID: route.SourceNodeID,
DestinationNodeID: route.DestinationNodeID,
OriginalHops: append([]string{}, route.Hops...),
EffectiveHops: append([]string{}, route.Hops...),
DecisionSource: "route_intent",
Generation: generation,
PathScore: 1000,
ScoreReasons: []string{"route_intent_hops"},
ControlPlaneOnly: true,
ProductionForwarding: false,
ExpiresAt: route.ExpiresAt.UTC(),
}
if serviceFeedback.ManualRetry {
decision.ScoreReasons = append(decision.ScoreReasons, "service_channel_route_retry_after_operator_expire")
decision.ScoreReasons = append(decision.ScoreReasons, serviceFeedback.Reasons...)
decision.ScoreReasons = dedupeStrings(decision.ScoreReasons)
// Never extend the decision past the retry cooldown.
if serviceFeedback.RetryCooldownUntil != nil && serviceFeedback.RetryCooldownUntil.Before(decision.ExpiresAt) {
decision.ExpiresAt = serviceFeedback.RetryCooldownUntil.UTC()
}
}
var replacementLease PeerRendezvousLease
var replacementDecision RendezvousRelayPolicyDecision
replacementFound := false
for _, lease := range leases {
if !containsString(lease.RouteIDs, route.RouteID) {
continue
}
// replacementDecision tolerates a nil relayPolicy and reports no match.
relayDecision, ok := relayPolicy.replacementDecision(route.RouteID, lease.PeerNodeID, lease.RelayNodeID)
if !ok && lease.Reason != "stale_relay_replacement" {
continue
}
if !ok {
// The lease is flagged as a replacement but the policy has no record of
// it: synthesise a decision with a zero score and no stale relay.
relayDecision = RendezvousRelayPolicyDecision{
RouteID: route.RouteID,
PeerNodeID: lease.PeerNodeID,
SelectedRelayID: lease.RelayNodeID,
SelectedEndpoint: lease.RelayEndpoint,
Reason: "stale_relay_replacement",
}
}
// Strictly-greater comparison: on equal scores the earlier lease is kept.
if !replacementFound || relayDecision.Score > replacementDecision.Score {
replacementFound = true
replacementLease = lease
replacementDecision = relayDecision
}
}
if replacementFound {
decision.DecisionID = route.RouteID + "-path-" + localNodeID + "-via-" + replacementLease.RelayNodeID
decision.EffectiveHops = effectiveRoutePathWithReplacement(route.Hops, replacementLease.PeerNodeID, replacementDecision.StaleRelayNodeID, replacementLease.RelayNodeID)
decision.SelectedRelayID = replacementLease.RelayNodeID
decision.SelectedRelayEndpoint = replacementLease.RelayEndpoint
decision.StaleRelayNodeID = replacementDecision.StaleRelayNodeID
decision.RendezvousPeerNodeID = replacementLease.PeerNodeID
decision.RendezvousLeaseID = replacementLease.LeaseID
decision.RendezvousLeaseReason = replacementLease.Reason
decision.DecisionSource = "stale_relay_replacement"
decision.PathScore = replacementDecision.Score
// A zero score (e.g. from a synthesised decision) falls back to the default.
if decision.PathScore == 0 {
decision.PathScore = 1000
}
// This replaces the score reasons gathered so far, including any
// manual-retry reasons added above.
decision.ScoreReasons = append([]string{}, replacementDecision.ScoreReasons...)
if len(decision.ScoreReasons) == 0 {
decision.ScoreReasons = []string{"relay_replacement_policy"}
}
}
decision.PreviousHopID, decision.NextHopID, decision.LocalRole = routePathLocalPosition(decision.EffectiveHops, localNodeID, decision.SelectedRelayID, decision.StaleRelayNodeID)
return decision
}
// effectiveRoutePathWithReplacement derives the effective hop list after a
// relay replacement. Hops are trimmed; blank hops and the stale relay (when
// given) are dropped. If selectedRelayID is non-empty and not already on the
// path, it is inserted immediately before the first occurrence of
// peerNodeID, or appended when the peer is not on the path.
func effectiveRoutePathWithReplacement(original []string, peerNodeID string, staleRelayNodeID string, selectedRelayID string) []string {
	// One spare slot so inserting the selected relay never reallocates.
	kept := make([]string, 0, len(original)+1)
	for _, raw := range original {
		hop := strings.TrimSpace(raw)
		if hop == "" {
			continue
		}
		if staleRelayNodeID != "" && hop == staleRelayNodeID {
			continue
		}
		kept = append(kept, hop)
	}
	if selectedRelayID == "" || containsString(kept, selectedRelayID) {
		return kept
	}

	// Default to the end of the path; move to the peer's slot when present.
	insertAt := len(kept)
	for i, hop := range kept {
		if hop == peerNodeID {
			insertAt = i
			break
		}
	}

	kept = append(kept, selectedRelayID)
	// Shift the tail one slot to the right to open a gap at insertAt.
	copy(kept[insertAt+1:], kept[insertAt:len(kept)-1])
	kept[insertAt] = selectedRelayID
	return kept
}
// routePathLocalPosition locates localNodeID on path and returns its previous
// hop, next hop and role.
//
// When the node is not on the path both hops are empty and the role is
// "withdrawn_relay" if the node is the stale relay, otherwise
// "not_on_effective_path". On the path the role is "entry" for the first
// hop, "exit" for the last hop, "selected_relay" for an interior hop equal to
// selectedRelayID, and "transit" otherwise.
func routePathLocalPosition(path []string, localNodeID string, selectedRelayID string, staleRelayNodeID string) (string, string, string) {
	position := -1
	for i := range path {
		if path[i] == localNodeID {
			position = i
			break
		}
	}
	if position == -1 {
		role := "not_on_effective_path"
		if staleRelayNodeID != "" && staleRelayNodeID == localNodeID {
			role = "withdrawn_relay"
		}
		return "", "", role
	}

	last := len(path) - 1
	var previous, next string
	if position > 0 {
		previous = path[position-1]
	}
	if position < last {
		next = path[position+1]
	}

	// Entry takes precedence over exit, which takes precedence over relay.
	if position == 0 {
		return previous, next, "entry"
	}
	if position == last {
		return previous, next, "exit"
	}
	if selectedRelayID != "" && selectedRelayID == localNodeID {
		return previous, next, "selected_relay"
	}
	return previous, next, "transit"
}
// rendezvousFeedbackAppliesToRoute reports whether item is relevant to
// routeID. Feedback without route IDs, or a blank routeID, matches
// everything; otherwise routeID must be listed in item.RouteIDs.
func rendezvousFeedbackAppliesToRoute(item rendezvousRelayFeedbackEntry, routeID string) bool {
	unscoped := strings.TrimSpace(routeID) == "" || len(item.RouteIDs) == 0
	return unscoped || containsString(item.RouteIDs, routeID)
}
// reachabilityFromConnectivityMode maps a connectivity mode to its
// reachability label. Unrecognised modes (including "") map to "unknown";
// matching is exact and case-sensitive.
func reachabilityFromConnectivityMode(connectivityMode string) string {
	table := [...][2]string{
		{"outbound_only", "outbound_only"},
		{"relay_required", "relay"},
		{"private_lan", "private"},
		{"direct", "public"},
	}
	for _, pair := range table {
		if pair[0] == connectivityMode {
			return pair[1]
		}
	}
	return "unknown"
}
// validatePeerRecoverySeeds checks a list of recovery seeds and returns
// ErrInvalidPayload when the list exceeds maxScopedRecoverySeeds, when a seed
// has a blank node ID or endpoint, an unsupported transport, an unsupported
// non-empty connectivity mode, or invalid JSON metadata, or when two seeds
// share the same trimmed (node ID, endpoint) pair.
func validatePeerRecoverySeeds(seeds []PeerRecoverySeed) error {
	if len(seeds) > maxScopedRecoverySeeds {
		return ErrInvalidPayload
	}
	known := make(map[string]struct{})
	for i := range seeds {
		seed := seeds[i]
		nodeID := strings.TrimSpace(seed.NodeID)
		endpoint := strings.TrimSpace(seed.Endpoint)

		switch {
		case nodeID == "", endpoint == "":
			return ErrInvalidPayload
		case !isPeerEndpointTransport(seed.Transport):
			return ErrInvalidPayload
		case seed.ConnectivityMode != "" && !isPeerEndpointConnectivityMode(seed.ConnectivityMode):
			return ErrInvalidPayload
		case len(seed.Metadata) > 0 && !json.Valid(seed.Metadata):
			return ErrInvalidPayload
		}

		identity := nodeID + "\x00" + endpoint
		if _, repeated := known[identity]; repeated {
			return ErrInvalidPayload
		}
		known[identity] = struct{}{}
	}
	return nil
}
// validatePeerRendezvousLeases checks rendezvous leases against routePath and
// returns ErrInvalidPayload when the list exceeds maxScopedRendezvousLeases
// or any lease is malformed: blank peer, relay or endpoint; peer equal to
// relay; peer or relay not on routePath; unsupported non-empty transport; a
// non-zero expiry that is not after now; invalid JSON metadata; or a lease ID
// that repeats an earlier one. Leases with a blank lease ID skip the
// duplicate check.
func validatePeerRendezvousLeases(leases []PeerRendezvousLease, routePath []string, now time.Time) error {
	if len(leases) > maxScopedRendezvousLeases {
		return ErrInvalidPayload
	}
	reference := now.UTC()
	knownIDs := make(map[string]struct{})
	for i := range leases {
		lease := leases[i]
		peer := strings.TrimSpace(lease.PeerNodeID)
		relay := strings.TrimSpace(lease.RelayNodeID)
		endpoint := strings.TrimSpace(lease.RelayEndpoint)
		transport := strings.TrimSpace(lease.Transport)

		switch {
		case peer == "", relay == "", endpoint == "":
			return ErrInvalidPayload
		case peer == relay:
			return ErrInvalidPayload
		case !containsString(routePath, peer), !containsString(routePath, relay):
			return ErrInvalidPayload
		case transport != "" && !isPeerRendezvousTransport(transport):
			return ErrInvalidPayload
		case !lease.ExpiresAt.IsZero() && !lease.ExpiresAt.After(reference):
			return ErrInvalidPayload
		case len(lease.Metadata) > 0 && !json.Valid(lease.Metadata):
			return ErrInvalidPayload
		}

		if strings.TrimSpace(lease.LeaseID) == "" {
			continue
		}
		// NOTE(review): the duplicate check uses the untrimmed lease ID, so IDs
		// that differ only in surrounding whitespace are treated as distinct —
		// confirm this is intended.
		if _, repeated := knownIDs[lease.LeaseID]; repeated {
			return ErrInvalidPayload
		}
		knownIDs[lease.LeaseID] = struct{}{}
	}
	return nil
}
// normalizeRendezvousLeases fills in defaults for each lease in the context
// of route and returns only the leases that are still unexpired at now.
//
// Peer, relay and endpoint are trimmed (the endpoint also loses trailing
// slashes). Missing values are defaulted: a lease ID derived from route, peer
// and relay; transport "relay_control"; connectivity mode "relay_required";
// priority 100; the route ID added to RouteIDs; allowed channels taken from
// the lease or the route; reason "policy_rendezvous_lease"; metadata "{}".
// Every lease is forced to ControlPlaneOnly and its expiry is capped at the
// route's expiry.
func normalizeRendezvousLeases(leases []PeerRendezvousLease, route SyntheticMeshRouteConfig, now time.Time) []PeerRendezvousLease {
out := make([]PeerRendezvousLease, 0, len(leases))
now = now.UTC()
// lease is a per-iteration copy, so the caller's slice elements are not
// modified by the assignments below.
for _, lease := range leases {
lease.PeerNodeID = strings.TrimSpace(lease.PeerNodeID)
lease.RelayNodeID = strings.TrimSpace(lease.RelayNodeID)
lease.RelayEndpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
// The generated ID uses the already-trimmed peer and relay IDs.
if lease.LeaseID == "" {
lease.LeaseID = route.RouteID + "-rv-" + lease.PeerNodeID + "-via-" + lease.RelayNodeID
}
if lease.Transport == "" {
lease.Transport = "relay_control"
}
if lease.ConnectivityMode == "" {
lease.ConnectivityMode = "relay_required"
}
if lease.Priority <= 0 {
lease.Priority = 100
}
if len(lease.RouteIDs) == 0 {
lease.RouteIDs = []string{route.RouteID}
} else if !containsString(lease.RouteIDs, route.RouteID) {
// Copy before appending so the caller's backing array is never written to.
lease.RouteIDs = append(append([]string{}, lease.RouteIDs...), route.RouteID)
}
lease.AllowedChannels = controlPlaneAllowedChannels(firstNonEmptyStringSlice(lease.AllowedChannels, route.AllowedChannels))
if len(lease.AllowedChannels) == 0 {
lease.AllowedChannels = []string{"fabric_control", "route_control"}
}
lease.ControlPlaneOnly = true
if lease.IssuedAt.IsZero() {
lease.IssuedAt = now
} else {
lease.IssuedAt = lease.IssuedAt.UTC()
}
// A lease without an expiry, or one outliving the route, inherits the
// route's expiry. If the route has no expiry either, the lease stays at the
// zero time and is dropped by the check at the end of the loop.
if lease.ExpiresAt.IsZero() || (!route.ExpiresAt.IsZero() && lease.ExpiresAt.After(route.ExpiresAt)) {
lease.ExpiresAt = route.ExpiresAt.UTC()
} else {
lease.ExpiresAt = lease.ExpiresAt.UTC()
}
if lease.Reason == "" {
lease.Reason = "policy_rendezvous_lease"
}
if lease.Metadata == nil {
lease.Metadata = json.RawMessage(`{}`)
}
// Only leases that expire strictly after now are kept.
if !lease.ExpiresAt.IsZero() && lease.ExpiresAt.After(now) {
out = append(out, lease)
}
}
return out
}
// scopedRendezvousLeases returns the normalised leases that are relevant to
// localNodeID on route. It returns nil when the local node is not one of the
// route's hops. Leases matched by stale-relay feedback are recorded as
// withdrawals on relayPolicy and dropped; the rest are kept only when both
// their peer and relay are hops of the route.
func scopedRendezvousLeases(leases []PeerRendezvousLease, route SyntheticMeshRouteConfig, localNodeID string, relayPolicy *rendezvousRelayPolicy, now time.Time) []PeerRendezvousLease {
	if !containsString(route.Hops, localNodeID) {
		return nil
	}
	candidates := normalizeRendezvousLeases(leases, route, now)
	kept := make([]PeerRendezvousLease, 0, len(candidates))
	for i := range candidates {
		lease := candidates[i]
		feedback, stale := relayPolicy.staleForLease(route.RouteID, lease)
		if stale {
			relayPolicy.recordWithdrawal(route, lease, feedback)
			continue
		}
		onPath := containsString(route.Hops, lease.PeerNodeID) && containsString(route.Hops, lease.RelayNodeID)
		if !onPath {
			continue
		}
		kept = append(kept, lease)
	}
	return kept
}
// derivedRendezvousLeases derives control-plane rendezvous leases for every
// peer on route whose endpoint candidates require rendezvous. It returns nil
// when localNodeID is not one of the route's hops.
//
// For each such peer a relay is chosen with selectRendezvousRelay; peers with
// no usable relay are skipped. When relayPolicy holds stale feedback for the
// peer, the lease reason becomes "stale_relay_replacement" and, if the chosen
// relay differs from the one named in the feedback, the replacement is
// recorded on relayPolicy. Only leases that expire after now are returned.
//
// Peers are visited in sorted key order. candidates is a map, whose iteration
// order is unspecified, so ranging over it directly made the order of the
// returned leases vary between calls.
func derivedRendezvousLeases(route SyntheticMeshRouteConfig, peers map[string]string, candidates map[string][]PeerEndpointCandidate, localNodeID string, relayPolicy *rendezvousRelayPolicy, now time.Time) []PeerRendezvousLease {
	if !containsString(route.Hops, localNodeID) {
		return nil
	}

	peerKeys := make([]string, 0, len(candidates))
	for key := range candidates {
		peerKeys = append(peerKeys, key)
	}
	sort.Strings(peerKeys)

	issuedAt := now.UTC()
	// Non-nil so an empty result stays an empty slice rather than nil.
	out := []PeerRendezvousLease{}
	for _, key := range peerKeys {
		items := candidates[key]
		peerNodeID := strings.TrimSpace(key)
		if peerNodeID == "" || !containsString(route.Hops, peerNodeID) || !peerEndpointCandidatesRequireRendezvous(items) {
			continue
		}
		selection := selectRendezvousRelay(route, peerNodeID, localNodeID, peers, candidates, relayPolicy)
		if selection.RelayNodeID == "" || selection.Endpoint == "" {
			continue
		}

		// One lookup serves both the lease reason and the replacement record;
		// the feedback list is not modified in between.
		feedback, replacement := relayPolicy.hasStalePeer(route.RouteID, peerNodeID)
		reason := rendezvousLeaseReason(items)
		if replacement {
			reason = "stale_relay_replacement"
		}

		lease := PeerRendezvousLease{
			LeaseID:          route.RouteID + "-rv-" + peerNodeID + "-via-" + selection.RelayNodeID,
			PeerNodeID:       peerNodeID,
			RelayNodeID:      selection.RelayNodeID,
			RelayEndpoint:    selection.Endpoint,
			Transport:        "relay_control",
			ConnectivityMode: "relay_required",
			RouteIDs:         []string{route.RouteID},
			AllowedChannels:  controlPlaneAllowedChannels(route.AllowedChannels),
			Priority:         rendezvousLeasePriority(items),
			ControlPlaneOnly: true,
			IssuedAt:         issuedAt,
			ExpiresAt:        route.ExpiresAt.UTC(),
			Reason:           reason,
			Metadata:         rendezvousRelayLeaseMetadata(selection, replacement),
		}
		if len(lease.AllowedChannels) == 0 {
			lease.AllowedChannels = []string{"fabric_control", "route_control"}
		}
		if lease.Priority <= 0 {
			lease.Priority = 100
		}
		if !lease.ExpiresAt.After(issuedAt) {
			continue
		}
		out = append(out, lease)
		if replacement && feedback.RelayNodeID != selection.RelayNodeID {
			relayPolicy.recordReplacement(route, peerNodeID, feedback, selection)
		}
	}
	return out
}
// selectRendezvousRelay chooses the relay that peerNodeID should rendezvous
// through on route. Every hop other than the peer is a candidate unless
// relayPolicy marks it stale for this peer or it has no usable control
// endpoint. Candidates are scored with rendezvousRelayCandidateScore; the
// highest score wins and ties go to the smaller relay node ID. The zero
// selection is returned when no candidate qualifies.
func selectRendezvousRelay(route SyntheticMeshRouteConfig, peerNodeID string, localNodeID string, peers map[string]string, candidates map[string][]PeerEndpointCandidate, relayPolicy *rendezvousRelayPolicy) rendezvousRelaySelection {
	hops := route.Hops
	peerIndex := -1
	for i := range hops {
		if hops[i] == peerNodeID {
			peerIndex = i
			break
		}
	}

	// The peer's neighbours are visited first, then the whole path; repeats
	// are filtered out below.
	order := []string{}
	if peerIndex > 0 {
		order = append(order, hops[peerIndex-1])
	}
	if peerIndex >= 0 && peerIndex < len(hops)-1 {
		order = append(order, hops[peerIndex+1])
	}
	order = append(order, hops...)

	visited := map[string]struct{}{}
	options := []rendezvousRelaySelection{}
	for _, raw := range order {
		relayID := strings.TrimSpace(raw)
		if relayID == "" || relayID == peerNodeID {
			continue
		}
		if _, repeated := visited[relayID]; repeated {
			continue
		}
		visited[relayID] = struct{}{}

		if _, stale := relayPolicy.relayStale(route.RouteID, peerNodeID, relayID); stale {
			continue
		}
		endpoint, endpointScore, endpointReasons := relayControlEndpointForNode(relayID, peers, candidates)
		if endpoint == "" {
			continue
		}
		score, reasons := rendezvousRelayCandidateScore(route.RouteID, hops, peerIndex, relayID, localNodeID, endpointScore, endpointReasons, relayPolicy)
		options = append(options, rendezvousRelaySelection{
			RelayNodeID: relayID,
			Endpoint:    endpoint,
			Score:       score,
			Reasons:     reasons,
		})
	}
	if len(options) == 0 {
		return rendezvousRelaySelection{}
	}

	sort.SliceStable(options, func(i, j int) bool {
		if options[i].Score == options[j].Score {
			return options[i].RelayNodeID < options[j].RelayNodeID
		}
		return options[i].Score > options[j].Score
	})
	return options[0]
}
// relayControlEndpointForNode resolves a usable HTTP control endpoint for
// nodeID and returns it with a score and the reasons behind that score.
//
// An endpoint reported in peers wins outright with score 80. Otherwise the
// node's endpoint candidates are tried in ascending (Priority, EndpointID)
// order, skipping those that require rendezvous, and the first usable address
// is returned. Its score starts at 70 and is raised for a low positive
// priority, a "fast-path" tag, a same-site style tag, and direct
// connectivity. ("", 0, nil) is returned when nothing is usable.
func relayControlEndpointForNode(nodeID string, peers map[string]string, candidates map[string][]PeerEndpointCandidate) (string, int, []string) {
	reported := strings.TrimRight(strings.TrimSpace(peers[nodeID]), "/")
	if isUsableHTTPControlEndpoint(reported) {
		return reported, 80, []string{"reported_peer_endpoint"}
	}

	// Sort a copy so the caller's candidate slice keeps its order.
	ranked := make([]PeerEndpointCandidate, len(candidates[nodeID]))
	copy(ranked, candidates[nodeID])
	sort.SliceStable(ranked, func(i, j int) bool {
		if ranked[i].Priority == ranked[j].Priority {
			return ranked[i].EndpointID < ranked[j].EndpointID
		}
		return ranked[i].Priority < ranked[j].Priority
	})

	for i := range ranked {
		candidate := ranked[i]
		if endpointCandidateRequiresRendezvous(candidate) {
			continue
		}
		address := strings.TrimRight(strings.TrimSpace(candidate.Address), "/")
		if !isUsableHTTPControlEndpoint(address) {
			continue
		}

		total := 70
		why := []string{"endpoint_candidate"}
		if candidate.Priority > 0 {
			total += maxInt(0, 50-candidate.Priority)
		}
		if hasPolicyTag(candidate.PolicyTags, "fast-path") {
			total += 25
			why = append(why, "fast_path")
		}
		sameSite := hasPolicyTag(candidate.PolicyTags, "same-site") ||
			hasPolicyTag(candidate.PolicyTags, "corp-lan") ||
			hasPolicyTag(candidate.PolicyTags, "private-lan")
		if sameSite {
			total += 20
			why = append(why, "same_site")
		}
		if strings.EqualFold(candidate.ConnectivityMode, "direct") {
			total += 10
			why = append(why, "direct")
		}
		return address, total, why
	}
	return "", 0, nil
}
// rendezvousRelayCandidateScore scores relayNodeID as a rendezvous relay for
// the peer at peerIndex on routePath and returns the score with its reasons.
//
// The score starts at 500 plus endpointScore, gains a proximity bonus based
// on the hop distance between relay and peer, loses 120 when the relay is the
// first hop of a path longer than two nodes, gains 40 when the relay is the
// local node, and is then adjusted by the mesh-link and route-health scores
// derived from relayPolicy.
func rendezvousRelayCandidateScore(routeID string, routePath []string, peerIndex int, relayNodeID string, localNodeID string, endpointScore int, endpointReasons []string, relayPolicy *rendezvousRelayPolicy) (int, []string) {
	total := 500 + endpointScore
	why := append([]string{}, endpointReasons...)

	relayIndex := -1
	for i := range routePath {
		if routePath[i] == relayNodeID {
			relayIndex = i
			break
		}
	}

	if peerIndex >= 0 && relayIndex >= 0 {
		gap := absInt(peerIndex - relayIndex)
		if gap == 1 {
			total += 180
			why = append(why, "adjacent_to_peer")
		} else if gap == 2 {
			total += 120
			why = append(why, "near_peer")
		} else {
			total += maxInt(0, 80-gap*10)
			why = append(why, "route_path_candidate")
		}
	}

	if relayIndex == 0 && len(routePath) > 2 {
		total -= 120
		why = append(why, "entry_relay_fallback")
	}
	if relayNodeID == localNodeID {
		total += 40
		why = append(why, "local_entry_relay")
	}

	linkDelta, linkWhy := rendezvousRelayLinkScore(relayNodeID, relayPolicy)
	total += linkDelta
	why = append(why, linkWhy...)

	healthDelta, healthWhy := rendezvousRelayRouteHealthScore(routeID, relayNodeID, relayPolicy)
	total += healthDelta
	why = append(why, healthWhy...)

	return total, why
}
// rendezvousRelayLinkScore converts the most recent mesh link observation
// from the policy's local node to relayNodeID into a score adjustment.
//
// Observations older than rendezvousRelayFeedbackMaxAge are ignored. A
// reachable link scores 60 plus its quality score (when present) plus a
// latency bonus of max(0, 80 - latency); an unreachable link scores -250; any
// other status, a missing observation, or a policy without a local node
// scores 0 with no reasons.
func rendezvousRelayLinkScore(relayNodeID string, relayPolicy *rendezvousRelayPolicy) (int, []string) {
	if relayPolicy == nil || relayPolicy.localNodeID == "" {
		return 0, nil
	}

	var newest *MeshLinkObservation
	for i := range relayPolicy.links {
		candidate := &relayPolicy.links[i]
		if candidate.SourceNodeID != relayPolicy.localNodeID {
			continue
		}
		if candidate.TargetNodeID != relayNodeID {
			continue
		}
		// NOTE(review): observations with a zero ObservedAt skip the age check
		// and are treated as usable — confirm this is intended.
		if !candidate.ObservedAt.IsZero() {
			age := relayPolicy.now.Sub(candidate.ObservedAt.UTC())
			if age > rendezvousRelayFeedbackMaxAge {
				continue
			}
		}
		if newest != nil && !candidate.ObservedAt.After(newest.ObservedAt) {
			continue
		}
		newest = candidate
	}
	if newest == nil {
		return 0, nil
	}

	if newest.LinkStatus == "unreachable" {
		return -250, []string{"mesh_link_unreachable"}
	}
	if newest.LinkStatus != "reachable" {
		return 0, nil
	}

	total := 60
	why := []string{"mesh_link_reachable"}
	if quality := newest.QualityScore; quality != nil {
		total += *quality
		why = append(why, "mesh_link_quality")
	}
	if latency := newest.LatencyMs; latency != nil {
		total += maxInt(0, 80-*latency)
		why = append(why, "mesh_link_latency")
	}
	return total, why
}
// rendezvousRelayRouteHealthScore scores a relay for a route from the newest
// fresh synthetic route-health observation reported by the local node.
//
// Only observations that originate from relayPolicy.localNodeID, pass
// meshLinkObservationFresh, carry route-health metadata of type
// "synthetic_route_health" for the trimmed routeID with the trimmed
// relayNodeID as the selected relay, and have none of the
// production/service payload forwarding flags set are considered.
//
// Result, by precedence: no usable observation -> (0, nil); drift detected ->
// -360; link "unreachable" or a non-blank failure reason -> -320; status other
// than "reachable" -> (0, nil); otherwise 90 plus the reported quality score
// and a latency bonus of max(0, 100-latency) when those values are present.
func rendezvousRelayRouteHealthScore(routeID string, relayNodeID string, relayPolicy *rendezvousRelayPolicy) (int, []string) {
	if relayPolicy == nil || relayPolicy.localNodeID == "" {
		return 0, nil
	}
	routeID = strings.TrimSpace(routeID)
	relayNodeID = strings.TrimSpace(relayNodeID)
	if routeID == "" || relayNodeID == "" {
		return 0, nil
	}

	var (
		newest     *MeshLinkObservation
		newestMeta meshRouteHealthObservationMetadata
	)
	for idx := range relayPolicy.links {
		observation := &relayPolicy.links[idx]
		if observation.SourceNodeID != relayPolicy.localNodeID {
			continue
		}
		if !meshLinkObservationFresh(*observation, relayPolicy.now) {
			continue
		}
		meta, ok := routeHealthMetadataFromLink(*observation)
		if !ok || meta.ObservationType != "synthetic_route_health" {
			continue
		}
		if strings.TrimSpace(meta.RouteID) != routeID ||
			strings.TrimSpace(meta.RoutePathDecisionSelectedRelayID) != relayNodeID {
			continue
		}
		// Observations flagged with any payload-forwarding marker are excluded.
		forwardsPayload := meta.ProductionForwarding ||
			meta.ProductionPayloadForwarding ||
			meta.RouteHealthProductionPayloadForwarding ||
			meta.RouteHealthServicePayloadForwarding
		if forwardsPayload {
			continue
		}
		if newest == nil || observation.ObservedAt.After(newest.ObservedAt) {
			newest, newestMeta = observation, meta
		}
	}

	switch {
	case newest == nil:
		return 0, nil
	case newestMeta.RoutePathDriftDetected:
		return -360, []string{"route_health_drift"}
	case newest.LinkStatus == "unreachable" || strings.TrimSpace(newestMeta.FailureReason) != "":
		return -320, []string{"route_health_unreachable"}
	case newest.LinkStatus != "reachable":
		return 0, nil
	}

	total := 90
	why := []string{"route_health_reachable", "route_health_no_drift"}
	if quality := newest.QualityScore; quality != nil {
		total += *quality
		why = append(why, "route_health_quality")
	}
	if latency := newest.LatencyMs; latency != nil {
		total += maxInt(0, 100-*latency)
		why = append(why, "route_health_latency")
	}
	return total, why
}
// rendezvousRelayLeaseMetadata builds the JSON metadata attached to a
// rendezvous relay lease. It records the fixed contract markers together with
// the selection's score and score reasons, and adds
// "replacement_for_stale_relay": true when replacement is set. If marshalling
// fails, a static document without the score fields (and without the
// replacement marker) is returned instead.
func rendezvousRelayLeaseMetadata(selection rendezvousRelaySelection, replacement bool) json.RawMessage {
	const fallback = `{"source":"control-plane","derived_from":"endpoint_candidate","lease_refresh_contract":"node_scoped_synthetic_config_get","relay_replacement_contract":"stale_relay_feedback_policy","production_payload_forwarding":false}`

	fields := map[string]any{
		"source":                        "control-plane",
		"derived_from":                  "endpoint_candidate",
		"lease_refresh_contract":        "node_scoped_synthetic_config_get",
		"relay_replacement_contract":    "stale_relay_feedback_policy",
		"relay_selection_score":         selection.Score,
		"relay_selection_score_reasons": selection.Reasons,
		"production_payload_forwarding": false,
	}
	if replacement {
		fields["replacement_for_stale_relay"] = true
	}

	if encoded, err := json.Marshal(fields); err == nil {
		return encoded
	}
	return json.RawMessage(fallback)
}
// hasPolicyTag reports whether tags contains want, comparing both sides after
// trimming surrounding whitespace and lower-casing.
func hasPolicyTag(tags []string, want string) bool {
	normalize := func(raw string) string {
		return strings.ToLower(strings.TrimSpace(raw))
	}
	target := normalize(want)
	for i := range tags {
		if normalize(tags[i]) == target {
			return true
		}
	}
	return false
}
// maxInt returns the larger of a and b.
func maxInt(a int, b int) int {
	larger := b
	if a > b {
		larger = a
	}
	return larger
}
// absInt returns the absolute value of value.
func absInt(value int) int {
	if value >= 0 {
		return value
	}
	return -value
}
// peerEndpointCandidatesRequireRendezvous reports whether at least one of the
// candidates satisfies endpointCandidateRequiresRendezvous.
func peerEndpointCandidatesRequireRendezvous(candidates []PeerEndpointCandidate) bool {
	for i := range candidates {
		if endpointCandidateRequiresRendezvous(candidates[i]) {
			return true
		}
	}
	return false
}
// endpointCandidateRequiresRendezvous reports whether a candidate can only be
// reached through a rendezvous: its transport name contains "relay" or
// "outbound", its reachability is "relay" or "outbound_only", or its
// connectivity mode is "relay_required" or "outbound_only". All three fields
// are compared after trimming whitespace and lower-casing.
func endpointCandidateRequiresRendezvous(candidate PeerEndpointCandidate) bool {
	normalize := func(raw string) string {
		return strings.ToLower(strings.TrimSpace(raw))
	}

	transport := normalize(candidate.Transport)
	if strings.Contains(transport, "relay") || strings.Contains(transport, "outbound") {
		return true
	}
	switch normalize(candidate.Reachability) {
	case "relay", "outbound_only":
		return true
	}
	switch normalize(candidate.ConnectivityMode) {
	case "relay_required", "outbound_only":
		return true
	}
	return false
}
// rendezvousLeasePriority derives a lease priority from the candidates that
// require a rendezvous. While no non-zero priority has been recorded, the
// next qualifying candidate's priority is taken as is; afterwards only a
// strictly smaller positive priority replaces it. It returns 0 when no
// qualifying candidate supplied a non-zero priority.
func rendezvousLeasePriority(candidates []PeerEndpointCandidate) int {
	best := 0
	for i := range candidates {
		if !endpointCandidateRequiresRendezvous(candidates[i]) {
			continue
		}
		offered := candidates[i].Priority
		switch {
		case best == 0:
			best = offered
		case offered > 0 && offered < best:
			best = offered
		}
	}
	return best
}
// rendezvousLeaseReason picks the lease reason from the first candidate that
// declares one: "auto_outbound_only" when its connectivity mode or
// reachability is "outbound_only", "auto_relay_required" when its
// connectivity mode is "relay_required" or its reachability is "relay".
// Within one candidate the outbound-only check wins. When no candidate
// matches, "auto_rendezvous_required" is returned.
func rendezvousLeaseReason(candidates []PeerEndpointCandidate) string {
	for i := range candidates {
		mode := strings.ToLower(strings.TrimSpace(candidates[i].ConnectivityMode))
		reach := strings.ToLower(strings.TrimSpace(candidates[i].Reachability))
		switch {
		case mode == "outbound_only", reach == "outbound_only":
			return "auto_outbound_only"
		case mode == "relay_required", reach == "relay":
			return "auto_relay_required"
		}
	}
	return "auto_rendezvous_required"
}
// mergeRendezvousLeases folds leases into out. Each lease is keyed by its
// trimmed LeaseID, or by peer node, relay node and relay endpoint joined with
// NUL bytes when the ID is blank. Nil metadata is replaced with an empty JSON
// object. An incoming lease overwrites the stored one when the key is new,
// when its priority value is lower, or when it expires later.
func mergeRendezvousLeases(out map[string]PeerRendezvousLease, leases []PeerRendezvousLease) {
	for i := range leases {
		incoming := leases[i]
		if incoming.Metadata == nil {
			incoming.Metadata = json.RawMessage(`{}`)
		}

		key := strings.TrimSpace(incoming.LeaseID)
		if key == "" {
			key = strings.Join([]string{incoming.PeerNodeID, incoming.RelayNodeID, incoming.RelayEndpoint}, "\x00")
		}

		current, found := out[key]
		if found && incoming.Priority >= current.Priority && !current.ExpiresAt.Before(incoming.ExpiresAt) {
			// The stored lease is at least as good on both criteria.
			continue
		}
		out[key] = incoming
	}
}
// sortedRendezvousLeases returns the leases in items ordered by ascending
// priority, then peer node ID, relay node ID and lease ID, truncated to at
// most limit entries.
func sortedRendezvousLeases(items map[string]PeerRendezvousLease, limit int) []PeerRendezvousLease {
	leases := make([]PeerRendezvousLease, 0, len(items))
	for key := range items {
		leases = append(leases, items[key])
	}

	sort.SliceStable(leases, func(a, b int) bool {
		left, right := &leases[a], &leases[b]
		switch {
		case left.Priority != right.Priority:
			return left.Priority < right.Priority
		case left.PeerNodeID != right.PeerNodeID:
			return left.PeerNodeID < right.PeerNodeID
		case left.RelayNodeID != right.RelayNodeID:
			return left.RelayNodeID < right.RelayNodeID
		default:
			return left.LeaseID < right.LeaseID
		}
	})

	if len(leases) <= limit {
		return leases
	}
	return leases[:limit]
}
// markPeerDirectoryRendezvousLeases reflects rendezvous leases in the peer
// directory. For every lease, a non-empty peer node other than the local node
// gets its CandidateCount incremented and the "relay_required" connectivity
// mode recorded; a non-empty relay node other than the local node gets its
// EndpointCount incremented and the "relay_control" mode recorded. Directory
// entries are created on demand.
func markPeerDirectoryRendezvousLeases(directory map[string]*PeerDirectoryEntry, leases []PeerRendezvousLease, localNodeID string) {
	for i := range leases {
		peerID, relayID := leases[i].PeerNodeID, leases[i].RelayNodeID

		if peerID != "" && peerID != localNodeID {
			peer := peerDirectoryEntry(directory, peerID)
			peer.CandidateCount++
			if !containsString(peer.ConnectivityModes, "relay_required") {
				peer.ConnectivityModes = append(peer.ConnectivityModes, "relay_required")
			}
		}

		if relayID != "" && relayID != localNodeID {
			relay := peerDirectoryEntry(directory, relayID)
			relay.EndpointCount++
			if !containsString(relay.ConnectivityModes, "relay_control") {
				relay.ConnectivityModes = append(relay.ConnectivityModes, "relay_control")
			}
		}
	}
}
// mergePeerDirectoryRoute records route.RouteID on the directory entry of
// every hop of the route, skipping blank hops and the local node. Hop IDs are
// trimmed before lookup; entries are created on demand and a route ID is
// added at most once per entry.
func mergePeerDirectoryRoute(directory map[string]*PeerDirectoryEntry, route SyntheticMeshRouteConfig, localNodeID string) {
	for _, hop := range route.Hops {
		hopID := strings.TrimSpace(hop)
		if hopID == "" || hopID == localNodeID {
			continue
		}
		if target := peerDirectoryEntry(directory, hopID); !containsString(target.RouteIDs, route.RouteID) {
			target.RouteIDs = append(target.RouteIDs, route.RouteID)
		}
	}
}
// mergePeerDirectoryCandidates adds len(candidates) to the CandidateCount of
// nodeID's directory entry (creating it if needed) and records every
// non-blank connectivity mode that the entry does not already list.
func mergePeerDirectoryCandidates(directory map[string]*PeerDirectoryEntry, nodeID string, candidates []PeerEndpointCandidate) {
	target := peerDirectoryEntry(directory, nodeID)
	target.CandidateCount += len(candidates)
	for i := range candidates {
		mode := candidates[i].ConnectivityMode
		if strings.TrimSpace(mode) == "" || containsString(target.ConnectivityModes, mode) {
			continue
		}
		target.ConnectivityModes = append(target.ConnectivityModes, mode)
	}
}
// peerDirectoryEntry returns the directory entry stored under nodeID,
// inserting a fresh entry carrying that NodeID when the key is absent.
func peerDirectoryEntry(directory map[string]*PeerDirectoryEntry, nodeID string) *PeerDirectoryEntry {
	existing, found := directory[nodeID]
	if !found {
		existing = &PeerDirectoryEntry{NodeID: nodeID}
		directory[nodeID] = existing
	}
	return existing
}
// mergeRecoverySeeds folds seeds into out, keyed by node ID and endpoint
// joined with a NUL byte. Nil metadata is replaced with an empty JSON object.
// A seed overwrites the stored one only when the key is new or its priority
// value is strictly lower.
func mergeRecoverySeeds(out map[string]PeerRecoverySeed, seeds []PeerRecoverySeed) {
	for i := range seeds {
		incoming := seeds[i]
		if incoming.Metadata == nil {
			incoming.Metadata = json.RawMessage(`{}`)
		}
		key := incoming.NodeID + "\x00" + incoming.Endpoint
		if current, found := out[key]; found && incoming.Priority >= current.Priority {
			continue
		}
		out[key] = incoming
	}
}
// sortedRecoverySeeds returns the seeds in items ordered by ascending
// priority, then node ID and endpoint, truncated to at most limit entries.
func sortedRecoverySeeds(items map[string]PeerRecoverySeed, limit int) []PeerRecoverySeed {
	seeds := make([]PeerRecoverySeed, 0, len(items))
	for key := range items {
		seeds = append(seeds, items[key])
	}

	sort.SliceStable(seeds, func(a, b int) bool {
		left, right := &seeds[a], &seeds[b]
		switch {
		case left.Priority != right.Priority:
			return left.Priority < right.Priority
		case left.NodeID != right.NodeID:
			return left.NodeID < right.NodeID
		default:
			return left.Endpoint < right.Endpoint
		}
	})

	if len(seeds) <= limit {
		return seeds
	}
	return seeds[:limit]
}
// markPeerDirectoryRecoverySeeds flags the directory entry of every seed's
// node as a recovery seed (creating the entry if needed) and records the
// seed's connectivity mode when it is non-blank and not yet listed.
func markPeerDirectoryRecoverySeeds(directory map[string]*PeerDirectoryEntry, seeds []PeerRecoverySeed) {
	for i := range seeds {
		target := peerDirectoryEntry(directory, seeds[i].NodeID)
		target.RecoverySeed = true

		mode := seeds[i].ConnectivityMode
		if strings.TrimSpace(mode) == "" || containsString(target.ConnectivityModes, mode) {
			continue
		}
		target.ConnectivityModes = append(target.ConnectivityModes, mode)
	}
}
// sortedPeerDirectory flattens the directory into a slice ordered by node ID.
// The RouteIDs and ConnectivityModes of every entry are sorted in place
// (mutating the entries held by items); entries with an empty NodeID are left
// out of the result.
func sortedPeerDirectory(items map[string]*PeerDirectoryEntry) []PeerDirectoryEntry {
	entries := make([]PeerDirectoryEntry, 0, len(items))
	for _, current := range items {
		sort.Strings(current.RouteIDs)
		sort.Strings(current.ConnectivityModes)
		if current.NodeID == "" {
			continue
		}
		entries = append(entries, *current)
	}
	sort.SliceStable(entries, func(a, b int) bool {
		return entries[a].NodeID < entries[b].NodeID
	})
	return entries
}
// validatePeerEndpointCandidates checks endpoint candidates grouped by node
// ID against the route path. It returns ErrInvalidPayload when a group key is
// blank or not part of routePath, or when a candidate has a blank endpoint
// ID, node ID or address, a node ID different from its group key, an
// unrecognized transport, reachability, connectivity mode or (non-empty) NAT
// type, or non-empty metadata that is not valid JSON. An empty map is valid.
func validatePeerEndpointCandidates(candidates map[string][]PeerEndpointCandidate, routePath []string) error {
	for ownerID, group := range candidates {
		if strings.TrimSpace(ownerID) == "" || !containsString(routePath, ownerID) {
			return ErrInvalidPayload
		}
		for i := range group {
			item := &group[i]
			switch {
			case strings.TrimSpace(item.EndpointID) == "",
				strings.TrimSpace(item.NodeID) == "",
				item.NodeID != ownerID,
				strings.TrimSpace(item.Address) == "",
				!isPeerEndpointTransport(item.Transport),
				!isPeerEndpointReachability(item.Reachability),
				!isPeerEndpointConnectivityMode(item.ConnectivityMode),
				item.NATType != "" && !isPeerEndpointNATType(item.NATType):
				return ErrInvalidPayload
			case len(item.Metadata) > 0 && !json.Valid(item.Metadata):
				return ErrInvalidPayload
			}
		}
	}
	return nil
}
// scopedPeerEndpoints returns the subset of peers whose node ID appears in
// routePath and whose trimmed endpoint is non-empty and not rejected by
// isUnusableLocalPeerEndpoint. Endpoints are stored trimmed. The result is
// always a non-nil map.
func scopedPeerEndpoints(peers map[string]string, routePath []string) map[string]string {
	scoped := map[string]string{}
	for nodeID, raw := range peers {
		address := strings.TrimSpace(raw)
		if address == "" || !containsString(routePath, nodeID) {
			continue
		}
		if isUnusableLocalPeerEndpoint(address) {
			continue
		}
		scoped[nodeID] = address
	}
	return scoped
}
// scopedPeerEndpointCandidates returns the candidates of nodes that appear in
// routePath, dropping candidates whose address is rejected by
// isUnusableLocalPeerEndpoint and replacing nil metadata with an empty JSON
// object on the returned copies. Nodes left without any candidate get no key
// in the result, which is always a non-nil map.
func scopedPeerEndpointCandidates(candidates map[string][]PeerEndpointCandidate, routePath []string) map[string][]PeerEndpointCandidate {
	scoped := map[string][]PeerEndpointCandidate{}
	for nodeID, group := range candidates {
		if !containsString(routePath, nodeID) {
			continue
		}
		for i := range group {
			kept := group[i]
			if isUnusableLocalPeerEndpoint(kept.Address) {
				continue
			}
			if kept.Metadata == nil {
				kept.Metadata = json.RawMessage(`{}`)
			}
			scoped[nodeID] = append(scoped[nodeID], kept)
		}
	}
	return scoped
}
// isPeerEndpointTransport reports whether value is an accepted peer endpoint
// transport name. The comparison is exact (case- and whitespace-sensitive).
func isPeerEndpointTransport(value string) bool {
	for _, accepted := range [...]string{"direct_http", "direct_tcp_tls", "wss", "relay", "outbound_reverse"} {
		if value == accepted {
			return true
		}
	}
	return false
}
// isPeerRendezvousTransport reports whether value is an accepted rendezvous
// transport name. The comparison is exact (case- and whitespace-sensitive).
func isPeerRendezvousTransport(value string) bool {
	for _, accepted := range [...]string{"relay_control", "relay", "wss", "direct_tcp_tls"} {
		if value == accepted {
			return true
		}
	}
	return false
}
// isPeerEndpointReachability reports whether value is an accepted endpoint
// reachability name. The comparison is exact (case- and whitespace-sensitive).
func isPeerEndpointReachability(value string) bool {
	for _, accepted := range [...]string{"public", "private", "relay", "outbound_only", "unknown"} {
		if value == accepted {
			return true
		}
	}
	return false
}
// isPeerEndpointConnectivityMode reports whether value is an accepted
// endpoint connectivity mode. The comparison is exact (case- and
// whitespace-sensitive).
func isPeerEndpointConnectivityMode(value string) bool {
	for _, accepted := range [...]string{"direct", "private_lan", "relay_required", "outbound_only", "unknown"} {
		if value == accepted {
			return true
		}
	}
	return false
}
// isPeerEndpointNATType reports whether value is an accepted NAT type name.
// The comparison is exact (case- and whitespace-sensitive); the empty string
// is not accepted here.
func isPeerEndpointNATType(value string) bool {
	for _, accepted := range [...]string{"unknown", "none", "full_cone", "restricted", "port_restricted", "symmetric", "blocked"} {
		if value == accepted {
			return true
		}
	}
	return false
}
// controlPlaneAllowedChannels filters channels down to the control-plane
// channel names "fabric_control" and "route_control". Names are trimmed,
// duplicates are dropped and first-seen order is kept. The result is always a
// non-nil slice.
func controlPlaneAllowedChannels(channels []string) []string {
	allowed := []string{}
	for i := range channels {
		name := strings.TrimSpace(channels[i])
		if name != "fabric_control" && name != "route_control" {
			continue
		}
		if containsString(allowed, name) {
			continue
		}
		allowed = append(allowed, name)
	}
	return allowed
}
// isHTTPControlEndpoint reports whether endpoint, after trimming whitespace
// and lower-casing, starts with "http://" or "https://".
func isHTTPControlEndpoint(endpoint string) bool {
	normalized := strings.ToLower(strings.TrimSpace(endpoint))
	for _, scheme := range [...]string{"http://", "https://"} {
		if strings.HasPrefix(normalized, scheme) {
			return true
		}
	}
	return false
}
// isUsableHTTPControlEndpoint reports whether endpoint is an HTTP(S) control
// endpoint that isUnusableLocalPeerEndpoint does not reject.
func isUsableHTTPControlEndpoint(endpoint string) bool {
	if !isHTTPControlEndpoint(endpoint) {
		return false
	}
	return !isUnusableLocalPeerEndpoint(endpoint)
}
// isUnusableLocalPeerEndpoint reports whether the host extracted from
// endpoint by peerEndpointHost is "localhost" (case-insensitive) or an IP
// literal that is a loopback or unspecified address. An endpoint with no
// extractable host is not considered unusable.
func isUnusableLocalPeerEndpoint(endpoint string) bool {
	host := peerEndpointHost(endpoint)
	switch {
	case host == "":
		return false
	case strings.EqualFold(host, "localhost"):
		return true
	}
	addr := net.ParseIP(host)
	if addr == nil {
		return false
	}
	return addr.IsLoopback() || addr.IsUnspecified()
}
// peerEndpointHost extracts the bare host (no scheme, port, path or IPv6
// brackets) from a peer endpoint given either as "host:port", as a URL, or as
// a plain host. Surrounding whitespace and trailing slashes are ignored; an
// empty endpoint yields "".
func peerEndpointHost(endpoint string) string {
	endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/")
	if endpoint == "" {
		return ""
	}

	// hostFromURL parses endpoint as a URL and returns its host without port
	// or brackets; ok is false when parsing fails or no host is present.
	hostFromURL := func() (string, bool) {
		parsed, err := url.Parse(endpoint)
		if err != nil || parsed.Host == "" {
			return "", false
		}
		if host, _, err := net.SplitHostPort(parsed.Host); err == nil {
			return strings.Trim(host, "[]"), true
		}
		return strings.Trim(parsed.Host, "[]"), true
	}

	// A scheme-qualified endpoint without a port (e.g. "http://localhost")
	// contains exactly one colon, so net.SplitHostPort would accept it and
	// report the scheme ("http") as the host. Parse such endpoints as URLs
	// first so the real host is returned.
	if strings.Contains(endpoint, "://") {
		if host, ok := hostFromURL(); ok {
			return host
		}
	}
	if host, _, err := net.SplitHostPort(endpoint); err == nil {
		return strings.Trim(host, "[]")
	}
	if host, ok := hostFromURL(); ok {
		return host
	}
	return strings.Trim(endpoint, "[]")
}
// firstNodeID returns the first non-blank node ID named by selector, trimmed:
// selector.NodeID when set, otherwise the first non-blank element of
// selector.NodeIDs. It returns "" when the selector names no node.
func firstNodeID(selector nodeSelector) string {
	if primary := strings.TrimSpace(selector.NodeID); primary != "" {
		return primary
	}
	for _, raw := range selector.NodeIDs {
		if trimmed := strings.TrimSpace(raw); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
// cleanRouteNodePath returns values with every element trimmed and blank
// elements removed, preserving order. The result is always a non-nil slice.
func cleanRouteNodePath(values []string) []string {
	cleaned := make([]string, 0, len(values))
	for i := range values {
		if trimmed := strings.TrimSpace(values[i]); trimmed != "" {
			cleaned = append(cleaned, trimmed)
		}
	}
	return cleaned
}
// containsString reports whether values contains needle, comparing both
// sides after trimming surrounding whitespace. A blank needle never matches.
func containsString(values []string, needle string) bool {
	if target := strings.TrimSpace(needle); target != "" {
		for i := range values {
			if strings.TrimSpace(values[i]) == target {
				return true
			}
		}
	}
	return false
}
// appendMissingString appends value to values unless containsString already
// finds it there, and returns the resulting slice.
func appendMissingString(values []string, value string) []string {
	if !containsString(values, value) {
		values = append(values, value)
	}
	return values
}
// generateFencingToken returns a new fencing token: the prefix
// "rap_vpn_fence_" followed by 32 bytes from crypto/rand, hex-encoded. The
// error from the random source is returned unchanged with an empty token.
func generateFencingToken() (string, error) {
	var entropy [32]byte
	_, err := rand.Read(entropy[:])
	if err != nil {
		return "", err
	}
	token := "rap_vpn_fence_" + hex.EncodeToString(entropy[:])
	return token, nil
}