12527 lines
482 KiB
Go
12527 lines
482 KiB
Go
package cluster
|
|
|
|
import (
|
|
"context"
|
|
"crypto/rand"
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"net"
|
|
"net/url"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/jackc/pgx/v5"
|
|
|
|
"github.com/example/remote-access-platform/backend/internal/platform/clusterauth"
|
|
)
|
|
|
|
// Sentinel errors exposed by the cluster package. Compare against them with
// errors.Is; the message text is what callers and logs will see.
var (
	// ErrAccessDenied reports that the platform admin role is required.
	ErrAccessDenied = errors.New("platform admin role is required")
	// ErrInvalidPayload reports a malformed or incomplete cluster payload.
	ErrInvalidPayload = errors.New("invalid cluster payload")
	// ErrInvalidJoinToken reports an invalid or expired join token.
	ErrInvalidJoinToken = errors.New("invalid or expired join token")
	// ErrInvalidNodeRole reports an invalid node role.
	ErrInvalidNodeRole = errors.New("invalid node role")
	// ErrInvalidCluster reports that the requested cluster does not exist
	// (or that no cluster ID was supplied).
	ErrInvalidCluster = errors.New("cluster not found")
	// ErrInvalidJoinRequest reports that the join request does not exist.
	ErrInvalidJoinRequest = errors.New("join request not found")
	// ErrClusterReadOnly reports that the cluster is not authoritative for
	// policy mutation.
	ErrClusterReadOnly = errors.New("cluster is not authoritative for policy mutation")
	// ErrInvalidVPNConnection reports that the VPN connection does not exist.
	ErrInvalidVPNConnection = errors.New("vpn connection not found")
	// ErrInvalidVPNLease reports that the VPN connection lease does not exist.
	ErrInvalidVPNLease = errors.New("vpn connection lease not found")
	// ErrVPNLeaseAlreadyActive reports that the VPN connection already has an
	// active lease.
	ErrVPNLeaseAlreadyActive = errors.New("vpn connection already has an active lease")
	// ErrVPNLeaseOwnerNotAllowed reports that the VPN lease owner is not
	// allowed.
	ErrVPNLeaseOwnerNotAllowed = errors.New("vpn lease owner is not allowed")
	// ErrVPNLeaseOwnerRoleRequired reports that the lease owner lacks an
	// active vpn-exit or vpn-connector role.
	ErrVPNLeaseOwnerRoleRequired = errors.New("vpn lease owner requires active vpn-exit or vpn-connector role")
)
|
|
|
|
type Service struct {
|
|
store Repository
|
|
now func() time.Time
|
|
fabricServiceChannelLeaseMu sync.Mutex
|
|
fabricServiceChannelLeaseCache map[string]FabricServiceChannelLease
|
|
}
|
|
|
|
// Time limits applied to fabric service channel handling.
const (
	// fabricServiceChannelFeedbackMaxAge is two minutes; presumably the
	// maximum age of channel feedback that is still considered — confirm
	// against its users.
	fabricServiceChannelFeedbackMaxAge = 2 * time.Minute
	// fabricServiceChannelOperatorExpireCooldown is two minutes; presumably
	// the cooldown applied after an operator expires a channel — confirm
	// against its users.
	fabricServiceChannelOperatorExpireCooldown = 2 * time.Minute
)
|
|
|
|
func NewService(store Repository) *Service {
|
|
return &Service{store: store, now: func() time.Time { return time.Now().UTC() }, fabricServiceChannelLeaseCache: map[string]FabricServiceChannelLease{}}
|
|
}
|
|
|
|
// Schema identifiers for the cluster authority payload types declared in this
// file.
const (
	// clusterJoinTokenAuthoritySchema identifies join token payloads.
	clusterJoinTokenAuthoritySchema = "rap.cluster.join_token.v1"
	// clusterNodeApprovalAuthoritySchema identifies node approval payloads.
	clusterNodeApprovalAuthoritySchema = "rap.cluster.node_approval.v1"
	// clusterMeshConfigAuthoritySchema identifies mesh config snapshot
	// payloads.
	clusterMeshConfigAuthoritySchema = "rap.cluster.mesh_config_snapshot.v1"
)
|
|
|
|
// clusterJoinTokenAuthorityPayload is the JSON document describing a cluster
// join token. NOTE(review): presumably this is the payload signed by the
// cluster authority under clusterJoinTokenAuthoritySchema — confirm at the
// signing site.
type clusterJoinTokenAuthorityPayload struct {
	// SchemaVersion names the payload schema.
	SchemaVersion string `json:"schema_version"`
	// ClusterID is the cluster the token belongs to.
	ClusterID string `json:"cluster_id"`
	// TokenID identifies the join token.
	TokenID string `json:"token_id"`
	// Scope is carried as raw JSON and is not interpreted by this type.
	Scope json.RawMessage `json:"scope"`
	// ExpiresAt is the token expiry time.
	ExpiresAt time.Time `json:"expires_at"`
	// MaxUses is the use limit recorded for the token.
	MaxUses int `json:"max_uses"`
	// CreatedByUserID is omitted from the JSON when nil.
	CreatedByUserID *string `json:"created_by_user_id,omitempty"`
	// IssuedAt is the time the payload was issued.
	IssuedAt time.Time `json:"issued_at"`
	// ControlPlaneOnly and ProductionForwarding are emitted verbatim.
	ControlPlaneOnly     bool `json:"control_plane_only"`
	ProductionForwarding bool `json:"production_forwarding"`
}
|
|
|
|
// clusterNodeApprovalAuthorityPayload is the JSON document describing the
// approval of a node join request. NOTE(review): presumably signed by the
// cluster authority under clusterNodeApprovalAuthoritySchema — confirm at the
// signing site.
type clusterNodeApprovalAuthorityPayload struct {
	// SchemaVersion names the payload schema.
	SchemaVersion string `json:"schema_version"`
	// ClusterID is the cluster the node is approved for.
	ClusterID string `json:"cluster_id"`
	// JoinRequestID identifies the join request being approved.
	JoinRequestID string `json:"join_request_id"`
	// NodeID and NodeFingerprint identify the approved node.
	NodeID          string `json:"node_id"`
	NodeFingerprint string `json:"node_fingerprint"`
	// IdentityStatus is the node identity status recorded in the payload.
	IdentityStatus string `json:"identity_status"`
	// HeartbeatEndpoint is the heartbeat endpoint recorded in the payload.
	HeartbeatEndpoint string `json:"heartbeat_endpoint"`
	// ApprovedByUserID is the approving user.
	ApprovedByUserID string `json:"approved_by_user_id"`
	// IssuedAt is the time the payload was issued.
	IssuedAt time.Time `json:"issued_at"`
	// ControlPlaneOnly and ProductionForwarding are emitted verbatim.
	ControlPlaneOnly     bool `json:"control_plane_only"`
	ProductionForwarding bool `json:"production_forwarding"`
}
|
|
|
|
// clusterMeshConfigAuthorityPayload is the JSON document describing a mesh
// configuration snapshot for one node. NOTE(review): presumably signed by the
// cluster authority under clusterMeshConfigAuthoritySchema — confirm at the
// signing site.
type clusterMeshConfigAuthorityPayload struct {
	// SchemaVersion names the payload schema.
	SchemaVersion string `json:"schema_version"`
	// ClusterID is the cluster the snapshot belongs to.
	ClusterID string `json:"cluster_id"`
	// LocalNodeID is the node the snapshot is addressed to.
	LocalNodeID string `json:"local_node_id"`
	// ConfigVersion and ConfigSHA256 identify the snapshot contents.
	ConfigVersion string `json:"config_version"`
	ConfigSHA256  string `json:"config_sha256"`
	// IssuedAt and ExpiresAt bound the validity period recorded in the payload.
	IssuedAt  time.Time `json:"issued_at"`
	ExpiresAt time.Time `json:"expires_at"`
	// ControlPlaneOnly and ProductionForwarding are emitted verbatim.
	ControlPlaneOnly     bool `json:"control_plane_only"`
	ProductionForwarding bool `json:"production_forwarding"`
}
|
|
|
|
func (s *Service) ListClusters(ctx context.Context, actorUserID string) ([]Cluster, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListClusters(ctx)
|
|
}
|
|
|
|
func (s *Service) GetCluster(ctx context.Context, actorUserID, clusterID string) (Cluster, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return Cluster{}, err
|
|
}
|
|
item, err := s.store.GetCluster(ctx, clusterID)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return Cluster{}, ErrInvalidCluster
|
|
}
|
|
return item, err
|
|
}
|
|
|
|
func (s *Service) GetFabricServiceChannelRecoveryPolicy(ctx context.Context, actorUserID, clusterID string) (FabricServiceChannelRecoveryPolicy, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return FabricServiceChannelRecoveryPolicy{}, err
|
|
}
|
|
cluster, err := s.store.GetCluster(ctx, strings.TrimSpace(clusterID))
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return FabricServiceChannelRecoveryPolicy{}, ErrInvalidCluster
|
|
}
|
|
if err != nil {
|
|
return FabricServiceChannelRecoveryPolicy{}, err
|
|
}
|
|
return fabricServiceChannelRecoveryPolicyFromCluster(cluster), nil
|
|
}
|
|
|
|
func (s *Service) UpdateFabricServiceChannelRecoveryPolicy(ctx context.Context, input UpdateFabricServiceChannelRecoveryPolicyInput) (FabricServiceChannelRecoveryPolicy, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return FabricServiceChannelRecoveryPolicy{}, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
if input.ClusterID == "" {
|
|
return FabricServiceChannelRecoveryPolicy{}, ErrInvalidCluster
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return FabricServiceChannelRecoveryPolicy{}, err
|
|
}
|
|
cluster, err := s.store.GetCluster(ctx, input.ClusterID)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return FabricServiceChannelRecoveryPolicy{}, ErrInvalidCluster
|
|
}
|
|
if err != nil {
|
|
return FabricServiceChannelRecoveryPolicy{}, err
|
|
}
|
|
policy := fabricServiceChannelRecoveryPolicyFromCluster(cluster)
|
|
if input.HysteresisPenalty > 0 {
|
|
policy.HysteresisPenalty = clampInt(input.HysteresisPenalty, 0, 10000)
|
|
}
|
|
if input.PromotionMinSamples > 0 {
|
|
policy.PromotionMinSamples = clampInt(input.PromotionMinSamples, 1, 100000)
|
|
}
|
|
if input.DemotionFailureThreshold > 0 {
|
|
policy.DemotionFailureThreshold = clampInt(input.DemotionFailureThreshold, 1, 100000)
|
|
}
|
|
if input.DemotionDropThreshold > 0 {
|
|
policy.DemotionDropThreshold = clampInt(input.DemotionDropThreshold, 1, 100000)
|
|
}
|
|
if input.DemotionSlowThreshold > 0 {
|
|
policy.DemotionSlowThreshold = clampInt(input.DemotionSlowThreshold, 1, 100000)
|
|
}
|
|
if input.DemotionRebuildEnabled != nil {
|
|
policy.DemotionRebuildEnabled = *input.DemotionRebuildEnabled
|
|
}
|
|
if input.DemotionFencedEnabled != nil {
|
|
policy.DemotionFencedEnabled = *input.DemotionFencedEnabled
|
|
}
|
|
now := s.now().UTC()
|
|
policy.SchemaVersion = "rap.fabric_service_channel_recovery_policy.v1"
|
|
policy.Source = "cluster_metadata"
|
|
policy.UpdatedByUserID = &input.ActorUserID
|
|
policy.UpdatedAt = now
|
|
policy.ControlPlaneOnly = true
|
|
policy.ProductionForwarding = false
|
|
metadata, err := upsertFabricServiceChannelRecoveryPolicyMetadata(cluster.Metadata, policy)
|
|
if err != nil {
|
|
return FabricServiceChannelRecoveryPolicy{}, err
|
|
}
|
|
updated, err := s.store.UpdateCluster(ctx, UpdateClusterInput{
|
|
ActorUserID: input.ActorUserID,
|
|
ClusterID: cluster.ID,
|
|
Name: cluster.Name,
|
|
Status: cluster.Status,
|
|
Region: cluster.Region,
|
|
Metadata: metadata,
|
|
})
|
|
if err != nil {
|
|
return FabricServiceChannelRecoveryPolicy{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &cluster.ID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "fabric.service_channel.recovery_policy.updated",
|
|
TargetType: "cluster",
|
|
TargetID: &cluster.ID,
|
|
Payload: metadata,
|
|
CreatedAt: now,
|
|
})
|
|
return fabricServiceChannelRecoveryPolicyFromCluster(updated), nil
|
|
}
|
|
|
|
func (s *Service) GetFabricServiceChannelAdaptivePolicy(ctx context.Context, actorUserID, clusterID string) (FabricServiceChannelAdaptivePolicy, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return FabricServiceChannelAdaptivePolicy{}, err
|
|
}
|
|
cluster, err := s.store.GetCluster(ctx, strings.TrimSpace(clusterID))
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return FabricServiceChannelAdaptivePolicy{}, ErrInvalidCluster
|
|
}
|
|
if err != nil {
|
|
return FabricServiceChannelAdaptivePolicy{}, err
|
|
}
|
|
return fabricServiceChannelAdaptivePolicyFromCluster(cluster), nil
|
|
}
|
|
|
|
func (s *Service) UpdateFabricServiceChannelAdaptivePolicy(ctx context.Context, input UpdateFabricServiceChannelAdaptivePolicyInput) (FabricServiceChannelAdaptivePolicy, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return FabricServiceChannelAdaptivePolicy{}, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
if input.ClusterID == "" {
|
|
return FabricServiceChannelAdaptivePolicy{}, ErrInvalidCluster
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return FabricServiceChannelAdaptivePolicy{}, err
|
|
}
|
|
cluster, err := s.store.GetCluster(ctx, input.ClusterID)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return FabricServiceChannelAdaptivePolicy{}, ErrInvalidCluster
|
|
}
|
|
if err != nil {
|
|
return FabricServiceChannelAdaptivePolicy{}, err
|
|
}
|
|
policy := fabricServiceChannelAdaptivePolicyFromCluster(cluster)
|
|
if input.MaxParallelWindow > 0 {
|
|
policy.MaxParallelWindow = clampInt(input.MaxParallelWindow, 1, 64)
|
|
}
|
|
if input.BulkPressureChannelThreshold > 0 {
|
|
policy.BulkPressureChannelThreshold = clampInt(input.BulkPressureChannelThreshold, 1, 100000)
|
|
}
|
|
if input.QueuePressureHighWatermark > 0 {
|
|
policy.QueuePressureHighWatermark = clampInt(input.QueuePressureHighWatermark, 1, 100000)
|
|
}
|
|
if input.QueuePressureMaxInFlight > 0 {
|
|
policy.QueuePressureMaxInFlight = clampInt(input.QueuePressureMaxInFlight, 1, 100000)
|
|
}
|
|
if len(input.ClassWindows) > 0 {
|
|
policy.ClassWindows = normalizeFabricServiceChannelAdaptiveClassWindows(input.ClassWindows, policy.MaxParallelWindow)
|
|
}
|
|
now := s.now().UTC()
|
|
policy.SchemaVersion = "rap.fabric_service_channel_adaptive_policy.v1"
|
|
policy.Source = "cluster_metadata"
|
|
policy.UpdatedByUserID = &input.ActorUserID
|
|
policy.UpdatedAt = now
|
|
policy.ControlPlaneOnly = true
|
|
policy.ProductionForwarding = false
|
|
metadata, err := upsertFabricServiceChannelAdaptivePolicyMetadata(cluster.Metadata, policy)
|
|
if err != nil {
|
|
return FabricServiceChannelAdaptivePolicy{}, err
|
|
}
|
|
updated, err := s.store.UpdateCluster(ctx, UpdateClusterInput{
|
|
ActorUserID: input.ActorUserID,
|
|
ClusterID: cluster.ID,
|
|
Name: cluster.Name,
|
|
Status: cluster.Status,
|
|
Region: cluster.Region,
|
|
Metadata: metadata,
|
|
})
|
|
if err != nil {
|
|
return FabricServiceChannelAdaptivePolicy{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &cluster.ID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "fabric.service_channel.adaptive_policy.updated",
|
|
TargetType: "cluster",
|
|
TargetID: &cluster.ID,
|
|
Payload: metadata,
|
|
CreatedAt: now,
|
|
})
|
|
return fabricServiceChannelAdaptivePolicyFromCluster(updated), nil
|
|
}
|
|
|
|
func (s *Service) GetFabricServiceChannelPoolPolicy(ctx context.Context, actorUserID, clusterID string) (FabricServiceChannelPoolPolicy, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return FabricServiceChannelPoolPolicy{}, err
|
|
}
|
|
cluster, err := s.store.GetCluster(ctx, strings.TrimSpace(clusterID))
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return FabricServiceChannelPoolPolicy{}, ErrInvalidCluster
|
|
}
|
|
if err != nil {
|
|
return FabricServiceChannelPoolPolicy{}, err
|
|
}
|
|
return fabricServiceChannelPoolPolicyFromCluster(cluster), nil
|
|
}
|
|
|
|
func (s *Service) UpdateFabricServiceChannelPoolPolicy(ctx context.Context, input UpdateFabricServiceChannelPoolPolicyInput) (FabricServiceChannelPoolPolicy, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return FabricServiceChannelPoolPolicy{}, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
if input.ClusterID == "" {
|
|
return FabricServiceChannelPoolPolicy{}, ErrInvalidCluster
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return FabricServiceChannelPoolPolicy{}, err
|
|
}
|
|
cluster, err := s.store.GetCluster(ctx, input.ClusterID)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return FabricServiceChannelPoolPolicy{}, ErrInvalidCluster
|
|
}
|
|
if err != nil {
|
|
return FabricServiceChannelPoolPolicy{}, err
|
|
}
|
|
policy := fabricServiceChannelPoolPolicyFromCluster(cluster)
|
|
policy.EntryPoolNodeIDs = dedupeStrings(input.EntryPoolNodeIDs)
|
|
policy.ExitPoolNodeIDs = dedupeStrings(input.ExitPoolNodeIDs)
|
|
policy.PreferredEntryNodeID = strings.TrimSpace(input.PreferredEntryNodeID)
|
|
policy.PreferredExitNodeID = strings.TrimSpace(input.PreferredExitNodeID)
|
|
if input.SelectionStrategy != "" {
|
|
policy.SelectionStrategy = strings.TrimSpace(input.SelectionStrategy)
|
|
}
|
|
if input.RouteRebuild != "" {
|
|
policy.RouteRebuild = strings.TrimSpace(input.RouteRebuild)
|
|
}
|
|
if input.EntryFailover != "" {
|
|
policy.EntryFailover = strings.TrimSpace(input.EntryFailover)
|
|
}
|
|
if input.ExitFailover != "" {
|
|
policy.ExitFailover = strings.TrimSpace(input.ExitFailover)
|
|
}
|
|
if input.BackendFallbackAllowed != nil {
|
|
policy.BackendFallbackAllowed = *input.BackendFallbackAllowed
|
|
}
|
|
if input.StickySession != nil {
|
|
policy.StickySession = *input.StickySession
|
|
}
|
|
now := s.now().UTC()
|
|
policy.SchemaVersion = "rap.fabric_service_channel_pool_policy.v1"
|
|
policy.Source = "cluster_metadata"
|
|
policy.UpdatedByUserID = &input.ActorUserID
|
|
policy.UpdatedAt = now
|
|
policy.ControlPlaneOnly = true
|
|
policy.ProductionForwarding = false
|
|
policy = normalizeFabricServiceChannelPoolPolicy(policy, defaultFabricServiceChannelPoolPolicy())
|
|
metadata, err := upsertFabricServiceChannelPoolPolicyMetadata(cluster.Metadata, policy)
|
|
if err != nil {
|
|
return FabricServiceChannelPoolPolicy{}, err
|
|
}
|
|
updated, err := s.store.UpdateCluster(ctx, UpdateClusterInput{
|
|
ActorUserID: input.ActorUserID,
|
|
ClusterID: cluster.ID,
|
|
Name: cluster.Name,
|
|
Status: cluster.Status,
|
|
Region: cluster.Region,
|
|
Metadata: metadata,
|
|
})
|
|
if err != nil {
|
|
return FabricServiceChannelPoolPolicy{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &cluster.ID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "fabric.service_channel.pool_policy.updated",
|
|
TargetType: "cluster",
|
|
TargetID: &cluster.ID,
|
|
Payload: metadata,
|
|
CreatedAt: now,
|
|
})
|
|
return fabricServiceChannelPoolPolicyFromCluster(updated), nil
|
|
}
|
|
|
|
func (s *Service) GetFabricServiceChannelBreadcrumbWindowPolicy(ctx context.Context, actorUserID, clusterID string) (FabricServiceChannelBreadcrumbWindowPolicy, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return FabricServiceChannelBreadcrumbWindowPolicy{}, err
|
|
}
|
|
cluster, err := s.store.GetCluster(ctx, strings.TrimSpace(clusterID))
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return FabricServiceChannelBreadcrumbWindowPolicy{}, ErrInvalidCluster
|
|
}
|
|
if err != nil {
|
|
return FabricServiceChannelBreadcrumbWindowPolicy{}, err
|
|
}
|
|
return fabricServiceChannelBreadcrumbWindowPolicyFromCluster(cluster), nil
|
|
}
|
|
|
|
func (s *Service) UpdateFabricServiceChannelBreadcrumbWindowPolicy(ctx context.Context, input UpdateFabricServiceChannelBreadcrumbWindowPolicyInput) (FabricServiceChannelBreadcrumbWindowPolicy, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return FabricServiceChannelBreadcrumbWindowPolicy{}, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
if input.ClusterID == "" {
|
|
return FabricServiceChannelBreadcrumbWindowPolicy{}, ErrInvalidCluster
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return FabricServiceChannelBreadcrumbWindowPolicy{}, err
|
|
}
|
|
cluster, err := s.store.GetCluster(ctx, input.ClusterID)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return FabricServiceChannelBreadcrumbWindowPolicy{}, ErrInvalidCluster
|
|
}
|
|
if err != nil {
|
|
return FabricServiceChannelBreadcrumbWindowPolicy{}, err
|
|
}
|
|
policy := fabricServiceChannelBreadcrumbWindowPolicyFromCluster(cluster)
|
|
if input.CurrentWindowSeconds > 0 {
|
|
policy.CurrentWindowSeconds = input.CurrentWindowSeconds
|
|
}
|
|
if input.HistoryWindowSeconds > 0 {
|
|
policy.HistoryWindowSeconds = input.HistoryWindowSeconds
|
|
}
|
|
now := s.now().UTC()
|
|
policy.SchemaVersion = "rap.fabric_service_channel_breadcrumb_window_policy.v1"
|
|
policy.Source = "cluster_metadata"
|
|
policy.UpdatedByUserID = &input.ActorUserID
|
|
policy.UpdatedAt = now
|
|
policy.ControlPlaneOnly = true
|
|
policy.ProductionForwarding = false
|
|
policy = normalizeFabricServiceChannelBreadcrumbWindowPolicy(policy, defaultFabricServiceChannelBreadcrumbWindowPolicy())
|
|
metadata, err := upsertFabricServiceChannelBreadcrumbWindowPolicyMetadata(cluster.Metadata, policy)
|
|
if err != nil {
|
|
return FabricServiceChannelBreadcrumbWindowPolicy{}, err
|
|
}
|
|
updated, err := s.store.UpdateCluster(ctx, UpdateClusterInput{
|
|
ActorUserID: input.ActorUserID,
|
|
ClusterID: cluster.ID,
|
|
Name: cluster.Name,
|
|
Status: cluster.Status,
|
|
Region: cluster.Region,
|
|
Metadata: metadata,
|
|
})
|
|
if err != nil {
|
|
return FabricServiceChannelBreadcrumbWindowPolicy{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &cluster.ID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "fabric.service_channel.breadcrumb_window_policy.updated",
|
|
TargetType: "cluster",
|
|
TargetID: &cluster.ID,
|
|
Payload: metadata,
|
|
CreatedAt: now,
|
|
})
|
|
return fabricServiceChannelBreadcrumbWindowPolicyFromCluster(updated), nil
|
|
}
|
|
|
|
func (s *Service) CreateCluster(ctx context.Context, input CreateClusterInput) (Cluster, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return Cluster{}, err
|
|
}
|
|
input.Slug = strings.TrimSpace(input.Slug)
|
|
input.Name = strings.TrimSpace(input.Name)
|
|
if input.Slug == "" || input.Name == "" {
|
|
return Cluster{}, ErrInvalidPayload
|
|
}
|
|
input.Metadata = defaultJSON(input.Metadata, `{}`)
|
|
if !json.Valid(input.Metadata) {
|
|
return Cluster{}, errors.New("metadata must be valid json")
|
|
}
|
|
item, err := s.store.CreateCluster(ctx, input)
|
|
if err != nil {
|
|
return Cluster{}, err
|
|
}
|
|
auditPayload := json.RawMessage(`{}`)
|
|
if authorityKey, err := s.ensureClusterAuthority(ctx, item.ID, &input.ActorUserID); err == nil {
|
|
auditPayload, _ = json.Marshal(map[string]any{
|
|
"cluster_authority": map[string]any{
|
|
"key_algorithm": authorityKey.KeyAlgorithm,
|
|
"public_key_fingerprint": authorityKey.PublicKeyFingerprint,
|
|
},
|
|
})
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &item.ID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "cluster.created",
|
|
TargetType: "cluster",
|
|
TargetID: &item.ID,
|
|
Payload: auditPayload,
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) ensureClusterAuthority(ctx context.Context, clusterID string, actorUserID *string) (ClusterAuthorityKey, error) {
|
|
authorityKey, err := s.store.GetClusterAuthority(ctx, clusterID)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return s.store.EnsureClusterAuthority(ctx, clusterID, actorUserID)
|
|
}
|
|
return authorityKey, err
|
|
}
|
|
|
|
func authorityDescriptor(authorityKey ClusterAuthorityKey) *ClusterAuthorityDescriptor {
|
|
descriptor := authorityKey.ClusterAuthorityDescriptor
|
|
if descriptor.SchemaVersion == "" {
|
|
descriptor.SchemaVersion = clusterauth.AuthoritySchemaVersion
|
|
}
|
|
return &descriptor
|
|
}
|
|
|
|
func defaultFabricServiceChannelRecoveryPolicy() FabricServiceChannelRecoveryPolicy {
|
|
return FabricServiceChannelRecoveryPolicy{
|
|
SchemaVersion: "rap.fabric_service_channel_recovery_policy.v1",
|
|
HysteresisPenalty: fabricServiceChannelRecoveryHysteresisPenalty,
|
|
PromotionMinSamples: fabricServiceChannelRecoveryPromotionMinSamples,
|
|
DemotionFailureThreshold: 1,
|
|
DemotionDropThreshold: 1,
|
|
DemotionSlowThreshold: 1,
|
|
DemotionRebuildEnabled: true,
|
|
DemotionFencedEnabled: true,
|
|
Source: "defaults",
|
|
ControlPlaneOnly: true,
|
|
ProductionForwarding: false,
|
|
}
|
|
}
|
|
|
|
func fabricServiceChannelRecoveryPolicyFromCluster(cluster Cluster) FabricServiceChannelRecoveryPolicy {
|
|
policy := defaultFabricServiceChannelRecoveryPolicy()
|
|
if len(cluster.Metadata) == 0 || !json.Valid(cluster.Metadata) {
|
|
return policy
|
|
}
|
|
var raw struct {
|
|
Policy *FabricServiceChannelRecoveryPolicy `json:"fabric_service_channel_recovery_policy"`
|
|
}
|
|
if err := json.Unmarshal(cluster.Metadata, &raw); err != nil || raw.Policy == nil {
|
|
return policy
|
|
}
|
|
policy = normalizeFabricServiceChannelRecoveryPolicy(*raw.Policy, policy)
|
|
policy.Source = "cluster_metadata"
|
|
return policy
|
|
}
|
|
|
|
func normalizeFabricServiceChannelRecoveryPolicy(input FabricServiceChannelRecoveryPolicy, fallback FabricServiceChannelRecoveryPolicy) FabricServiceChannelRecoveryPolicy {
|
|
if input.SchemaVersion == "" {
|
|
input.SchemaVersion = "rap.fabric_service_channel_recovery_policy.v1"
|
|
}
|
|
if input.HysteresisPenalty < 0 {
|
|
input.HysteresisPenalty = fallback.HysteresisPenalty
|
|
}
|
|
if input.HysteresisPenalty == 0 {
|
|
input.HysteresisPenalty = fallback.HysteresisPenalty
|
|
}
|
|
if input.PromotionMinSamples <= 0 {
|
|
input.PromotionMinSamples = fallback.PromotionMinSamples
|
|
}
|
|
if input.DemotionFailureThreshold <= 0 {
|
|
input.DemotionFailureThreshold = fallback.DemotionFailureThreshold
|
|
}
|
|
if input.DemotionDropThreshold <= 0 {
|
|
input.DemotionDropThreshold = fallback.DemotionDropThreshold
|
|
}
|
|
if input.DemotionSlowThreshold <= 0 {
|
|
input.DemotionSlowThreshold = fallback.DemotionSlowThreshold
|
|
}
|
|
if input.Source == "" {
|
|
input.Source = fallback.Source
|
|
}
|
|
input.ControlPlaneOnly = true
|
|
input.ProductionForwarding = false
|
|
input.Fingerprint = fabricServiceChannelRecoveryPolicyFingerprint(input)
|
|
return input
|
|
}
|
|
|
|
func upsertFabricServiceChannelRecoveryPolicyMetadata(metadata json.RawMessage, policy FabricServiceChannelRecoveryPolicy) (json.RawMessage, error) {
|
|
raw := map[string]any{}
|
|
if len(metadata) > 0 && json.Valid(metadata) {
|
|
if err := json.Unmarshal(metadata, &raw); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
raw["fabric_service_channel_recovery_policy"] = policy
|
|
out, err := json.Marshal(raw)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return json.RawMessage(out), nil
|
|
}
|
|
|
|
func fabricServiceChannelRecoveryPolicyRef(policy FabricServiceChannelRecoveryPolicy) *FabricServiceChannelRecoveryPolicy {
|
|
normalized := normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy())
|
|
return &normalized
|
|
}
|
|
|
|
func fabricServiceChannelRecoveryPolicyFingerprint(policy FabricServiceChannelRecoveryPolicy) string {
|
|
policy.Fingerprint = ""
|
|
policy.UpdatedAt = time.Time{}
|
|
policy.UpdatedByUserID = nil
|
|
raw, err := json.Marshal(struct {
|
|
SchemaVersion string `json:"schema_version"`
|
|
HysteresisPenalty int `json:"hysteresis_penalty"`
|
|
PromotionMinSamples int `json:"promotion_min_samples"`
|
|
DemotionFailureThreshold int `json:"demotion_failure_threshold"`
|
|
DemotionDropThreshold int `json:"demotion_drop_threshold"`
|
|
DemotionSlowThreshold int `json:"demotion_slow_threshold"`
|
|
DemotionRebuildEnabled bool `json:"demotion_rebuild_enabled"`
|
|
DemotionFencedEnabled bool `json:"demotion_fenced_enabled"`
|
|
ControlPlaneOnly bool `json:"control_plane_only"`
|
|
ProductionForwarding bool `json:"production_forwarding"`
|
|
}{
|
|
SchemaVersion: policy.SchemaVersion,
|
|
HysteresisPenalty: policy.HysteresisPenalty,
|
|
PromotionMinSamples: policy.PromotionMinSamples,
|
|
DemotionFailureThreshold: policy.DemotionFailureThreshold,
|
|
DemotionDropThreshold: policy.DemotionDropThreshold,
|
|
DemotionSlowThreshold: policy.DemotionSlowThreshold,
|
|
DemotionRebuildEnabled: policy.DemotionRebuildEnabled,
|
|
DemotionFencedEnabled: policy.DemotionFencedEnabled,
|
|
ControlPlaneOnly: true,
|
|
ProductionForwarding: false,
|
|
})
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
sum := sha256.Sum256(raw)
|
|
return hex.EncodeToString(sum[:])
|
|
}
|
|
|
|
func defaultFabricServiceChannelAdaptivePolicy() FabricServiceChannelAdaptivePolicy {
|
|
return normalizeFabricServiceChannelAdaptivePolicy(FabricServiceChannelAdaptivePolicy{
|
|
SchemaVersion: "rap.fabric_service_channel_adaptive_policy.v1",
|
|
MaxParallelWindow: 4,
|
|
BulkPressureChannelThreshold: 16,
|
|
QueuePressureHighWatermark: 16,
|
|
QueuePressureMaxInFlight: 16,
|
|
ClassWindows: map[string]int{
|
|
"control": 4,
|
|
"interactive": 4,
|
|
"reliable": 3,
|
|
"bulk": 1,
|
|
"droppable": 1,
|
|
},
|
|
Source: "defaults",
|
|
ControlPlaneOnly: true,
|
|
ProductionForwarding: false,
|
|
}, FabricServiceChannelAdaptivePolicy{})
|
|
}
|
|
|
|
func fabricServiceChannelAdaptivePolicyFromCluster(cluster Cluster) FabricServiceChannelAdaptivePolicy {
|
|
fallback := defaultFabricServiceChannelAdaptivePolicy()
|
|
if len(cluster.Metadata) == 0 || !json.Valid(cluster.Metadata) {
|
|
return fallback
|
|
}
|
|
var raw struct {
|
|
Policy *FabricServiceChannelAdaptivePolicy `json:"fabric_service_channel_adaptive_policy"`
|
|
}
|
|
if err := json.Unmarshal(cluster.Metadata, &raw); err != nil || raw.Policy == nil {
|
|
return fallback
|
|
}
|
|
policy := normalizeFabricServiceChannelAdaptivePolicy(*raw.Policy, fallback)
|
|
policy.Source = "cluster_metadata"
|
|
return policy
|
|
}
|
|
|
|
func normalizeFabricServiceChannelAdaptivePolicy(input FabricServiceChannelAdaptivePolicy, fallback FabricServiceChannelAdaptivePolicy) FabricServiceChannelAdaptivePolicy {
|
|
if input.SchemaVersion == "" {
|
|
input.SchemaVersion = "rap.fabric_service_channel_adaptive_policy.v1"
|
|
}
|
|
if fallback.MaxParallelWindow <= 0 {
|
|
fallback.MaxParallelWindow = 4
|
|
}
|
|
if input.MaxParallelWindow <= 0 {
|
|
input.MaxParallelWindow = fallback.MaxParallelWindow
|
|
}
|
|
input.MaxParallelWindow = clampInt(input.MaxParallelWindow, 1, 64)
|
|
if input.BulkPressureChannelThreshold <= 0 {
|
|
input.BulkPressureChannelThreshold = firstPositive(fallback.BulkPressureChannelThreshold, 16)
|
|
}
|
|
if input.QueuePressureHighWatermark <= 0 {
|
|
input.QueuePressureHighWatermark = firstPositive(fallback.QueuePressureHighWatermark, 16)
|
|
}
|
|
if input.QueuePressureMaxInFlight <= 0 {
|
|
input.QueuePressureMaxInFlight = firstPositive(fallback.QueuePressureMaxInFlight, 16)
|
|
}
|
|
input.ClassWindows = normalizeFabricServiceChannelAdaptiveClassWindows(firstNonNilStringIntMap(input.ClassWindows, fallback.ClassWindows), input.MaxParallelWindow)
|
|
if input.Source == "" {
|
|
input.Source = fallback.Source
|
|
}
|
|
if input.Source == "" {
|
|
input.Source = "defaults"
|
|
}
|
|
input.ControlPlaneOnly = true
|
|
input.ProductionForwarding = false
|
|
input.Fingerprint = fabricServiceChannelAdaptivePolicyFingerprint(input)
|
|
return input
|
|
}
|
|
|
|
func normalizeFabricServiceChannelAdaptiveClassWindows(values map[string]int, maxWindow int) map[string]int {
|
|
if maxWindow <= 0 {
|
|
maxWindow = 4
|
|
}
|
|
defaults := map[string]int{"control": maxWindow, "interactive": maxWindow, "reliable": boundedMinInt(maxWindow, 3), "bulk": 1, "droppable": 1}
|
|
out := map[string]int{}
|
|
for key, fallback := range defaults {
|
|
value := values[key]
|
|
if value <= 0 {
|
|
value = fallback
|
|
}
|
|
out[key] = clampInt(value, 1, maxWindow)
|
|
}
|
|
return out
|
|
}
|
|
|
|
func upsertFabricServiceChannelAdaptivePolicyMetadata(metadata json.RawMessage, policy FabricServiceChannelAdaptivePolicy) (json.RawMessage, error) {
|
|
raw := map[string]any{}
|
|
if len(metadata) > 0 && json.Valid(metadata) {
|
|
if err := json.Unmarshal(metadata, &raw); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
raw["fabric_service_channel_adaptive_policy"] = policy
|
|
out, err := json.Marshal(raw)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return json.RawMessage(out), nil
|
|
}
|
|
|
|
func fabricServiceChannelAdaptivePolicyFingerprint(policy FabricServiceChannelAdaptivePolicy) string {
|
|
raw, err := json.Marshal(struct {
|
|
SchemaVersion string `json:"schema_version"`
|
|
MaxParallelWindow int `json:"max_parallel_window"`
|
|
BulkPressureChannelThreshold int `json:"bulk_pressure_channel_threshold"`
|
|
QueuePressureHighWatermark int `json:"queue_pressure_high_watermark"`
|
|
QueuePressureMaxInFlight int `json:"queue_pressure_max_in_flight"`
|
|
ClassWindows map[string]int `json:"class_windows"`
|
|
ControlPlaneOnly bool `json:"control_plane_only"`
|
|
ProductionForwarding bool `json:"production_forwarding"`
|
|
}{
|
|
SchemaVersion: policy.SchemaVersion,
|
|
MaxParallelWindow: policy.MaxParallelWindow,
|
|
BulkPressureChannelThreshold: policy.BulkPressureChannelThreshold,
|
|
QueuePressureHighWatermark: policy.QueuePressureHighWatermark,
|
|
QueuePressureMaxInFlight: policy.QueuePressureMaxInFlight,
|
|
ClassWindows: policy.ClassWindows,
|
|
ControlPlaneOnly: true,
|
|
ProductionForwarding: false,
|
|
})
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
sum := sha256.Sum256(raw)
|
|
return hex.EncodeToString(sum[:])
|
|
}
|
|
|
|
func defaultFabricServiceChannelPoolPolicy() FabricServiceChannelPoolPolicy {
|
|
return normalizeFabricServiceChannelPoolPolicy(FabricServiceChannelPoolPolicy{
|
|
SchemaVersion: "rap.fabric_service_channel_pool_policy.v1",
|
|
SelectionStrategy: "fastest_healthy",
|
|
RouteRebuild: "automatic",
|
|
EntryFailover: "automatic",
|
|
ExitFailover: "automatic",
|
|
BackendFallbackAllowed: true,
|
|
StickySession: true,
|
|
Source: "defaults",
|
|
ControlPlaneOnly: true,
|
|
ProductionForwarding: false,
|
|
}, FabricServiceChannelPoolPolicy{})
|
|
}
|
|
|
|
func fabricServiceChannelPoolPolicyFromCluster(cluster Cluster) FabricServiceChannelPoolPolicy {
|
|
fallback := defaultFabricServiceChannelPoolPolicy()
|
|
if len(cluster.Metadata) == 0 || !json.Valid(cluster.Metadata) {
|
|
return fallback
|
|
}
|
|
var raw struct {
|
|
Policy *FabricServiceChannelPoolPolicy `json:"fabric_service_channel_pool_policy"`
|
|
}
|
|
if err := json.Unmarshal(cluster.Metadata, &raw); err != nil || raw.Policy == nil {
|
|
return fallback
|
|
}
|
|
policy := normalizeFabricServiceChannelPoolPolicy(*raw.Policy, fallback)
|
|
policy.Source = "cluster_metadata"
|
|
return policy
|
|
}
|
|
|
|
func normalizeFabricServiceChannelPoolPolicy(input FabricServiceChannelPoolPolicy, fallback FabricServiceChannelPoolPolicy) FabricServiceChannelPoolPolicy {
|
|
if input.SchemaVersion == "" {
|
|
input.SchemaVersion = firstNonEmptyString(fallback.SchemaVersion, "rap.fabric_service_channel_pool_policy.v1")
|
|
}
|
|
input.EntryPoolNodeIDs = dedupeStrings(firstNonEmptyStringSlice(input.EntryPoolNodeIDs, fallback.EntryPoolNodeIDs))
|
|
input.ExitPoolNodeIDs = dedupeStrings(firstNonEmptyStringSlice(input.ExitPoolNodeIDs, fallback.ExitPoolNodeIDs))
|
|
input.PreferredEntryNodeID = strings.TrimSpace(firstNonEmptyString(input.PreferredEntryNodeID, fallback.PreferredEntryNodeID))
|
|
input.PreferredExitNodeID = strings.TrimSpace(firstNonEmptyString(input.PreferredExitNodeID, fallback.PreferredExitNodeID))
|
|
input.SelectionStrategy = normalizeFabricServiceChannelPoolPolicyMode(firstNonEmptyString(input.SelectionStrategy, fallback.SelectionStrategy), []string{"fastest_healthy", "preferred_first", "stable_first"}, "fastest_healthy")
|
|
input.RouteRebuild = normalizeFabricServiceChannelPoolPolicyMode(firstNonEmptyString(input.RouteRebuild, fallback.RouteRebuild), []string{"automatic", "manual", "disabled"}, "automatic")
|
|
input.EntryFailover = normalizeFabricServiceChannelPoolPolicyMode(firstNonEmptyString(input.EntryFailover, fallback.EntryFailover), []string{"automatic", "manual", "disabled"}, "automatic")
|
|
input.ExitFailover = normalizeFabricServiceChannelPoolPolicyMode(firstNonEmptyString(input.ExitFailover, fallback.ExitFailover), []string{"automatic", "manual", "disabled"}, "automatic")
|
|
if input.Source == "" {
|
|
input.Source = firstNonEmptyString(fallback.Source, "defaults")
|
|
}
|
|
input.ControlPlaneOnly = true
|
|
input.ProductionForwarding = false
|
|
input.Fingerprint = fabricServiceChannelPoolPolicyFingerprint(input)
|
|
return input
|
|
}
|
|
|
|
// normalizeFabricServiceChannelPoolPolicyMode lower-cases and trims value and
// returns it when it matches one of the allowed entries exactly; otherwise it
// returns fallback unchanged.
func normalizeFabricServiceChannelPoolPolicyMode(value string, allowed []string, fallback string) string {
	candidate := strings.TrimSpace(strings.ToLower(value))
	for i := range allowed {
		if allowed[i] == candidate {
			return candidate
		}
	}
	return fallback
}
|
|
|
|
func upsertFabricServiceChannelPoolPolicyMetadata(metadata json.RawMessage, policy FabricServiceChannelPoolPolicy) (json.RawMessage, error) {
|
|
raw := map[string]any{}
|
|
if len(metadata) > 0 && json.Valid(metadata) {
|
|
if err := json.Unmarshal(metadata, &raw); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
raw["fabric_service_channel_pool_policy"] = policy
|
|
out, err := json.Marshal(raw)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return json.RawMessage(out), nil
|
|
}
|
|
|
|
func fabricServiceChannelPoolPolicyRef(policy FabricServiceChannelPoolPolicy) *FabricServiceChannelPoolPolicy {
|
|
normalized := normalizeFabricServiceChannelPoolPolicy(policy, defaultFabricServiceChannelPoolPolicy())
|
|
return &normalized
|
|
}
|
|
|
|
func fabricServiceChannelPoolPolicyFingerprint(policy FabricServiceChannelPoolPolicy) string {
|
|
raw, err := json.Marshal(struct {
|
|
SchemaVersion string `json:"schema_version"`
|
|
EntryPoolNodeIDs []string `json:"entry_pool_node_ids,omitempty"`
|
|
ExitPoolNodeIDs []string `json:"exit_pool_node_ids,omitempty"`
|
|
PreferredEntryNodeID string `json:"preferred_entry_node_id,omitempty"`
|
|
PreferredExitNodeID string `json:"preferred_exit_node_id,omitempty"`
|
|
SelectionStrategy string `json:"selection_strategy"`
|
|
RouteRebuild string `json:"route_rebuild"`
|
|
EntryFailover string `json:"entry_failover"`
|
|
ExitFailover string `json:"exit_failover"`
|
|
BackendFallbackAllowed bool `json:"backend_fallback_allowed"`
|
|
StickySession bool `json:"sticky_session"`
|
|
ControlPlaneOnly bool `json:"control_plane_only"`
|
|
ProductionForwarding bool `json:"production_forwarding"`
|
|
}{
|
|
SchemaVersion: policy.SchemaVersion,
|
|
EntryPoolNodeIDs: policy.EntryPoolNodeIDs,
|
|
ExitPoolNodeIDs: policy.ExitPoolNodeIDs,
|
|
PreferredEntryNodeID: policy.PreferredEntryNodeID,
|
|
PreferredExitNodeID: policy.PreferredExitNodeID,
|
|
SelectionStrategy: policy.SelectionStrategy,
|
|
RouteRebuild: policy.RouteRebuild,
|
|
EntryFailover: policy.EntryFailover,
|
|
ExitFailover: policy.ExitFailover,
|
|
BackendFallbackAllowed: policy.BackendFallbackAllowed,
|
|
StickySession: policy.StickySession,
|
|
ControlPlaneOnly: true,
|
|
ProductionForwarding: false,
|
|
})
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
sum := sha256.Sum256(raw)
|
|
return hex.EncodeToString(sum[:])
|
|
}
|
|
|
|
func defaultFabricServiceChannelBreadcrumbWindowPolicy() FabricServiceChannelBreadcrumbWindowPolicy {
|
|
return normalizeFabricServiceChannelBreadcrumbWindowPolicy(FabricServiceChannelBreadcrumbWindowPolicy{
|
|
SchemaVersion: "rap.fabric_service_channel_breadcrumb_window_policy.v1",
|
|
CurrentWindowSeconds: int64((30 * time.Minute).Seconds()),
|
|
HistoryWindowSeconds: int64((24 * time.Hour).Seconds()),
|
|
Source: "defaults",
|
|
ControlPlaneOnly: true,
|
|
ProductionForwarding: false,
|
|
}, FabricServiceChannelBreadcrumbWindowPolicy{})
|
|
}
|
|
|
|
func fabricServiceChannelBreadcrumbWindowPolicyFromCluster(cluster Cluster) FabricServiceChannelBreadcrumbWindowPolicy {
|
|
fallback := defaultFabricServiceChannelBreadcrumbWindowPolicy()
|
|
if len(cluster.Metadata) == 0 || !json.Valid(cluster.Metadata) {
|
|
return fallback
|
|
}
|
|
var raw struct {
|
|
Policy *FabricServiceChannelBreadcrumbWindowPolicy `json:"fabric_service_channel_breadcrumb_window_policy"`
|
|
}
|
|
if err := json.Unmarshal(cluster.Metadata, &raw); err != nil || raw.Policy == nil {
|
|
return fallback
|
|
}
|
|
policy := normalizeFabricServiceChannelBreadcrumbWindowPolicy(*raw.Policy, fallback)
|
|
policy.Source = "cluster_metadata"
|
|
return policy
|
|
}
|
|
|
|
func normalizeFabricServiceChannelBreadcrumbWindowPolicy(input FabricServiceChannelBreadcrumbWindowPolicy, fallback FabricServiceChannelBreadcrumbWindowPolicy) FabricServiceChannelBreadcrumbWindowPolicy {
|
|
if input.SchemaVersion == "" {
|
|
input.SchemaVersion = firstNonEmptyString(fallback.SchemaVersion, "rap.fabric_service_channel_breadcrumb_window_policy.v1")
|
|
}
|
|
if input.CurrentWindowSeconds <= 0 {
|
|
input.CurrentWindowSeconds = firstPositiveInt64(fallback.CurrentWindowSeconds, int64((30 * time.Minute).Seconds()))
|
|
}
|
|
if input.HistoryWindowSeconds <= 0 {
|
|
input.HistoryWindowSeconds = firstPositiveInt64(fallback.HistoryWindowSeconds, int64((24 * time.Hour).Seconds()))
|
|
}
|
|
input.CurrentWindowSeconds = clampInt64(input.CurrentWindowSeconds, 60, int64((7 * 24 * time.Hour).Seconds()))
|
|
input.HistoryWindowSeconds = clampInt64(input.HistoryWindowSeconds, input.CurrentWindowSeconds, int64((30 * 24 * time.Hour).Seconds()))
|
|
if input.Source == "" {
|
|
input.Source = firstNonEmptyString(fallback.Source, "defaults")
|
|
}
|
|
input.ControlPlaneOnly = true
|
|
input.ProductionForwarding = false
|
|
input.Fingerprint = fabricServiceChannelBreadcrumbWindowPolicyFingerprint(input)
|
|
return input
|
|
}
|
|
|
|
func upsertFabricServiceChannelBreadcrumbWindowPolicyMetadata(metadata json.RawMessage, policy FabricServiceChannelBreadcrumbWindowPolicy) (json.RawMessage, error) {
|
|
raw := map[string]any{}
|
|
if len(metadata) > 0 && json.Valid(metadata) {
|
|
if err := json.Unmarshal(metadata, &raw); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
raw["fabric_service_channel_breadcrumb_window_policy"] = policy
|
|
out, err := json.Marshal(raw)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return json.RawMessage(out), nil
|
|
}
|
|
|
|
func fabricServiceChannelBreadcrumbWindowPolicyFingerprint(policy FabricServiceChannelBreadcrumbWindowPolicy) string {
|
|
raw, err := json.Marshal(struct {
|
|
SchemaVersion string `json:"schema_version"`
|
|
CurrentWindowSeconds int64 `json:"current_window_seconds"`
|
|
HistoryWindowSeconds int64 `json:"history_window_seconds"`
|
|
ControlPlaneOnly bool `json:"control_plane_only"`
|
|
ProductionForwarding bool `json:"production_forwarding"`
|
|
}{
|
|
SchemaVersion: policy.SchemaVersion,
|
|
CurrentWindowSeconds: policy.CurrentWindowSeconds,
|
|
HistoryWindowSeconds: policy.HistoryWindowSeconds,
|
|
ControlPlaneOnly: true,
|
|
ProductionForwarding: false,
|
|
})
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
sum := sha256.Sum256(raw)
|
|
return hex.EncodeToString(sum[:])
|
|
}
|
|
|
|
// firstNonEmptyStringSlice returns the first argument that has at least one
// element, or nil when every argument is empty. The slice is returned as-is,
// not copied.
func firstNonEmptyStringSlice(values ...[]string) []string {
	for i := 0; i < len(values); i++ {
		if candidate := values[i]; len(candidate) != 0 {
			return candidate
		}
	}
	return nil
}
|
|
|
|
// firstPositive returns the first argument that is strictly greater than
// zero, or 0 when there is none.
func firstPositive(values ...int) int {
	for i := 0; i < len(values); i++ {
		if candidate := values[i]; candidate > 0 {
			return candidate
		}
	}
	return 0
}
|
|
|
|
// firstPositiveInt64 returns the first argument that is strictly greater than
// zero, or 0 when there is none.
func firstPositiveInt64(values ...int64) int64 {
	for i := 0; i < len(values); i++ {
		if candidate := values[i]; candidate > 0 {
			return candidate
		}
	}
	return 0
}
|
|
|
|
// firstNonNilStringIntMap returns the first argument that holds at least one
// entry, or nil when every argument is nil or empty. Despite the name, an
// allocated but empty map is skipped as well. The map is returned as-is, not
// copied.
func firstNonNilStringIntMap(values ...map[string]int) map[string]int {
	for i := 0; i < len(values); i++ {
		if candidate := values[i]; len(candidate) != 0 {
			return candidate
		}
	}
	return nil
}
|
|
|
|
// boundedMinInt returns the smaller of a and b.
func boundedMinInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|
|
|
|
// clampInt limits value to the range [minValue, maxValue]. The lower bound is
// checked first, so it wins if the bounds are inverted and value lies below
// it.
func clampInt(value, minValue, maxValue int) int {
	switch {
	case value < minValue:
		return minValue
	case value > maxValue:
		return maxValue
	default:
		return value
	}
}
|
|
|
|
// clampInt64 limits value to the range [minValue, maxValue]. The lower bound
// is checked first, so it wins if the bounds are inverted and value lies
// below it.
func clampInt64(value, minValue, maxValue int64) int64 {
	switch {
	case value < minValue:
		return minValue
	case value > maxValue:
		return maxValue
	default:
		return value
	}
}
|
|
|
|
func (s *Service) UpdateCluster(ctx context.Context, input UpdateClusterInput) (Cluster, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return Cluster{}, err
|
|
}
|
|
if input.ClusterID == "" {
|
|
return Cluster{}, ErrInvalidCluster
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return Cluster{}, err
|
|
}
|
|
input.Name = strings.TrimSpace(input.Name)
|
|
input.Status = strings.TrimSpace(input.Status)
|
|
if input.Name == "" {
|
|
return Cluster{}, ErrInvalidPayload
|
|
}
|
|
if input.Status == "" {
|
|
input.Status = ClusterStatusActive
|
|
}
|
|
if input.Status != ClusterStatusActive && input.Status != ClusterStatusDisabled {
|
|
return Cluster{}, ErrInvalidPayload
|
|
}
|
|
input.Metadata = defaultJSON(input.Metadata, `{}`)
|
|
if !json.Valid(input.Metadata) {
|
|
return Cluster{}, errors.New("metadata must be valid json")
|
|
}
|
|
item, err := s.store.UpdateCluster(ctx, input)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return Cluster{}, ErrInvalidCluster
|
|
}
|
|
if err != nil {
|
|
return Cluster{}, err
|
|
}
|
|
payload, _ := json.Marshal(map[string]any{
|
|
"name": item.Name,
|
|
"status": item.Status,
|
|
"region": item.Region,
|
|
})
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &item.ID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "cluster.updated",
|
|
TargetType: "cluster",
|
|
TargetID: &item.ID,
|
|
Payload: payload,
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) ListClusterNodes(ctx context.Context, actorUserID, clusterID string) ([]ClusterNode, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListClusterNodes(ctx, clusterID)
|
|
}
|
|
|
|
func (s *Service) ListNodeGroups(ctx context.Context, actorUserID, clusterID string) ([]ClusterNodeGroup, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListNodeGroups(ctx, clusterID)
|
|
}
|
|
|
|
func (s *Service) CreateNodeGroup(ctx context.Context, input CreateNodeGroupInput) (ClusterNodeGroup, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return ClusterNodeGroup{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return ClusterNodeGroup{}, err
|
|
}
|
|
input.Name = strings.TrimSpace(input.Name)
|
|
if input.ClusterID == "" || input.Name == "" {
|
|
return ClusterNodeGroup{}, ErrInvalidPayload
|
|
}
|
|
if input.Description != nil {
|
|
trimmed := strings.TrimSpace(*input.Description)
|
|
input.Description = &trimmed
|
|
}
|
|
input.Metadata = defaultJSON(input.Metadata, `{}`)
|
|
if !json.Valid(input.Metadata) {
|
|
return ClusterNodeGroup{}, errors.New("node group metadata must be valid json")
|
|
}
|
|
item, err := s.store.CreateNodeGroup(ctx, input)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return ClusterNodeGroup{}, ErrInvalidPayload
|
|
}
|
|
return item, err
|
|
}
|
|
|
|
func (s *Service) CreateJoinToken(ctx context.Context, input CreateJoinTokenInput) (CreatedJoinToken, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return CreatedJoinToken{}, err
|
|
}
|
|
if input.ClusterID == "" {
|
|
return CreatedJoinToken{}, ErrInvalidCluster
|
|
}
|
|
input.Scope = defaultJSON(input.Scope, `{}`)
|
|
if !json.Valid(input.Scope) {
|
|
return CreatedJoinToken{}, errors.New("scope must be valid json")
|
|
}
|
|
if input.ExpiresAt.IsZero() {
|
|
input.ExpiresAt = defaultJoinTokenExpiry(s.now())
|
|
}
|
|
if input.ExpiresAt.Before(s.now()) {
|
|
return CreatedJoinToken{}, errors.New("expires_at must be in the future")
|
|
}
|
|
if input.MaxUses <= 0 {
|
|
input.MaxUses = 1
|
|
}
|
|
rawToken, err := generateJoinToken()
|
|
if err != nil {
|
|
return CreatedJoinToken{}, err
|
|
}
|
|
tokenHash, err := hashJoinToken(rawToken)
|
|
if err != nil {
|
|
return CreatedJoinToken{}, err
|
|
}
|
|
item, err := s.store.CreateJoinToken(ctx, input, tokenHash)
|
|
if err != nil {
|
|
return CreatedJoinToken{}, err
|
|
}
|
|
item, err = s.signJoinToken(ctx, input, item)
|
|
if err != nil {
|
|
return CreatedJoinToken{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "node_join_token.created",
|
|
TargetType: "node_join_token",
|
|
TargetID: &item.ID,
|
|
Payload: json.RawMessage(`{"raw_token_returned_once":true}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return CreatedJoinToken{NodeJoinToken: item, Token: rawToken}, nil
|
|
}
|
|
|
|
func (s *Service) ListJoinTokens(ctx context.Context, actorUserID, clusterID string) ([]NodeJoinToken, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
if err := s.store.ExpireJoinTokens(ctx, clusterID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListJoinTokens(ctx, clusterID)
|
|
}
|
|
|
|
func (s *Service) GetDockerInstallProfile(ctx context.Context, input DockerInstallProfileRequest) (DockerInstallProfile, error) {
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.InstallToken = strings.TrimSpace(input.InstallToken)
|
|
if input.ClusterID == "" || input.InstallToken == "" {
|
|
return DockerInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
if err := s.store.ExpireJoinTokens(ctx, input.ClusterID); err != nil {
|
|
return DockerInstallProfile{}, err
|
|
}
|
|
tokenHash, err := hashJoinToken(input.InstallToken)
|
|
if err != nil {
|
|
return DockerInstallProfile{}, ErrInvalidJoinToken
|
|
}
|
|
token, err := s.store.GetValidJoinTokenByHash(ctx, input.ClusterID, tokenHash)
|
|
if err != nil {
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return DockerInstallProfile{}, ErrInvalidJoinToken
|
|
}
|
|
return DockerInstallProfile{}, err
|
|
}
|
|
profile, err := dockerInstallProfileFromScope(input, token.Scope)
|
|
if err != nil {
|
|
return DockerInstallProfile{}, err
|
|
}
|
|
profile.ClusterID = input.ClusterID
|
|
profile.JoinToken = input.InstallToken
|
|
return profile, nil
|
|
}
|
|
|
|
func (s *Service) GetWindowsInstallProfile(ctx context.Context, input DockerInstallProfileRequest) (WindowsInstallProfile, error) {
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.InstallToken = strings.TrimSpace(input.InstallToken)
|
|
if input.ClusterID == "" || input.InstallToken == "" {
|
|
return WindowsInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
if err := s.store.ExpireJoinTokens(ctx, input.ClusterID); err != nil {
|
|
return WindowsInstallProfile{}, err
|
|
}
|
|
tokenHash, err := hashJoinToken(input.InstallToken)
|
|
if err != nil {
|
|
return WindowsInstallProfile{}, ErrInvalidJoinToken
|
|
}
|
|
token, err := s.store.GetValidJoinTokenByHash(ctx, input.ClusterID, tokenHash)
|
|
if err != nil {
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return WindowsInstallProfile{}, ErrInvalidJoinToken
|
|
}
|
|
return WindowsInstallProfile{}, err
|
|
}
|
|
profile, err := windowsInstallProfileFromScope(input, token.Scope)
|
|
if err != nil {
|
|
return WindowsInstallProfile{}, err
|
|
}
|
|
profile.ClusterID = input.ClusterID
|
|
profile.JoinToken = input.InstallToken
|
|
return profile, nil
|
|
}
|
|
|
|
func (s *Service) GetLinuxInstallProfile(ctx context.Context, input DockerInstallProfileRequest) (LinuxInstallProfile, error) {
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.InstallToken = strings.TrimSpace(input.InstallToken)
|
|
if input.ClusterID == "" || input.InstallToken == "" {
|
|
return LinuxInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
if err := s.store.ExpireJoinTokens(ctx, input.ClusterID); err != nil {
|
|
return LinuxInstallProfile{}, err
|
|
}
|
|
tokenHash, err := hashJoinToken(input.InstallToken)
|
|
if err != nil {
|
|
return LinuxInstallProfile{}, ErrInvalidJoinToken
|
|
}
|
|
token, err := s.store.GetValidJoinTokenByHash(ctx, input.ClusterID, tokenHash)
|
|
if err != nil {
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return LinuxInstallProfile{}, ErrInvalidJoinToken
|
|
}
|
|
return LinuxInstallProfile{}, err
|
|
}
|
|
profile, err := linuxInstallProfileFromScope(input, token.Scope)
|
|
if err != nil {
|
|
return LinuxInstallProfile{}, err
|
|
}
|
|
profile.ClusterID = input.ClusterID
|
|
profile.JoinToken = input.InstallToken
|
|
return profile, nil
|
|
}
|
|
|
|
func (s *Service) signJoinToken(ctx context.Context, input CreateJoinTokenInput, item NodeJoinToken) (NodeJoinToken, error) {
|
|
authorityKey, err := s.ensureClusterAuthority(ctx, input.ClusterID, &input.ActorUserID)
|
|
if err != nil {
|
|
return NodeJoinToken{}, err
|
|
}
|
|
payload := clusterJoinTokenAuthorityPayload{
|
|
SchemaVersion: clusterJoinTokenAuthoritySchema,
|
|
ClusterID: input.ClusterID,
|
|
TokenID: item.ID,
|
|
Scope: item.Scope,
|
|
ExpiresAt: item.ExpiresAt,
|
|
MaxUses: item.MaxUses,
|
|
CreatedByUserID: item.CreatedByUserID,
|
|
IssuedAt: item.CreatedAt,
|
|
ControlPlaneOnly: true,
|
|
ProductionForwarding: false,
|
|
}
|
|
rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, s.now())
|
|
if err != nil {
|
|
return NodeJoinToken{}, err
|
|
}
|
|
return s.store.SetJoinTokenAuthority(ctx, input.ClusterID, item.ID, rawPayload, signature)
|
|
}
|
|
|
|
func (s *Service) CreateJoinRequest(ctx context.Context, input CreateJoinRequestInput) (NodeJoinRequest, error) {
|
|
if input.ClusterID == "" {
|
|
return NodeJoinRequest{}, ErrInvalidCluster
|
|
}
|
|
if err := s.store.ExpireJoinTokens(ctx, input.ClusterID); err != nil {
|
|
return NodeJoinRequest{}, err
|
|
}
|
|
input.NodeName = strings.TrimSpace(input.NodeName)
|
|
input.NodeFingerprint = strings.TrimSpace(input.NodeFingerprint)
|
|
input.PublicKey = strings.TrimSpace(input.PublicKey)
|
|
if input.NodeName == "" || input.NodeFingerprint == "" || input.PublicKey == "" {
|
|
return NodeJoinRequest{}, ErrInvalidPayload
|
|
}
|
|
input.ReportedCapabilities = defaultJSON(input.ReportedCapabilities, `{}`)
|
|
input.ReportedFacts = defaultJSON(input.ReportedFacts, `{}`)
|
|
input.RequestedRoles = defaultJSON(input.RequestedRoles, `[]`)
|
|
if !json.Valid(input.ReportedCapabilities) || !json.Valid(input.ReportedFacts) || !json.Valid(input.RequestedRoles) {
|
|
return NodeJoinRequest{}, errors.New("reported_capabilities, reported_facts, and requested_roles must be valid json")
|
|
}
|
|
tokenHash, err := hashJoinToken(input.JoinToken)
|
|
if err != nil {
|
|
return NodeJoinRequest{}, ErrInvalidJoinToken
|
|
}
|
|
token, err := s.store.GetValidJoinTokenByHash(ctx, input.ClusterID, tokenHash)
|
|
if err != nil {
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return NodeJoinRequest{}, ErrInvalidJoinToken
|
|
}
|
|
return NodeJoinRequest{}, err
|
|
}
|
|
item, err := s.store.CreateJoinRequest(ctx, input, token.ID)
|
|
if err != nil {
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return NodeJoinRequest{}, ErrInvalidJoinToken
|
|
}
|
|
return NodeJoinRequest{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
EventType: "node_join_request.created",
|
|
TargetType: "node_join_request",
|
|
TargetID: &item.ID,
|
|
Payload: json.RawMessage(`{"source":"node_agent"}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) ListJoinRequests(ctx context.Context, actorUserID, clusterID string) ([]NodeJoinRequest, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListJoinRequests(ctx, clusterID)
|
|
}
|
|
|
|
func (s *Service) GetJoinRequestBootstrap(ctx context.Context, input GetJoinRequestBootstrapInput) (JoinRequestBootstrapResult, error) {
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.JoinRequestID = strings.TrimSpace(input.JoinRequestID)
|
|
input.NodeFingerprint = strings.TrimSpace(input.NodeFingerprint)
|
|
input.PublicKey = strings.TrimSpace(input.PublicKey)
|
|
if input.ClusterID == "" || input.JoinRequestID == "" || input.NodeFingerprint == "" || input.PublicKey == "" {
|
|
return JoinRequestBootstrapResult{}, ErrInvalidJoinRequest
|
|
}
|
|
item, err := s.store.GetJoinRequestForBootstrap(ctx, input)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return JoinRequestBootstrapResult{}, ErrInvalidJoinRequest
|
|
}
|
|
if err != nil {
|
|
return JoinRequestBootstrapResult{}, err
|
|
}
|
|
result := JoinRequestBootstrapResult{Status: item.Status, JoinRequest: item}
|
|
if item.Status != JoinRequestStatusApproved {
|
|
return result, nil
|
|
}
|
|
bootstrap, updated, err := s.bootstrapForApprovedJoinRequest(ctx, item)
|
|
if err != nil {
|
|
return JoinRequestBootstrapResult{}, err
|
|
}
|
|
result.JoinRequest = updated
|
|
result.Bootstrap = &bootstrap
|
|
return result, nil
|
|
}
|
|
|
|
func (s *Service) RevokeJoinToken(ctx context.Context, input RevokeJoinTokenInput) (NodeJoinToken, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return NodeJoinToken{}, err
|
|
}
|
|
item, err := s.store.RevokeJoinToken(ctx, input)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return NodeJoinToken{}, ErrInvalidJoinToken
|
|
}
|
|
if err != nil {
|
|
return NodeJoinToken{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "node_join_token.revoked",
|
|
TargetType: "node_join_token",
|
|
TargetID: &input.TokenID,
|
|
Payload: json.RawMessage(`{}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) ApproveJoinRequest(ctx context.Context, input ApproveJoinRequestInput) (ApprovedJoinRequest, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return ApprovedJoinRequest{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return ApprovedJoinRequest{}, err
|
|
}
|
|
if input.ClusterID == "" || input.JoinRequestID == "" {
|
|
return ApprovedJoinRequest{}, ErrInvalidJoinRequest
|
|
}
|
|
item, err := s.store.ApproveJoinRequest(ctx, input)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return ApprovedJoinRequest{}, ErrInvalidJoinRequest
|
|
}
|
|
if err != nil {
|
|
return ApprovedJoinRequest{}, err
|
|
}
|
|
item, err = s.signApprovedJoinRequest(ctx, input, item)
|
|
if err != nil {
|
|
return ApprovedJoinRequest{}, err
|
|
}
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) signApprovedJoinRequest(ctx context.Context, input ApproveJoinRequestInput, item ApprovedJoinRequest) (ApprovedJoinRequest, error) {
|
|
authorityKey, err := s.ensureClusterAuthority(ctx, input.ClusterID, &input.ActorUserID)
|
|
if err != nil {
|
|
return ApprovedJoinRequest{}, err
|
|
}
|
|
if item.Bootstrap.HeartbeatEndpoint == "" {
|
|
item.Bootstrap.HeartbeatEndpoint = nodeHeartbeatEndpoint(input.ClusterID, item.Bootstrap.NodeID)
|
|
}
|
|
payload := clusterNodeApprovalAuthorityPayload{
|
|
SchemaVersion: clusterNodeApprovalAuthoritySchema,
|
|
ClusterID: input.ClusterID,
|
|
JoinRequestID: item.JoinRequest.ID,
|
|
NodeID: item.Bootstrap.NodeID,
|
|
NodeFingerprint: item.JoinRequest.NodeFingerprint,
|
|
IdentityStatus: item.Bootstrap.IdentityStatus,
|
|
HeartbeatEndpoint: item.Bootstrap.HeartbeatEndpoint,
|
|
ApprovedByUserID: input.ActorUserID,
|
|
IssuedAt: s.now(),
|
|
ControlPlaneOnly: true,
|
|
ProductionForwarding: false,
|
|
}
|
|
rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, s.now())
|
|
if err != nil {
|
|
return ApprovedJoinRequest{}, err
|
|
}
|
|
updated, err := s.store.SetJoinRequestApprovalAuthority(ctx, input.ClusterID, item.JoinRequest.ID, rawPayload, signature)
|
|
if err != nil {
|
|
return ApprovedJoinRequest{}, err
|
|
}
|
|
item.JoinRequest = updated
|
|
item.Bootstrap.ClusterAuthority = authorityDescriptor(authorityKey)
|
|
item.Bootstrap.AuthorityPayload = rawPayload
|
|
item.Bootstrap.AuthoritySignature = &signature
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) bootstrapForApprovedJoinRequest(ctx context.Context, item NodeJoinRequest) (NodeBootstrap, NodeJoinRequest, error) {
|
|
if item.Status != JoinRequestStatusApproved || item.ApprovedNodeID == nil || strings.TrimSpace(*item.ApprovedNodeID) == "" {
|
|
return NodeBootstrap{}, NodeJoinRequest{}, ErrInvalidJoinRequest
|
|
}
|
|
authorityKey, err := s.ensureClusterAuthority(ctx, item.ClusterID, item.ReviewedByUserID)
|
|
if err != nil {
|
|
return NodeBootstrap{}, NodeJoinRequest{}, err
|
|
}
|
|
heartbeatEndpoint := nodeHeartbeatEndpoint(item.ClusterID, *item.ApprovedNodeID)
|
|
identityStatus := NodeRegistrationActive
|
|
if rawMessageEmpty(item.ApprovalPayload) || rawMessageEmpty(item.ApprovalSignature) {
|
|
approvedBy := "system"
|
|
if item.ReviewedByUserID != nil && strings.TrimSpace(*item.ReviewedByUserID) != "" {
|
|
approvedBy = strings.TrimSpace(*item.ReviewedByUserID)
|
|
}
|
|
payload := clusterNodeApprovalAuthorityPayload{
|
|
SchemaVersion: clusterNodeApprovalAuthoritySchema,
|
|
ClusterID: item.ClusterID,
|
|
JoinRequestID: item.ID,
|
|
NodeID: *item.ApprovedNodeID,
|
|
NodeFingerprint: item.NodeFingerprint,
|
|
IdentityStatus: identityStatus,
|
|
HeartbeatEndpoint: heartbeatEndpoint,
|
|
ApprovedByUserID: approvedBy,
|
|
IssuedAt: s.now(),
|
|
ControlPlaneOnly: true,
|
|
ProductionForwarding: false,
|
|
}
|
|
rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, s.now())
|
|
if err != nil {
|
|
return NodeBootstrap{}, NodeJoinRequest{}, err
|
|
}
|
|
item, err = s.store.SetJoinRequestApprovalAuthority(ctx, item.ClusterID, item.ID, rawPayload, signature)
|
|
if err != nil {
|
|
return NodeBootstrap{}, NodeJoinRequest{}, err
|
|
}
|
|
} else {
|
|
var signature ClusterSignature
|
|
if err := json.Unmarshal(item.ApprovalSignature, &signature); err != nil {
|
|
return NodeBootstrap{}, NodeJoinRequest{}, err
|
|
}
|
|
if err := clusterauth.VerifyRaw(authorityKey.PublicKey, item.ApprovalPayload, signature); err != nil {
|
|
return NodeBootstrap{}, NodeJoinRequest{}, err
|
|
}
|
|
}
|
|
var signature ClusterSignature
|
|
if err := json.Unmarshal(item.ApprovalSignature, &signature); err != nil {
|
|
return NodeBootstrap{}, NodeJoinRequest{}, err
|
|
}
|
|
bootstrap := NodeBootstrap{
|
|
NodeID: *item.ApprovedNodeID,
|
|
ClusterID: item.ClusterID,
|
|
IdentityStatus: identityStatus,
|
|
Certificate: map[string]any{
|
|
"status": "pending_issuer_integration",
|
|
},
|
|
HeartbeatEndpoint: heartbeatEndpoint,
|
|
ClusterAuthority: authorityDescriptor(authorityKey),
|
|
AuthorityPayload: item.ApprovalPayload,
|
|
AuthoritySignature: &signature,
|
|
}
|
|
return bootstrap, item, nil
|
|
}
|
|
|
|
// nodeHeartbeatEndpoint returns the relative API path a node posts heartbeats
// to: /api/v1/clusters/{clusterID}/nodes/{nodeID}/heartbeats.
//
// Both IDs are escaped as single path segments, so an ID containing "/", "?"
// or similar characters cannot change the shape of the path. IDs made only of
// characters that need no escaping (such as UUIDs) are emitted unchanged.
func nodeHeartbeatEndpoint(clusterID, nodeID string) string {
	return "/api/v1/clusters/" + url.PathEscape(clusterID) + "/nodes/" + url.PathEscape(nodeID) + "/heartbeats"
}
|
|
|
|
// rawMessageEmpty reports whether raw, ignoring surrounding whitespace, is
// empty, the literal "{}" or the literal "null".
func rawMessageEmpty(raw json.RawMessage) bool {
	switch strings.TrimSpace(string(raw)) {
	case "", "{}", "null":
		return true
	default:
		return false
	}
}
|
|
|
|
func (s *Service) RejectJoinRequest(ctx context.Context, input RejectJoinRequestInput) (NodeJoinRequest, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return NodeJoinRequest{}, err
|
|
}
|
|
input.Reason = strings.TrimSpace(input.Reason)
|
|
if input.Reason == "" {
|
|
input.Reason = "Rejected by platform administrator."
|
|
}
|
|
item, err := s.store.RejectJoinRequest(ctx, input)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return NodeJoinRequest{}, ErrInvalidJoinRequest
|
|
}
|
|
return item, err
|
|
}
|
|
|
|
func (s *Service) AssignNodeRole(ctx context.Context, input AssignNodeRoleInput) (NodeRoleAssignment, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return NodeRoleAssignment{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return NodeRoleAssignment{}, err
|
|
}
|
|
if !isAllowedNodeRole(input.Role) {
|
|
return NodeRoleAssignment{}, ErrInvalidNodeRole
|
|
}
|
|
if input.Status == "" {
|
|
input.Status = "active"
|
|
}
|
|
if input.Status != "active" && input.Status != "disabled" && input.Status != "revoked" {
|
|
return NodeRoleAssignment{}, ErrInvalidPayload
|
|
}
|
|
input.Policy = defaultJSON(input.Policy, `{}`)
|
|
if !json.Valid(input.Policy) {
|
|
return NodeRoleAssignment{}, errors.New("policy must be valid json")
|
|
}
|
|
item, err := s.store.AssignNodeRole(ctx, input)
|
|
if err != nil {
|
|
return NodeRoleAssignment{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "node_role." + input.Status,
|
|
TargetType: "node",
|
|
TargetID: &input.NodeID,
|
|
Payload: json.RawMessage(`{"capability_is_not_permission":true}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) ListNodeRoleAssignments(ctx context.Context, actorUserID, clusterID, nodeID string) ([]NodeRoleAssignment, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListNodeRoleAssignments(ctx, clusterID, nodeID)
|
|
}
|
|
|
|
func (s *Service) AttachExistingNodeToCluster(ctx context.Context, input AttachExistingNodeInput) (ClusterNode, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return ClusterNode{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return ClusterNode{}, err
|
|
}
|
|
if input.ClusterID == "" || input.NodeID == "" {
|
|
return ClusterNode{}, ErrInvalidPayload
|
|
}
|
|
for _, role := range input.Roles {
|
|
if !isAllowedNodeRole(role) {
|
|
return ClusterNode{}, ErrInvalidNodeRole
|
|
}
|
|
}
|
|
item, err := s.store.AttachExistingNodeToCluster(ctx, input)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return ClusterNode{}, ErrInvalidPayload
|
|
}
|
|
return item, err
|
|
}
|
|
|
|
func (s *Service) AssignNodeToGroup(ctx context.Context, input AssignNodeGroupInput) (ClusterNode, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return ClusterNode{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return ClusterNode{}, err
|
|
}
|
|
if input.ClusterID == "" || input.NodeID == "" {
|
|
return ClusterNode{}, ErrInvalidPayload
|
|
}
|
|
if input.GroupID != nil {
|
|
trimmed := strings.TrimSpace(*input.GroupID)
|
|
if trimmed == "" {
|
|
input.GroupID = nil
|
|
} else {
|
|
input.GroupID = &trimmed
|
|
}
|
|
}
|
|
item, err := s.store.AssignNodeToGroup(ctx, input)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return ClusterNode{}, ErrInvalidPayload
|
|
}
|
|
return item, err
|
|
}
|
|
|
|
func (s *Service) RevokeNodeIdentity(ctx context.Context, input RevokeNodeIdentityInput) error {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return err
|
|
}
|
|
input.Reason = strings.TrimSpace(input.Reason)
|
|
if input.Reason == "" {
|
|
input.Reason = "revoked by platform administrator"
|
|
}
|
|
if err := s.store.RevokeNodeIdentity(ctx, input); err != nil {
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return ErrInvalidPayload
|
|
}
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s *Service) DisableClusterMembership(ctx context.Context, input DisableMembershipInput) error {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return err
|
|
}
|
|
input.Reason = strings.TrimSpace(input.Reason)
|
|
if input.Reason == "" {
|
|
input.Reason = "disabled by platform administrator"
|
|
}
|
|
if err := s.store.DisableClusterMembership(ctx, input); err != nil {
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return ErrInvalidPayload
|
|
}
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s *Service) DeleteClusterNode(ctx context.Context, input DeleteClusterNodeInput) error {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return err
|
|
}
|
|
input.Reason = strings.TrimSpace(input.Reason)
|
|
if input.ClusterID == "" || input.NodeID == "" {
|
|
return ErrInvalidPayload
|
|
}
|
|
if input.Reason == "" {
|
|
input.Reason = "deleted by platform administrator"
|
|
}
|
|
if err := s.store.DeleteClusterNode(ctx, input); err != nil {
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return ErrInvalidPayload
|
|
}
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s *Service) RecordHeartbeat(ctx context.Context, input RecordHeartbeatInput) (NodeHeartbeat, error) {
|
|
if input.ClusterID == "" || input.NodeID == "" {
|
|
return NodeHeartbeat{}, ErrInvalidPayload
|
|
}
|
|
if input.HealthStatus == "" {
|
|
input.HealthStatus = "unknown"
|
|
}
|
|
input.Capabilities = defaultJSON(input.Capabilities, `{}`)
|
|
input.ServiceStates = defaultJSON(input.ServiceStates, `{}`)
|
|
input.Metadata = defaultJSON(input.Metadata, `{}`)
|
|
heartbeat, err := s.store.RecordHeartbeat(ctx, input)
|
|
if err != nil {
|
|
return NodeHeartbeat{}, err
|
|
}
|
|
_ = s.recordFabricServiceChannelRouteFeedback(ctx, heartbeat)
|
|
_ = s.autoWarmFabricServiceChannelRouteRebuildSnapshotsAfterHeartbeat(ctx, heartbeat)
|
|
return heartbeat, nil
|
|
}
|
|
|
|
func (s *Service) ListNodeHeartbeats(ctx context.Context, actorUserID, clusterID, nodeID string, limit int) ([]NodeHeartbeat, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, limit)
|
|
}
|
|
|
|
func (s *Service) ListFabricServiceChannelRouteFeedback(ctx context.Context, actorUserID string, input ListFabricServiceChannelRouteFeedbackInput) ([]FabricServiceChannelRouteFeedbackObservation, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.ReporterNodeID = strings.TrimSpace(input.ReporterNodeID)
|
|
input.RouteID = strings.TrimSpace(input.RouteID)
|
|
input.ServiceClass = strings.TrimSpace(input.ServiceClass)
|
|
input.FeedbackStatus = strings.TrimSpace(input.FeedbackStatus)
|
|
if input.ClusterID == "" {
|
|
return nil, ErrInvalidPayload
|
|
}
|
|
if input.Now.IsZero() {
|
|
input.Now = s.now()
|
|
}
|
|
observations, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, input)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
policy := s.fabricServiceChannelRecoveryPolicy(ctx, input.ClusterID)
|
|
intents, err := s.store.ListRouteIntents(ctx, input.ClusterID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
report := serviceChannelRouteFeedbackReportWithPolicyAndProvenance(observations, input.Now, policy, fabricServiceChannelRouteProvenanceFromIntents(intents))
|
|
if report == nil {
|
|
return nil, nil
|
|
}
|
|
return report.Observations, nil
|
|
}
|
|
|
|
func (s *Service) ListFabricServiceChannelRouteRebuildAttempts(ctx context.Context, actorUserID string, input ListFabricServiceChannelRouteRebuildAttemptsInput) ([]FabricServiceChannelRouteRebuildAttempt, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.ReporterNodeID = strings.TrimSpace(input.ReporterNodeID)
|
|
input.RouteID = strings.TrimSpace(input.RouteID)
|
|
input.ReplacementRouteID = strings.TrimSpace(input.ReplacementRouteID)
|
|
input.ServiceClass = strings.TrimSpace(input.ServiceClass)
|
|
input.RebuildStatus = strings.TrimSpace(input.RebuildStatus)
|
|
input.RebuildRequestID = strings.TrimSpace(input.RebuildRequestID)
|
|
input.Generation = strings.TrimSpace(input.Generation)
|
|
input.FeedbackSource = strings.TrimSpace(input.FeedbackSource)
|
|
input.FeedbackChannelID = strings.TrimSpace(input.FeedbackChannelID)
|
|
input.FeedbackViolationStatus = strings.TrimSpace(input.FeedbackViolationStatus)
|
|
input.EnrichmentMode = strings.TrimSpace(input.EnrichmentMode)
|
|
if input.ClusterID == "" {
|
|
return nil, ErrInvalidPayload
|
|
}
|
|
if input.Offset < 0 {
|
|
input.Offset = 0
|
|
}
|
|
if input.EnrichmentMode == "" {
|
|
input.EnrichmentMode = "summary"
|
|
}
|
|
items, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, input)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if input.EnrichmentMode != "deep" {
|
|
return stripFabricServiceChannelRouteRebuildCorrelation(items), nil
|
|
}
|
|
return s.enrichFabricServiceChannelRouteRebuildAttempts(ctx, input.ClusterID, items, s.now()), nil
|
|
}
|
|
|
|
func (s *Service) GetFabricServiceChannelRouteRebuildHealthSummary(ctx context.Context, actorUserID string, input GetFabricServiceChannelRouteRebuildHealthSummaryInput) (FabricServiceChannelRouteRebuildHealthSummary, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return FabricServiceChannelRouteRebuildHealthSummary{}, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
if input.ClusterID == "" {
|
|
return FabricServiceChannelRouteRebuildHealthSummary{}, ErrInvalidPayload
|
|
}
|
|
if input.Limit <= 0 || input.Limit > 500 {
|
|
input.Limit = 200
|
|
}
|
|
now := s.now()
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
items, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, ListFabricServiceChannelRouteRebuildAttemptsInput{
|
|
ClusterID: input.ClusterID,
|
|
Limit: input.Limit,
|
|
UseCachedSnapshot: true,
|
|
})
|
|
if err != nil {
|
|
return FabricServiceChannelRouteRebuildHealthSummary{}, err
|
|
}
|
|
items = s.enrichFabricServiceChannelRouteRebuildAttempts(ctx, input.ClusterID, items, now)
|
|
silences, err := s.store.ListFabricServiceChannelRouteRebuildAlertSilences(ctx, input.ClusterID, now)
|
|
if err != nil {
|
|
return FabricServiceChannelRouteRebuildHealthSummary{}, err
|
|
}
|
|
items = applyFabricServiceChannelRouteRebuildAlertSilences(items, silences)
|
|
summary := FabricServiceChannelRouteRebuildHealthSummary{
|
|
ClusterID: input.ClusterID,
|
|
ObservedAt: now.UTC(),
|
|
WindowLimit: input.Limit,
|
|
TotalAttempts: len(items),
|
|
CountsByGuardStatus: map[string]int{},
|
|
CountsByGuardSeverity: map[string]int{},
|
|
}
|
|
affectedNodes := map[string]struct{}{}
|
|
affectedRoutes := map[string]struct{}{}
|
|
feedbackBreakdowns := map[string]*fabricServiceChannelRebuildFeedbackBreakdownAccumulator{}
|
|
for _, item := range items {
|
|
severity := firstNonEmptyString(item.GuardSeverity, "unknown")
|
|
status := firstNonEmptyString(item.GuardStatus, "unknown")
|
|
summary.CountsByGuardSeverity[severity]++
|
|
summary.CountsByGuardStatus[status]++
|
|
switch severity {
|
|
case "good":
|
|
summary.GoodCount++
|
|
case "warn":
|
|
summary.WarnCount++
|
|
if !item.AlertSilenced {
|
|
summary.ActiveWarnCount++
|
|
}
|
|
case "bad":
|
|
summary.BadCount++
|
|
if !item.AlertSilenced {
|
|
summary.ActiveBadCount++
|
|
}
|
|
default:
|
|
summary.UnknownCount++
|
|
}
|
|
if item.AlertSilenced {
|
|
summary.SilencedCount++
|
|
}
|
|
if item.AlertResurfaced {
|
|
summary.ResurfacedCount++
|
|
}
|
|
if item.RebuildStatus == "applied" {
|
|
summary.AppliedCount++
|
|
} else if item.RebuildStatus != "" {
|
|
summary.PendingCount++
|
|
}
|
|
if (severity == "bad" || severity == "warn") && !item.AlertSilenced {
|
|
if item.ReporterNodeID != "" {
|
|
affectedNodes[item.ReporterNodeID] = struct{}{}
|
|
}
|
|
if item.RouteID != "" {
|
|
affectedRoutes[item.RouteID] = struct{}{}
|
|
}
|
|
}
|
|
if severity == "bad" && !item.AlertSilenced && len(summary.MostRecentBadAttempts) < 10 {
|
|
summary.MostRecentBadAttempts = append(summary.MostRecentBadAttempts, item)
|
|
}
|
|
if item.AlertResurfaced && len(summary.ResurfacedAttempts) < 10 {
|
|
summary.ResurfacedAttempts = append(summary.ResurfacedAttempts, item)
|
|
}
|
|
addFabricServiceChannelRebuildFeedbackBreakdown(feedbackBreakdowns, item, severity)
|
|
}
|
|
if accessTelemetry, err := s.GetFabricServiceChannelAccessTelemetry(ctx, actorUserID, GetFabricServiceChannelAccessTelemetryInput{
|
|
ClusterID: input.ClusterID,
|
|
Limit: input.Limit,
|
|
Now: now,
|
|
}); err == nil {
|
|
summary.AccessRouteDecisionCount = accessTelemetry.RouteDecisionChannelCount
|
|
summary.AccessReplacementCount = accessTelemetry.ReplacementDecisionCount
|
|
summary.AccessAppliedCount = accessTelemetry.AppliedRebuildDecisionCount
|
|
summary.AccessRecoveryCount = accessTelemetry.RecoveryDecisionCount
|
|
summary.AccessNoSafeCount = accessTelemetry.NoSafeRecoveryDecisionCount
|
|
accessIncidents := append(
|
|
fabricServiceChannelAccessDecisionIncidents(input.ClusterID, accessTelemetry),
|
|
fabricServiceChannelDataPlaneContractIncidents(input.ClusterID, accessTelemetry)...,
|
|
)
|
|
for _, incident := range applyFabricServiceChannelAccessDecisionIncidentSilences(accessIncidents, silences) {
|
|
summary.CountsByGuardStatus[incident.GuardStatus]++
|
|
summary.CountsByGuardSeverity[incident.GuardSeverity]++
|
|
if incident.AlertSilenced {
|
|
summary.SilencedCount++
|
|
}
|
|
if incident.AlertResurfaced {
|
|
summary.ResurfacedCount++
|
|
}
|
|
switch incident.GuardSeverity {
|
|
case "good":
|
|
summary.GoodCount++
|
|
case "warn":
|
|
summary.WarnCount++
|
|
if !incident.AlertSilenced {
|
|
summary.ActiveWarnCount++
|
|
}
|
|
case "bad":
|
|
summary.BadCount++
|
|
if !incident.AlertSilenced {
|
|
summary.ActiveBadCount++
|
|
}
|
|
default:
|
|
summary.UnknownCount++
|
|
}
|
|
if (incident.GuardSeverity == "bad" || incident.GuardSeverity == "warn") && !incident.AlertSilenced {
|
|
if incident.ReporterNodeID != "" {
|
|
affectedNodes[incident.ReporterNodeID] = struct{}{}
|
|
}
|
|
if incident.RouteID != "" {
|
|
affectedRoutes[incident.RouteID] = struct{}{}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
summary.AffectedReporterNodeIDs = sortedStringSetKeys(affectedNodes)
|
|
summary.AffectedRouteIDs = sortedStringSetKeys(affectedRoutes)
|
|
summary.FeedbackBreakdowns = sortedFabricServiceChannelRebuildFeedbackBreakdowns(feedbackBreakdowns)
|
|
summary.RecommendedOperatorAction = fabricServiceChannelRebuildRecommendedAction(summary)
|
|
return summary, nil
|
|
}
|
|
|
|
type fabricServiceChannelRebuildFeedbackBreakdownAccumulator struct {
|
|
item FabricServiceChannelRouteRebuildFeedbackHealthBreakdown
|
|
nodes map[string]struct{}
|
|
routes map[string]struct{}
|
|
}
|
|
|
|
func addFabricServiceChannelRebuildFeedbackBreakdown(out map[string]*fabricServiceChannelRebuildFeedbackBreakdownAccumulator, attempt FabricServiceChannelRouteRebuildAttempt, severity string) {
|
|
payload := jsonObject(attempt.Payload)
|
|
source := firstNonEmptyString(attempt.FeedbackSource, jsonString(payload, "feedback_source"))
|
|
channelID := firstNonEmptyString(attempt.FeedbackChannelID, jsonString(payload, "feedback_channel_id"))
|
|
violationStatus := firstNonEmptyString(attempt.FeedbackViolationStatus, jsonString(payload, "feedback_violation_status"))
|
|
if source == "" && channelID == "" && violationStatus == "" {
|
|
return
|
|
}
|
|
key := source + "\x00" + channelID + "\x00" + violationStatus
|
|
acc := out[key]
|
|
if acc == nil {
|
|
acc = &fabricServiceChannelRebuildFeedbackBreakdownAccumulator{
|
|
item: FabricServiceChannelRouteRebuildFeedbackHealthBreakdown{
|
|
FeedbackSource: source,
|
|
FeedbackChannelID: channelID,
|
|
FeedbackViolationStatus: violationStatus,
|
|
},
|
|
nodes: map[string]struct{}{},
|
|
routes: map[string]struct{}{},
|
|
}
|
|
out[key] = acc
|
|
}
|
|
acc.item.TotalCount++
|
|
switch severity {
|
|
case "good":
|
|
acc.item.GoodCount++
|
|
case "warn":
|
|
acc.item.WarnCount++
|
|
if !attempt.AlertSilenced {
|
|
acc.item.ActiveWarnCount++
|
|
}
|
|
case "bad":
|
|
acc.item.BadCount++
|
|
if !attempt.AlertSilenced {
|
|
acc.item.ActiveBadCount++
|
|
}
|
|
default:
|
|
acc.item.UnknownCount++
|
|
}
|
|
if attempt.AlertSilenced {
|
|
acc.item.SilencedCount++
|
|
}
|
|
observedAt := time.Time{}
|
|
if attempt.FeedbackObservedAt != nil {
|
|
observedAt = attempt.FeedbackObservedAt.UTC()
|
|
} else if value := strings.TrimSpace(jsonString(payload, "feedback_observed_at")); value != "" {
|
|
if parsed, err := time.Parse(time.RFC3339Nano, value); err == nil {
|
|
observedAt = parsed.UTC()
|
|
}
|
|
}
|
|
if observedAt.IsZero() {
|
|
observedAt = attempt.UpdatedAt.UTC()
|
|
}
|
|
if observedAt.After(acc.item.LatestObservedAt) {
|
|
acc.item.LatestObservedAt = observedAt
|
|
}
|
|
if attempt.ReporterNodeID != "" {
|
|
acc.nodes[attempt.ReporterNodeID] = struct{}{}
|
|
}
|
|
if attempt.RouteID != "" {
|
|
acc.routes[attempt.RouteID] = struct{}{}
|
|
}
|
|
}
|
|
|
|
func sortedFabricServiceChannelRebuildFeedbackBreakdowns(input map[string]*fabricServiceChannelRebuildFeedbackBreakdownAccumulator) []FabricServiceChannelRouteRebuildFeedbackHealthBreakdown {
|
|
out := make([]FabricServiceChannelRouteRebuildFeedbackHealthBreakdown, 0, len(input))
|
|
for _, acc := range input {
|
|
item := acc.item
|
|
item.AffectedReporterNodeIDs = sortedStringSetKeys(acc.nodes)
|
|
item.AffectedRouteIDs = sortedStringSetKeys(acc.routes)
|
|
out = append(out, item)
|
|
}
|
|
sort.SliceStable(out, func(i, j int) bool {
|
|
leftActive := out[i].ActiveBadCount*100000 + out[i].ActiveWarnCount*1000 + out[i].TotalCount
|
|
rightActive := out[j].ActiveBadCount*100000 + out[j].ActiveWarnCount*1000 + out[j].TotalCount
|
|
if leftActive != rightActive {
|
|
return leftActive > rightActive
|
|
}
|
|
if !out[i].LatestObservedAt.Equal(out[j].LatestObservedAt) {
|
|
return out[i].LatestObservedAt.After(out[j].LatestObservedAt)
|
|
}
|
|
left := out[i].FeedbackSource + out[i].FeedbackChannelID + out[i].FeedbackViolationStatus
|
|
right := out[j].FeedbackSource + out[j].FeedbackChannelID + out[j].FeedbackViolationStatus
|
|
return left < right
|
|
})
|
|
if len(out) > 100 {
|
|
out = out[:100]
|
|
}
|
|
return out
|
|
}
|
|
|
|
func (s *Service) GetFabricServiceChannelReadiness(ctx context.Context, actorUserID string, input GetFabricServiceChannelReadinessInput) (FabricServiceChannelReadiness, error) {
|
|
if input.Limit <= 0 || input.Limit > 5 {
|
|
input.Limit = 5
|
|
}
|
|
summary, err := s.GetFabricServiceChannelRouteRebuildHealthSummary(ctx, actorUserID, GetFabricServiceChannelRouteRebuildHealthSummaryInput{
|
|
ClusterID: input.ClusterID,
|
|
Limit: input.Limit,
|
|
})
|
|
if err != nil {
|
|
return FabricServiceChannelReadiness{}, err
|
|
}
|
|
return fabricServiceChannelReadinessFromRebuildHealth(summary), nil
|
|
}
|
|
|
|
func (s *Service) GetFabricServiceChannelSchemaStatus(ctx context.Context, actorUserID string, input GetFabricServiceChannelSchemaStatusInput) (FabricServiceChannelSchemaStatus, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return FabricServiceChannelSchemaStatus{}, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
if input.ClusterID == "" {
|
|
return FabricServiceChannelSchemaStatus{}, ErrInvalidPayload
|
|
}
|
|
return s.store.GetFabricServiceChannelSchemaStatus(ctx, input)
|
|
}
|
|
|
|
func (s *Service) GetFabricServiceChannelRebuildSnapshotMaintenanceHealth(ctx context.Context, actorUserID string, input GetFabricServiceChannelRebuildSnapshotMaintenanceHealthInput) (FabricServiceChannelRebuildSnapshotMaintenanceHealth, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return FabricServiceChannelRebuildSnapshotMaintenanceHealth{}, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
if input.ClusterID == "" {
|
|
return FabricServiceChannelRebuildSnapshotMaintenanceHealth{}, ErrInvalidPayload
|
|
}
|
|
if input.Limit <= 0 {
|
|
input.Limit = 50
|
|
}
|
|
if input.Limit > 100 {
|
|
input.Limit = 100
|
|
}
|
|
if input.MinAgeSeconds <= 0 {
|
|
input.MinAgeSeconds = 60
|
|
}
|
|
if input.MinAgeSeconds > 3600 {
|
|
input.MinAgeSeconds = 3600
|
|
}
|
|
if input.HeartbeatThreshold <= 0 {
|
|
input.HeartbeatThreshold = 2
|
|
}
|
|
if input.HeartbeatThreshold > 10 {
|
|
input.HeartbeatThreshold = 10
|
|
}
|
|
now := s.now()
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
out := FabricServiceChannelRebuildSnapshotMaintenanceHealth{
|
|
ClusterID: input.ClusterID,
|
|
ObservedAt: now.UTC(),
|
|
Status: "ready",
|
|
Reason: "snapshot_maintenance_ready",
|
|
WindowLimit: input.Limit,
|
|
MinAgeSeconds: input.MinAgeSeconds,
|
|
HeartbeatThreshold: input.HeartbeatThreshold,
|
|
}
|
|
attempts, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, ListFabricServiceChannelRouteRebuildAttemptsInput{
|
|
ClusterID: input.ClusterID,
|
|
Limit: input.Limit,
|
|
})
|
|
if err != nil {
|
|
return FabricServiceChannelRebuildSnapshotMaintenanceHealth{}, err
|
|
}
|
|
heartbeatsByNode := map[string][]NodeHeartbeat{}
|
|
nodes := map[string]*FabricServiceChannelRebuildSnapshotNodeHealth{}
|
|
nodeHealth := func(nodeID string) *FabricServiceChannelRebuildSnapshotNodeHealth {
|
|
nodeID = strings.TrimSpace(nodeID)
|
|
if nodeID == "" {
|
|
nodeID = "unknown"
|
|
}
|
|
if item, ok := nodes[nodeID]; ok {
|
|
return item
|
|
}
|
|
item := &FabricServiceChannelRebuildSnapshotNodeHealth{NodeID: nodeID}
|
|
nodes[nodeID] = item
|
|
return item
|
|
}
|
|
for _, attempt := range attempts {
|
|
out.RecentAttemptCount++
|
|
node := nodeHealth(attempt.ReporterNodeID)
|
|
node.RecentAttemptCount++
|
|
if fabricServiceChannelRouteRebuildHasCorrelationSnapshot(attempt) {
|
|
out.ValidSnapshotCount++
|
|
node.ValidSnapshotCount++
|
|
continue
|
|
}
|
|
out.MissingSnapshotCount++
|
|
node.MissingSnapshotCount++
|
|
ageSeconds := int64(now.Sub(attempt.UpdatedAt).Seconds())
|
|
if ageSeconds < input.MinAgeSeconds {
|
|
continue
|
|
}
|
|
reporterNodeID := strings.TrimSpace(attempt.ReporterNodeID)
|
|
if reporterNodeID == "" {
|
|
continue
|
|
}
|
|
heartbeats, ok := heartbeatsByNode[reporterNodeID]
|
|
if !ok {
|
|
heartbeats, err = s.store.ListNodeHeartbeats(ctx, input.ClusterID, reporterNodeID, input.HeartbeatThreshold+5)
|
|
if err != nil {
|
|
heartbeats = nil
|
|
}
|
|
heartbeatsByNode[reporterNodeID] = heartbeats
|
|
}
|
|
heartbeatAfterAttemptCount := 0
|
|
for _, heartbeat := range heartbeats {
|
|
observedAt := heartbeat.ObservedAt
|
|
if node.LastHeartbeatAt == nil || observedAt.After(*node.LastHeartbeatAt) {
|
|
value := observedAt
|
|
node.LastHeartbeatAt = &value
|
|
}
|
|
if observedAt.After(attempt.UpdatedAt) || observedAt.Equal(attempt.UpdatedAt) {
|
|
heartbeatAfterAttemptCount++
|
|
}
|
|
}
|
|
if heartbeatAfterAttemptCount > node.HeartbeatAfterAttemptCount {
|
|
node.HeartbeatAfterAttemptCount = heartbeatAfterAttemptCount
|
|
}
|
|
if heartbeatAfterAttemptCount >= input.HeartbeatThreshold {
|
|
out.OverdueMissingSnapshotCount++
|
|
node.OverdueMissingSnapshotCount++
|
|
if len(out.OverdueMissingSnapshotAttempts) < 10 {
|
|
out.OverdueMissingSnapshotAttempts = append(out.OverdueMissingSnapshotAttempts, attempt)
|
|
}
|
|
}
|
|
}
|
|
events, err := s.store.ListAuditEvents(ctx, ListAuditEventsInput{
|
|
ClusterID: input.ClusterID,
|
|
EventTypes: []string{"fabric.service_channel_rebuild_snapshot.auto_warmup"},
|
|
Limit: 100,
|
|
})
|
|
if err != nil {
|
|
return FabricServiceChannelRebuildSnapshotMaintenanceHealth{}, err
|
|
}
|
|
for _, event := range events {
|
|
if event.EventType != "fabric.service_channel_rebuild_snapshot.auto_warmup" {
|
|
continue
|
|
}
|
|
payload := jsonObject(event.Payload)
|
|
nodeID := jsonString(payload, "reporter_node_id")
|
|
node := nodeHealth(nodeID)
|
|
out.AutoWarmupEventCount++
|
|
out.AutoWarmupWarmedCount += jsonInt(payload, "warmed_count")
|
|
out.AutoWarmupAlreadyFreshCount += jsonInt(payload, "already_fresh_count")
|
|
out.AutoWarmupErrorCount += jsonInt(payload, "error_count")
|
|
node.AutoWarmupEventCount++
|
|
node.AutoWarmupWarmedCount += jsonInt(payload, "warmed_count")
|
|
node.AutoWarmupErrorCount += jsonInt(payload, "error_count")
|
|
createdAt := event.CreatedAt
|
|
if out.LatestAutoWarmupAt == nil || createdAt.After(*out.LatestAutoWarmupAt) {
|
|
value := createdAt
|
|
out.LatestAutoWarmupAt = &value
|
|
}
|
|
if node.LatestAutoWarmupAt == nil || createdAt.After(*node.LatestAutoWarmupAt) {
|
|
value := createdAt
|
|
node.LatestAutoWarmupAt = &value
|
|
}
|
|
}
|
|
out.Nodes = make([]FabricServiceChannelRebuildSnapshotNodeHealth, 0, len(nodes))
|
|
for _, item := range nodes {
|
|
out.Nodes = append(out.Nodes, *item)
|
|
}
|
|
sort.Slice(out.Nodes, func(i, j int) bool {
|
|
if out.Nodes[i].OverdueMissingSnapshotCount != out.Nodes[j].OverdueMissingSnapshotCount {
|
|
return out.Nodes[i].OverdueMissingSnapshotCount > out.Nodes[j].OverdueMissingSnapshotCount
|
|
}
|
|
if out.Nodes[i].MissingSnapshotCount != out.Nodes[j].MissingSnapshotCount {
|
|
return out.Nodes[i].MissingSnapshotCount > out.Nodes[j].MissingSnapshotCount
|
|
}
|
|
return out.Nodes[i].NodeID < out.Nodes[j].NodeID
|
|
})
|
|
if out.AutoWarmupErrorCount > 0 {
|
|
out.Status = "degraded"
|
|
out.Reason = "auto_warmup_errors_seen"
|
|
out.RecommendedOperatorAction = "Check backend logs and heartbeat metadata for nodes with auto-warmup errors."
|
|
}
|
|
if out.OverdueMissingSnapshotCount > 0 {
|
|
out.Status = "degraded"
|
|
out.Reason = "snapshot_warmup_overdue"
|
|
out.RecommendedOperatorAction = "Run warm snapshots or inspect reporter nodes whose heartbeat evidence is not producing rebuild snapshots."
|
|
}
|
|
if out.MissingSnapshotCount > 0 && out.OverdueMissingSnapshotCount == 0 && out.RecommendedOperatorAction == "" {
|
|
out.RecommendedOperatorAction = "Recent attempts are still waiting for runtime heartbeat evidence."
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
func (s *Service) WarmupFabricServiceChannelRebuildSnapshots(ctx context.Context, input WarmupFabricServiceChannelRebuildSnapshotsInput) (FabricServiceChannelRebuildSnapshotWarmup, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return FabricServiceChannelRebuildSnapshotWarmup{}, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
if input.ClusterID == "" {
|
|
return FabricServiceChannelRebuildSnapshotWarmup{}, ErrInvalidPayload
|
|
}
|
|
if input.Limit <= 0 || input.Limit > 50 {
|
|
input.Limit = 10
|
|
}
|
|
if input.StaleAfterSeconds <= 0 || input.StaleAfterSeconds > int64((24*time.Hour).Seconds()) {
|
|
input.StaleAfterSeconds = 60
|
|
}
|
|
now := input.Now
|
|
if now.IsZero() {
|
|
now = s.now()
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
result := FabricServiceChannelRebuildSnapshotWarmup{
|
|
ClusterID: input.ClusterID,
|
|
ObservedAt: now.UTC(),
|
|
WindowLimit: input.Limit,
|
|
StaleAfterSeconds: input.StaleAfterSeconds,
|
|
Status: "ready",
|
|
Reason: "snapshots_warmed",
|
|
}
|
|
items, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, ListFabricServiceChannelRouteRebuildAttemptsInput{
|
|
ClusterID: input.ClusterID,
|
|
Limit: input.Limit,
|
|
})
|
|
if err != nil {
|
|
return FabricServiceChannelRebuildSnapshotWarmup{}, err
|
|
}
|
|
result.ScannedCount = len(items)
|
|
heartbeatsByNode := map[string][]NodeHeartbeat{}
|
|
staleAfter := time.Duration(input.StaleAfterSeconds) * time.Second
|
|
for _, item := range items {
|
|
if !fabricServiceChannelRouteRebuildHasCorrelationSnapshot(item) {
|
|
result.MissingSnapshotCount++
|
|
} else if fabricServiceChannelRouteRebuildSnapshotIsStale(item, now, staleAfter) {
|
|
result.StaleSnapshotCount++
|
|
result.DeferredStaleCount++
|
|
continue
|
|
} else {
|
|
result.AlreadyFreshCount++
|
|
continue
|
|
}
|
|
nodeID := strings.TrimSpace(item.ReporterNodeID)
|
|
if nodeID == "" {
|
|
result.ErrorCount++
|
|
continue
|
|
}
|
|
if _, ok := heartbeatsByNode[nodeID]; !ok {
|
|
heartbeats, err := s.store.ListNodeHeartbeats(ctx, input.ClusterID, nodeID, 120)
|
|
if err != nil {
|
|
result.ErrorCount++
|
|
heartbeats = nil
|
|
}
|
|
heartbeatsByNode[nodeID] = heartbeats
|
|
}
|
|
item = enrichFabricServiceChannelRouteRebuildAttempt(item, heartbeatsByNode[nodeID], now)
|
|
item.CorrelationSnapshotAt = &now
|
|
if err := s.store.UpdateFabricServiceChannelRouteRebuildCorrelationSnapshot(ctx, fabricServiceChannelRouteRebuildCorrelationSnapshotInput(item, now)); err != nil {
|
|
result.ErrorCount++
|
|
continue
|
|
}
|
|
result.WarmedCount++
|
|
}
|
|
if result.ErrorCount > 0 {
|
|
result.Status = "degraded"
|
|
result.Reason = "snapshot_warmup_partial"
|
|
result.RecommendedOperatorAction = "Check node heartbeat history and backend logs for rebuild snapshot warmup failures."
|
|
} else if result.DeferredStaleCount > 0 {
|
|
result.Status = "ready"
|
|
result.Reason = "missing_snapshots_warmed_stale_deferred"
|
|
result.RecommendedOperatorAction = "Stale snapshots were detected and left cached; age-sensitive guard state is recomputed on read."
|
|
}
|
|
return result, nil
|
|
}
|
|
|
|
func (s *Service) ListFabricServiceChannelRouteRebuildIncidents(ctx context.Context, actorUserID string, input ListFabricServiceChannelRouteRebuildIncidentsInput) ([]FabricServiceChannelRouteRebuildIncident, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
if input.ClusterID == "" {
|
|
return nil, ErrInvalidPayload
|
|
}
|
|
if input.Limit <= 0 || input.Limit > 5 {
|
|
input.Limit = 5
|
|
}
|
|
now := s.now()
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
items, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, ListFabricServiceChannelRouteRebuildAttemptsInput{
|
|
ClusterID: input.ClusterID,
|
|
Limit: input.Limit,
|
|
UseCachedSnapshot: true,
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
items = s.enrichFabricServiceChannelRouteRebuildAttempts(ctx, input.ClusterID, items, now)
|
|
silences, err := s.store.ListFabricServiceChannelRouteRebuildAlertSilences(ctx, input.ClusterID, now)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
items = applyFabricServiceChannelRouteRebuildAlertSilences(items, silences)
|
|
incidents := fabricServiceChannelRouteRebuildIncidentsFromAttempts(input.ClusterID, items)
|
|
if accessTelemetry, err := s.GetFabricServiceChannelAccessTelemetry(ctx, actorUserID, GetFabricServiceChannelAccessTelemetryInput{
|
|
ClusterID: input.ClusterID,
|
|
Limit: input.Limit,
|
|
Now: now,
|
|
}); err == nil {
|
|
accessIncidents := append(
|
|
fabricServiceChannelAccessDecisionIncidents(input.ClusterID, accessTelemetry),
|
|
fabricServiceChannelDataPlaneContractIncidents(input.ClusterID, accessTelemetry)...,
|
|
)
|
|
incidents = append(incidents, applyFabricServiceChannelAccessDecisionIncidentSilences(accessIncidents, silences)...)
|
|
fabricServiceChannelSortRouteRebuildIncidents(incidents)
|
|
}
|
|
if len(incidents) > input.Limit {
|
|
incidents = incidents[:input.Limit]
|
|
}
|
|
return incidents, nil
|
|
}
|
|
|
|
func (s *Service) RecordFabricServiceChannelRouteRebuildInvestigation(ctx context.Context, input RecordFabricServiceChannelRouteRebuildInvestigationInput) error {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.ReporterNodeID = strings.TrimSpace(input.ReporterNodeID)
|
|
input.RouteID = strings.TrimSpace(input.RouteID)
|
|
input.ServiceClass = strings.TrimSpace(input.ServiceClass)
|
|
input.Generation = strings.TrimSpace(input.Generation)
|
|
input.GuardStatus = strings.TrimSpace(input.GuardStatus)
|
|
input.IncidentID = strings.TrimSpace(input.IncidentID)
|
|
input.FeedbackSource = strings.TrimSpace(input.FeedbackSource)
|
|
input.FeedbackChannelID = strings.TrimSpace(input.FeedbackChannelID)
|
|
input.FeedbackViolationStatus = strings.TrimSpace(input.FeedbackViolationStatus)
|
|
input.DrilldownSource = strings.TrimSpace(input.DrilldownSource)
|
|
input.Reason = strings.TrimSpace(input.Reason)
|
|
if input.ClusterID == "" || (input.ReporterNodeID == "" && input.RouteID == "" && input.FeedbackSource == "" && input.FeedbackChannelID == "" && input.FeedbackViolationStatus == "") {
|
|
return ErrInvalidPayload
|
|
}
|
|
now := input.Now
|
|
if now.IsZero() {
|
|
now = s.now()
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
eventType := "fabric.service_channel_rebuild_incident.investigation_opened"
|
|
targetType := "fabric_service_channel_route_rebuild_incident"
|
|
targetIDValue := firstNonEmptyString(input.RouteID, input.FeedbackChannelID, input.FeedbackViolationStatus, input.FeedbackSource, input.ReporterNodeID)
|
|
if input.DrilldownSource == "rebuild_health_feedback_breakdown" || input.FeedbackSource != "" || input.FeedbackChannelID != "" || input.FeedbackViolationStatus != "" {
|
|
eventType = "fabric.service_channel_rebuild_feedback_breakdown.investigation_opened"
|
|
targetType = "fabric_service_channel_rebuild_feedback_breakdown"
|
|
}
|
|
return s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: eventType,
|
|
TargetType: targetType,
|
|
TargetID: &targetIDValue,
|
|
Payload: mustJSONRaw(map[string]any{
|
|
"incident_id": input.IncidentID,
|
|
"reporter_node_id": input.ReporterNodeID,
|
|
"route_id": input.RouteID,
|
|
"service_class": input.ServiceClass,
|
|
"generation": input.Generation,
|
|
"guard_status": input.GuardStatus,
|
|
"feedback_source": input.FeedbackSource,
|
|
"feedback_channel_id": input.FeedbackChannelID,
|
|
"feedback_violation_status": input.FeedbackViolationStatus,
|
|
"drilldown_source": input.DrilldownSource,
|
|
"reason": input.Reason,
|
|
}),
|
|
CreatedAt: now.UTC(),
|
|
})
|
|
}
|
|
|
|
func (s *Service) SilenceFabricServiceChannelRouteRebuildAlert(ctx context.Context, input SilenceFabricServiceChannelRouteRebuildAlertInput) (FabricServiceChannelRouteRebuildAlertSilence, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return FabricServiceChannelRouteRebuildAlertSilence{}, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.ReporterNodeID = strings.TrimSpace(input.ReporterNodeID)
|
|
input.RouteID = strings.TrimSpace(input.RouteID)
|
|
input.GuardStatus = strings.TrimSpace(input.GuardStatus)
|
|
input.Generation = strings.TrimSpace(input.Generation)
|
|
input.Reason = strings.TrimSpace(input.Reason)
|
|
input.IncidentSource = strings.TrimSpace(input.IncidentSource)
|
|
input.ChannelID = strings.TrimSpace(input.ChannelID)
|
|
if input.ClusterID == "" || input.ReporterNodeID == "" || input.RouteID == "" || input.GuardStatus == "" {
|
|
return FabricServiceChannelRouteRebuildAlertSilence{}, ErrInvalidPayload
|
|
}
|
|
requestedRouteID := input.RouteID
|
|
if input.IncidentSource == "access_decision" || input.IncidentSource == "data_plane_contract" {
|
|
if input.ChannelID == "" {
|
|
return FabricServiceChannelRouteRebuildAlertSilence{}, ErrInvalidPayload
|
|
}
|
|
input.RouteID = fabricServiceChannelAccessDecisionSilenceRouteID(input.ChannelID, input.RouteID)
|
|
}
|
|
if input.TTL <= 0 || input.TTL > 7*24*time.Hour {
|
|
input.TTL = 6 * time.Hour
|
|
}
|
|
now := input.Now
|
|
if now.IsZero() {
|
|
now = s.now()
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
expiresAt := now.UTC().Add(input.TTL)
|
|
silence, err := s.store.UpsertFabricServiceChannelRouteRebuildAlertSilence(ctx, input, expiresAt)
|
|
if err != nil {
|
|
return FabricServiceChannelRouteRebuildAlertSilence{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "fabric.service_channel_rebuild_alert.silenced",
|
|
TargetType: "fabric_service_channel_route_rebuild_alert",
|
|
TargetID: &input.RouteID,
|
|
Payload: mustJSONRaw(map[string]any{
|
|
"reporter_node_id": input.ReporterNodeID,
|
|
"route_id": requestedRouteID,
|
|
"stored_route_id": input.RouteID,
|
|
"incident_source": input.IncidentSource,
|
|
"channel_id": input.ChannelID,
|
|
"guard_status": input.GuardStatus,
|
|
"generation": input.Generation,
|
|
"reason": input.Reason,
|
|
"expires_at": expiresAt.UTC().Format(time.RFC3339Nano),
|
|
}),
|
|
CreatedAt: now.UTC(),
|
|
})
|
|
return silence, nil
|
|
}
|
|
|
|
func (s *Service) ListFabricServiceChannelRouteRebuildAlertSilences(ctx context.Context, actorUserID string, clusterID string, now time.Time) ([]FabricServiceChannelRouteRebuildAlertSilence, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
clusterID = strings.TrimSpace(clusterID)
|
|
if clusterID == "" {
|
|
return nil, ErrInvalidPayload
|
|
}
|
|
if now.IsZero() {
|
|
now = s.now()
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
return s.store.ListFabricServiceChannelRouteRebuildAlertSilences(ctx, clusterID, now)
|
|
}
|
|
|
|
func (s *Service) UnsilenceFabricServiceChannelRouteRebuildAlert(ctx context.Context, input UnsilenceFabricServiceChannelRouteRebuildAlertInput) (FabricServiceChannelRouteRebuildAlertSilence, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return FabricServiceChannelRouteRebuildAlertSilence{}, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.SilenceID = strings.TrimSpace(input.SilenceID)
|
|
input.Reason = strings.TrimSpace(input.Reason)
|
|
if input.ClusterID == "" || input.SilenceID == "" {
|
|
return FabricServiceChannelRouteRebuildAlertSilence{}, ErrInvalidPayload
|
|
}
|
|
now := input.Now
|
|
if now.IsZero() {
|
|
now = s.now()
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
silence, err := s.store.DeleteFabricServiceChannelRouteRebuildAlertSilence(ctx, input)
|
|
if err != nil {
|
|
return FabricServiceChannelRouteRebuildAlertSilence{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "fabric.service_channel_rebuild_alert.unsilenced",
|
|
TargetType: "fabric_service_channel_route_rebuild_alert_silence",
|
|
TargetID: &input.SilenceID,
|
|
Payload: mustJSONRaw(map[string]any{
|
|
"reporter_node_id": silence.ReporterNodeID,
|
|
"route_id": silence.DisplayRouteID,
|
|
"stored_route_id": silence.RouteID,
|
|
"incident_source": silence.IncidentSource,
|
|
"channel_id": silence.ChannelID,
|
|
"guard_status": silence.GuardStatus,
|
|
"generation": silence.Generation,
|
|
"reason": input.Reason,
|
|
"unsilenced_at": now.UTC().Format(time.RFC3339Nano),
|
|
}),
|
|
CreatedAt: now.UTC(),
|
|
})
|
|
return silence, nil
|
|
}
|
|
|
|
func (s *Service) enrichFabricServiceChannelRouteRebuildAttempts(ctx context.Context, clusterID string, items []FabricServiceChannelRouteRebuildAttempt, now time.Time) []FabricServiceChannelRouteRebuildAttempt {
|
|
if len(items) == 0 {
|
|
return items
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
heartbeatsByNode := map[string][]NodeHeartbeat{}
|
|
for idx := range items {
|
|
if fabricServiceChannelRouteRebuildHasCorrelationSnapshot(items[idx]) {
|
|
items[idx] = applyFabricServiceChannelRouteRebuildGuard(items[idx], now)
|
|
continue
|
|
}
|
|
nodeID := strings.TrimSpace(items[idx].ReporterNodeID)
|
|
if nodeID == "" {
|
|
continue
|
|
}
|
|
if _, ok := heartbeatsByNode[nodeID]; !ok {
|
|
heartbeats, err := s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, 120)
|
|
if err != nil {
|
|
heartbeats = nil
|
|
}
|
|
heartbeatsByNode[nodeID] = heartbeats
|
|
}
|
|
items[idx] = enrichFabricServiceChannelRouteRebuildAttempt(items[idx], heartbeatsByNode[nodeID], now)
|
|
if fabricServiceChannelRouteRebuildHasRuntimeEvidence(items[idx]) {
|
|
items[idx].CorrelationSnapshotAt = &now
|
|
_ = s.store.UpdateFabricServiceChannelRouteRebuildCorrelationSnapshot(ctx, fabricServiceChannelRouteRebuildCorrelationSnapshotInput(items[idx], now))
|
|
}
|
|
}
|
|
return items
|
|
}
|
|
|
|
func fabricServiceChannelRouteRebuildHasCorrelationSnapshot(item FabricServiceChannelRouteRebuildAttempt) bool {
|
|
return item.CorrelationSnapshotAt != nil && fabricServiceChannelRouteRebuildHasRuntimeEvidence(item)
|
|
}
|
|
|
|
func fabricServiceChannelRouteRebuildHasRuntimeEvidence(item FabricServiceChannelRouteRebuildAttempt) bool {
|
|
return item.NodeTransitionMatched ||
|
|
item.NodeRouteGenerationMatched ||
|
|
item.PostRebuildSelectedRouteID != "" ||
|
|
item.PostRebuildSendPackets > 0 ||
|
|
item.PostRebuildSendFlowPackets > 0
|
|
}
|
|
|
|
func fabricServiceChannelRouteRebuildSnapshotIsStale(item FabricServiceChannelRouteRebuildAttempt, now time.Time, staleAfter time.Duration) bool {
|
|
if item.CorrelationSnapshotAt == nil {
|
|
return true
|
|
}
|
|
if staleAfter <= 0 {
|
|
return false
|
|
}
|
|
snapshotAt := item.CorrelationSnapshotAt.UTC()
|
|
if snapshotAt.IsZero() {
|
|
return true
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
return now.UTC().Sub(snapshotAt) > staleAfter
|
|
}
|
|
|
|
func stripFabricServiceChannelRouteRebuildCorrelation(items []FabricServiceChannelRouteRebuildAttempt) []FabricServiceChannelRouteRebuildAttempt {
|
|
for idx := range items {
|
|
items[idx].NodeTransitionStatus = ""
|
|
items[idx].NodeTransitionGeneration = ""
|
|
items[idx].NodeTransitionObservedAt = ""
|
|
items[idx].NodeTransitionMatched = false
|
|
items[idx].NodeRouteGenerationStatus = ""
|
|
items[idx].NodeRouteGenerationAppliedAt = ""
|
|
items[idx].NodeRouteGenerationWithdrawnAt = ""
|
|
items[idx].NodeRouteGenerationMatched = false
|
|
items[idx].PostRebuildSelectedRouteID = ""
|
|
items[idx].PostRebuildSendPackets = 0
|
|
items[idx].PostRebuildSendFailures = 0
|
|
items[idx].PostRebuildSendFlowPackets = 0
|
|
items[idx].PostRebuildSendFlowDropped = 0
|
|
items[idx].GuardStatus = ""
|
|
items[idx].GuardSeverity = ""
|
|
items[idx].GuardReason = ""
|
|
items[idx].GuardAgeSeconds = 0
|
|
items[idx].GuardTransitionDeadlineSeconds = 0
|
|
items[idx].GuardTrafficDeadlineSeconds = 0
|
|
items[idx].Timeline = nil
|
|
items[idx].CorrelationSnapshotAt = nil
|
|
}
|
|
return items
|
|
}
|
|
|
|
func fabricServiceChannelRouteRebuildCorrelationSnapshotInput(item FabricServiceChannelRouteRebuildAttempt, now time.Time) UpdateFabricServiceChannelRouteRebuildCorrelationSnapshotInput {
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
return UpdateFabricServiceChannelRouteRebuildCorrelationSnapshotInput{
|
|
ID: item.ID,
|
|
NodeTransitionStatus: item.NodeTransitionStatus,
|
|
NodeTransitionGeneration: item.NodeTransitionGeneration,
|
|
NodeTransitionObservedAt: item.NodeTransitionObservedAt,
|
|
NodeTransitionMatched: item.NodeTransitionMatched,
|
|
NodeRouteGenerationStatus: item.NodeRouteGenerationStatus,
|
|
NodeRouteGenerationAppliedAt: item.NodeRouteGenerationAppliedAt,
|
|
NodeRouteGenerationWithdrawnAt: item.NodeRouteGenerationWithdrawnAt,
|
|
NodeRouteGenerationMatched: item.NodeRouteGenerationMatched,
|
|
PostRebuildSelectedRouteID: item.PostRebuildSelectedRouteID,
|
|
PostRebuildSendPackets: item.PostRebuildSendPackets,
|
|
PostRebuildSendFailures: item.PostRebuildSendFailures,
|
|
PostRebuildSendFlowPackets: item.PostRebuildSendFlowPackets,
|
|
PostRebuildSendFlowDropped: item.PostRebuildSendFlowDropped,
|
|
GuardStatus: item.GuardStatus,
|
|
GuardSeverity: item.GuardSeverity,
|
|
GuardReason: item.GuardReason,
|
|
GuardTransitionDeadlineSeconds: item.GuardTransitionDeadlineSeconds,
|
|
GuardTrafficDeadlineSeconds: item.GuardTrafficDeadlineSeconds,
|
|
Timeline: item.Timeline,
|
|
CorrelationSnapshotAt: now.UTC(),
|
|
}
|
|
}
|
|
|
|
func enrichFabricServiceChannelRouteRebuildAttempt(item FabricServiceChannelRouteRebuildAttempt, heartbeats []NodeHeartbeat, now time.Time) FabricServiceChannelRouteRebuildAttempt {
|
|
item.Timeline = append(item.Timeline, FabricServiceChannelRouteRebuildTimelineEvent{
|
|
Stage: "backend_decision",
|
|
Status: firstNonEmptyString(item.RebuildStatus, "unknown"),
|
|
At: item.UpdatedAt.UTC().Format(time.RFC3339Nano),
|
|
RouteID: item.RouteID,
|
|
Generation: item.Generation,
|
|
Payload: mustJSONRaw(map[string]any{
|
|
"rebuild_request_id": item.RebuildRequestID,
|
|
"decision_source": item.DecisionSource,
|
|
"outcome": item.Outcome,
|
|
"replacement_route_id": item.ReplacementRouteID,
|
|
"rebuild_reason": item.RebuildReason,
|
|
}),
|
|
})
|
|
for _, heartbeat := range heartbeats {
|
|
metadata := jsonObject(heartbeat.Metadata)
|
|
runtime := jsonMapPath(metadata, "fabric_service_channel_runtime_report")
|
|
ingress := jsonMapPath(runtime, "ingress")
|
|
transition := jsonMapPath(ingress, "route_manager_transition")
|
|
if !item.NodeTransitionMatched && transitionMatchesRebuildAttempt(transition, item) {
|
|
item.NodeTransitionMatched = true
|
|
item.NodeTransitionStatus = jsonString(transition, "status")
|
|
item.NodeTransitionGeneration = jsonString(transition, "generation")
|
|
item.NodeTransitionObservedAt = firstNonEmptyString(jsonString(transition, "observed_at"), heartbeat.ObservedAt.UTC().Format(time.RFC3339Nano))
|
|
item.Timeline = append(item.Timeline, FabricServiceChannelRouteRebuildTimelineEvent{
|
|
Stage: "node_route_manager_transition",
|
|
Status: item.NodeTransitionStatus,
|
|
At: item.NodeTransitionObservedAt,
|
|
RouteID: item.RouteID,
|
|
Generation: item.NodeTransitionGeneration,
|
|
Payload: mustJSONRaw(transition),
|
|
})
|
|
}
|
|
routeGeneration := jsonMapPath(metadata, "mesh_route_generation_report")
|
|
if !item.NodeRouteGenerationMatched {
|
|
if decision, ok := routeGenerationDecisionForAttempt(routeGeneration, item); ok {
|
|
item.NodeRouteGenerationMatched = true
|
|
item.NodeRouteGenerationStatus = firstNonEmptyString(jsonString(decision, "status"), jsonString(decision, "apply_status"), jsonString(decision, "withdraw_status"))
|
|
item.NodeRouteGenerationAppliedAt = jsonString(decision, "applied_at")
|
|
item.NodeRouteGenerationWithdrawnAt = jsonString(decision, "withdrawn_at")
|
|
item.Timeline = append(item.Timeline, FabricServiceChannelRouteRebuildTimelineEvent{
|
|
Stage: "node_route_generation_apply",
|
|
Status: item.NodeRouteGenerationStatus,
|
|
At: firstNonEmptyString(item.NodeRouteGenerationAppliedAt, item.NodeRouteGenerationWithdrawnAt, heartbeat.ObservedAt.UTC().Format(time.RFC3339Nano)),
|
|
RouteID: item.RouteID,
|
|
Generation: jsonString(decision, "generation"),
|
|
Payload: mustJSONRaw(decision),
|
|
})
|
|
}
|
|
}
|
|
if item.PostRebuildSelectedRouteID == "" && !heartbeat.ObservedAt.Before(item.UpdatedAt) {
|
|
selectedRouteID := jsonString(ingress, "last_selected_route_id")
|
|
if selectedRouteID == item.ReplacementRouteID || selectedRouteID == item.RouteID || selectedRouteID != "" {
|
|
item.PostRebuildSelectedRouteID = selectedRouteID
|
|
item.PostRebuildSendPackets = jsonUint64(ingress, "send_packets")
|
|
item.PostRebuildSendFailures = jsonUint64(ingress, "send_route_failures")
|
|
item.PostRebuildSendFlowPackets = jsonUint64(ingress, "send_flow_packets")
|
|
item.PostRebuildSendFlowDropped = jsonUint64(ingress, "send_flow_dropped")
|
|
item.Timeline = append(item.Timeline, FabricServiceChannelRouteRebuildTimelineEvent{
|
|
Stage: "post_rebuild_traffic",
|
|
Status: "observed",
|
|
At: heartbeat.ObservedAt.UTC().Format(time.RFC3339Nano),
|
|
RouteID: selectedRouteID,
|
|
Generation: jsonString(runtime, "config_version"),
|
|
Payload: mustJSONRaw(map[string]any{
|
|
"last_selected_route_id": selectedRouteID,
|
|
"send_packets": item.PostRebuildSendPackets,
|
|
"send_route_failures": item.PostRebuildSendFailures,
|
|
"send_flow_packets": item.PostRebuildSendFlowPackets,
|
|
"send_flow_dropped": item.PostRebuildSendFlowDropped,
|
|
"recommended_parallel": jsonUint64(ingress, "recommended_parallel_flow_sends"),
|
|
}),
|
|
})
|
|
}
|
|
}
|
|
if item.NodeTransitionMatched && item.NodeRouteGenerationMatched && item.PostRebuildSelectedRouteID != "" {
|
|
break
|
|
}
|
|
}
|
|
sort.SliceStable(item.Timeline, func(i, j int) bool {
|
|
left, leftErr := time.Parse(time.RFC3339Nano, item.Timeline[i].At)
|
|
right, rightErr := time.Parse(time.RFC3339Nano, item.Timeline[j].At)
|
|
if leftErr == nil && rightErr == nil && !left.Equal(right) {
|
|
return left.Before(right)
|
|
}
|
|
return item.Timeline[i].Stage < item.Timeline[j].Stage
|
|
})
|
|
item = applyFabricServiceChannelRouteRebuildGuard(item, now)
|
|
return item
|
|
}
|
|
|
|
// Deadlines that applyFabricServiceChannelRouteRebuildGuard compares against
// the age of a rebuild attempt (time since its UpdatedAt).
const (
	// fabricServiceChannelRebuildTransitionDeadline is the age beyond which a
	// missing node transition or route-generation correlation is rated "bad".
	fabricServiceChannelRebuildTransitionDeadline = time.Minute + 30*time.Second

	// fabricServiceChannelRebuildTrafficDeadline is the age beyond which
	// missing post-rebuild traffic is rated "bad".
	fabricServiceChannelRebuildTrafficDeadline = 3 * time.Minute
)
|
|
|
|
func applyFabricServiceChannelRouteRebuildGuard(item FabricServiceChannelRouteRebuildAttempt, now time.Time) FabricServiceChannelRouteRebuildAttempt {
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
age := now.Sub(item.UpdatedAt)
|
|
if age < 0 {
|
|
age = 0
|
|
}
|
|
item.GuardAgeSeconds = int64(age / time.Second)
|
|
item.GuardTransitionDeadlineSeconds = int64(fabricServiceChannelRebuildTransitionDeadline / time.Second)
|
|
item.GuardTrafficDeadlineSeconds = int64(fabricServiceChannelRebuildTrafficDeadline / time.Second)
|
|
if item.RebuildStatus == "" {
|
|
item.GuardStatus = "unknown"
|
|
item.GuardSeverity = "warn"
|
|
item.GuardReason = "missing_backend_rebuild_status"
|
|
return item
|
|
}
|
|
if item.RebuildStatus == "pending_degraded_fallback" {
|
|
if item.NodeTransitionMatched {
|
|
item.GuardStatus = "pending_degraded_fallback_seen"
|
|
item.GuardSeverity = "warn"
|
|
item.GuardReason = "node_confirmed_pending_degraded_fallback"
|
|
return item
|
|
}
|
|
if age > fabricServiceChannelRebuildTransitionDeadline {
|
|
item.GuardStatus = "missing_node_transition"
|
|
item.GuardSeverity = "bad"
|
|
item.GuardReason = "node_did_not_report_pending_fallback_transition"
|
|
return item
|
|
}
|
|
item.GuardStatus = "pending_node_transition"
|
|
item.GuardSeverity = "warn"
|
|
item.GuardReason = "waiting_for_node_pending_fallback_transition"
|
|
return item
|
|
}
|
|
if item.RebuildStatus != "applied" {
|
|
item.GuardStatus = "not_applied"
|
|
item.GuardSeverity = "warn"
|
|
item.GuardReason = "backend_rebuild_not_applied"
|
|
return item
|
|
}
|
|
if !item.NodeTransitionMatched {
|
|
if age > fabricServiceChannelRebuildTransitionDeadline {
|
|
item.GuardStatus = "missing_node_transition"
|
|
item.GuardSeverity = "bad"
|
|
item.GuardReason = "node_did_not_report_applied_rebuild_transition"
|
|
return item
|
|
}
|
|
item.GuardStatus = "pending_node_transition"
|
|
item.GuardSeverity = "warn"
|
|
item.GuardReason = "waiting_for_node_applied_rebuild_transition"
|
|
return item
|
|
}
|
|
if !item.NodeRouteGenerationMatched {
|
|
if age > fabricServiceChannelRebuildTransitionDeadline {
|
|
item.GuardStatus = "missing_route_generation"
|
|
item.GuardSeverity = "bad"
|
|
item.GuardReason = "node_transition_seen_but_route_generation_not_correlated"
|
|
return item
|
|
}
|
|
item.GuardStatus = "pending_route_generation"
|
|
item.GuardSeverity = "warn"
|
|
item.GuardReason = "waiting_for_route_generation_correlation"
|
|
return item
|
|
}
|
|
if item.PostRebuildSelectedRouteID == "" {
|
|
if age > fabricServiceChannelRebuildTrafficDeadline {
|
|
item.GuardStatus = "missing_post_rebuild_traffic"
|
|
item.GuardSeverity = "bad"
|
|
item.GuardReason = "no_post_rebuild_traffic_observed"
|
|
return item
|
|
}
|
|
item.GuardStatus = "pending_post_rebuild_traffic"
|
|
item.GuardSeverity = "warn"
|
|
item.GuardReason = "waiting_for_post_rebuild_traffic"
|
|
return item
|
|
}
|
|
if item.ReplacementRouteID != "" && item.PostRebuildSelectedRouteID != item.ReplacementRouteID {
|
|
item.GuardStatus = "unexpected_post_rebuild_route"
|
|
item.GuardSeverity = "bad"
|
|
item.GuardReason = "post_rebuild_selected_route_differs_from_replacement"
|
|
return item
|
|
}
|
|
if item.PostRebuildSendFailures > 0 || item.PostRebuildSendFlowDropped > 0 {
|
|
item.GuardStatus = "post_rebuild_degraded"
|
|
item.GuardSeverity = "warn"
|
|
item.GuardReason = "post_rebuild_traffic_has_failures_or_drops"
|
|
return item
|
|
}
|
|
item.GuardStatus = "ok"
|
|
item.GuardSeverity = "good"
|
|
item.GuardReason = "backend_decision_node_transition_and_post_rebuild_traffic_correlated"
|
|
return item
|
|
}
|
|
|
|
// sortedStringSetKeys returns the members of a string set in ascending order.
// A nil or empty set yields nil.
func sortedStringSetKeys(values map[string]struct{}) []string {
	size := len(values)
	if size == 0 {
		return nil
	}

	keys := make([]string, size)
	next := 0
	for key := range values {
		keys[next] = key
		next++
	}
	sort.Strings(keys)
	return keys
}
|
|
|
|
func applyFabricServiceChannelRouteRebuildAlertSilences(items []FabricServiceChannelRouteRebuildAttempt, silences []FabricServiceChannelRouteRebuildAlertSilence) []FabricServiceChannelRouteRebuildAttempt {
|
|
if len(items) == 0 || len(silences) == 0 {
|
|
return items
|
|
}
|
|
byKey := map[string]FabricServiceChannelRouteRebuildAlertSilence{}
|
|
for _, silence := range silences {
|
|
byKey[fabricServiceChannelRebuildAlertSilenceKey(silence.ReporterNodeID, silence.RouteID, silence.GuardStatus, silence.Generation)] = silence
|
|
}
|
|
for idx := range items {
|
|
item := &items[idx]
|
|
silence, ok := byKey[fabricServiceChannelRebuildAlertSilenceKey(item.ReporterNodeID, item.RouteID, item.GuardStatus, item.Generation)]
|
|
if !ok {
|
|
continue
|
|
}
|
|
item.AlertSilenced = true
|
|
item.AlertSilenceID = silence.ID
|
|
item.AlertSilenceReason = silence.Reason
|
|
item.AlertSilencedUntil = &silence.ExpiresAt
|
|
}
|
|
byResurfaceKey := map[string]FabricServiceChannelRouteRebuildAlertSilence{}
|
|
for _, silence := range silences {
|
|
key := fabricServiceChannelRebuildAlertResurfaceKey(silence.ReporterNodeID, silence.RouteID, silence.GuardStatus)
|
|
current, ok := byResurfaceKey[key]
|
|
if !ok || silence.CreatedAt.After(current.CreatedAt) {
|
|
byResurfaceKey[key] = silence
|
|
}
|
|
}
|
|
for idx := range items {
|
|
item := &items[idx]
|
|
if item.AlertSilenced || (item.GuardSeverity != "bad" && item.GuardSeverity != "warn") {
|
|
continue
|
|
}
|
|
silence, ok := byResurfaceKey[fabricServiceChannelRebuildAlertResurfaceKey(item.ReporterNodeID, item.RouteID, item.GuardStatus)]
|
|
if !ok || strings.TrimSpace(silence.Generation) == strings.TrimSpace(item.Generation) {
|
|
continue
|
|
}
|
|
item.AlertResurfaced = true
|
|
item.AlertResurfacedFromSilenceID = silence.ID
|
|
item.AlertResurfacedPreviousGeneration = silence.Generation
|
|
item.AlertResurfacedPreviousUntil = &silence.ExpiresAt
|
|
}
|
|
return items
|
|
}
|
|
|
|
// fabricServiceChannelRebuildAlertSilenceKey builds the exact-match lookup key
// for a silence: the four trimmed parts joined with "|".
func fabricServiceChannelRebuildAlertSilenceKey(reporterNodeID, routeID, guardStatus, generation string) string {
	parts := []string{
		strings.TrimSpace(reporterNodeID),
		strings.TrimSpace(routeID),
		strings.TrimSpace(guardStatus),
		strings.TrimSpace(generation),
	}
	return strings.Join(parts, "|")
}
|
|
|
|
// fabricServiceChannelRebuildAlertResurfaceKey builds the generation-agnostic
// lookup key for a silence: the three trimmed parts joined with "|".
func fabricServiceChannelRebuildAlertResurfaceKey(reporterNodeID, routeID, guardStatus string) string {
	parts := []string{
		strings.TrimSpace(reporterNodeID),
		strings.TrimSpace(routeID),
		strings.TrimSpace(guardStatus),
	}
	return strings.Join(parts, "|")
}
|
|
|
|
func fabricServiceChannelReadinessFromRebuildHealth(summary FabricServiceChannelRouteRebuildHealthSummary) FabricServiceChannelReadiness {
|
|
readiness := FabricServiceChannelReadiness{
|
|
ClusterID: summary.ClusterID,
|
|
ObservedAt: summary.ObservedAt,
|
|
Status: "clean",
|
|
Reason: "no_active_service_channel_rebuild_alerts",
|
|
ActiveAlertCount: summary.ActiveBadCount + summary.ActiveWarnCount,
|
|
ActiveBadCount: summary.ActiveBadCount,
|
|
ActiveWarnCount: summary.ActiveWarnCount,
|
|
ResurfacedCount: summary.ResurfacedCount,
|
|
SilencedCount: summary.SilencedCount,
|
|
MissingTransitionCount: summary.CountsByGuardStatus["missing_node_transition"],
|
|
MissingRouteGenerationCount: summary.CountsByGuardStatus["missing_route_generation"],
|
|
MissingPostTrafficCount: summary.CountsByGuardStatus["missing_post_rebuild_traffic"],
|
|
UnexpectedRouteCount: summary.CountsByGuardStatus["unexpected_post_rebuild_route"],
|
|
PostRebuildDegradedCount: summary.CountsByGuardStatus["post_rebuild_degraded"],
|
|
RecommendedOperatorAction: summary.RecommendedOperatorAction,
|
|
}
|
|
if summary.ResurfacedCount > 0 {
|
|
readiness.BlockingReasons = append(readiness.BlockingReasons, "resurfaced_rebuild_alert")
|
|
}
|
|
if summary.ActiveBadCount > 0 {
|
|
readiness.BlockingReasons = append(readiness.BlockingReasons, "active_bad_rebuild_alert")
|
|
}
|
|
if readiness.MissingTransitionCount > 0 {
|
|
readiness.BlockingReasons = append(readiness.BlockingReasons, "missing_node_transition")
|
|
}
|
|
if readiness.MissingRouteGenerationCount > 0 {
|
|
readiness.BlockingReasons = append(readiness.BlockingReasons, "missing_route_generation")
|
|
}
|
|
if readiness.MissingPostTrafficCount > 0 {
|
|
readiness.BlockingReasons = append(readiness.BlockingReasons, "missing_post_rebuild_traffic")
|
|
}
|
|
if readiness.UnexpectedRouteCount > 0 {
|
|
readiness.BlockingReasons = append(readiness.BlockingReasons, "unexpected_post_rebuild_route")
|
|
}
|
|
if readiness.PostRebuildDegradedCount > 0 {
|
|
readiness.DegradedReasons = append(readiness.DegradedReasons, "post_rebuild_degraded")
|
|
}
|
|
if summary.ActiveWarnCount > 0 {
|
|
readiness.DegradedReasons = append(readiness.DegradedReasons, "active_warn_rebuild_alert")
|
|
}
|
|
if summary.PendingCount > 0 {
|
|
readiness.DegradedReasons = append(readiness.DegradedReasons, "pending_rebuild_attempt")
|
|
}
|
|
if summary.SilencedCount > 0 {
|
|
readiness.DegradedReasons = append(readiness.DegradedReasons, "silenced_alert_under_observation")
|
|
}
|
|
if len(readiness.BlockingReasons) > 0 {
|
|
readiness.Status = "blocked"
|
|
readiness.Reason = readiness.BlockingReasons[0]
|
|
return readiness
|
|
}
|
|
if len(readiness.DegradedReasons) > 0 {
|
|
readiness.Status = "degraded"
|
|
readiness.Reason = readiness.DegradedReasons[0]
|
|
}
|
|
return readiness
|
|
}
|
|
|
|
func fabricServiceChannelRebuildRecommendedAction(summary FabricServiceChannelRouteRebuildHealthSummary) string {
|
|
if summary.AccessNoSafeCount > 0 {
|
|
return "inspect_access_no_safe_recovery_route_pool_and_signed_policy"
|
|
}
|
|
if summary.ActiveBadCount > 0 {
|
|
if summary.ResurfacedCount > 0 {
|
|
return "resurfaced_rebuild_alerts_need_reinspection_new_generation_or_route_changed"
|
|
}
|
|
return "inspect_bad_rebuild_attempts_check_reporter_node_heartbeats_route_generation_and_post_rebuild_traffic"
|
|
}
|
|
if summary.ActiveWarnCount > 0 {
|
|
return "watch_pending_rebuild_attempts_until_node_transition_and_post_rebuild_traffic_arrive"
|
|
}
|
|
if summary.SilencedCount > 0 {
|
|
return "no_active_rebuild_alerts_silenced_alerts_remain_under_observation"
|
|
}
|
|
if summary.TotalAttempts == 0 {
|
|
return "no_rebuild_attempts_observed"
|
|
}
|
|
return "no_operator_action_required"
|
|
}
|
|
|
|
func fabricServiceChannelRouteRebuildIncidentsFromAttempts(clusterID string, items []FabricServiceChannelRouteRebuildAttempt) []FabricServiceChannelRouteRebuildIncident {
|
|
byKey := map[string]*FabricServiceChannelRouteRebuildIncident{}
|
|
for _, item := range items {
|
|
guardStatus := firstNonEmptyString(item.GuardStatus, "unknown")
|
|
guardSeverity := firstNonEmptyString(item.GuardSeverity, "unknown")
|
|
key := strings.Join([]string{item.ReporterNodeID, item.RouteID, item.ServiceClass, item.Generation, guardStatus}, "|")
|
|
incident, ok := byKey[key]
|
|
if !ok {
|
|
fingerprint := hashStringHex(key)
|
|
incident = &FabricServiceChannelRouteRebuildIncident{
|
|
Fingerprint: fingerprint,
|
|
ClusterID: clusterID,
|
|
ReporterNodeID: item.ReporterNodeID,
|
|
RouteID: item.RouteID,
|
|
ServiceClass: item.ServiceClass,
|
|
Generation: item.Generation,
|
|
GuardStatus: guardStatus,
|
|
GuardSeverity: guardSeverity,
|
|
GuardReason: item.GuardReason,
|
|
FirstSeenAt: item.CreatedAt,
|
|
LastSeenAt: item.UpdatedAt,
|
|
LatestReplacementRouteID: item.ReplacementRouteID,
|
|
LatestRebuildStatus: item.RebuildStatus,
|
|
LatestOutcome: item.Outcome,
|
|
AlertSilenced: item.AlertSilenced,
|
|
AlertResurfaced: item.AlertResurfaced,
|
|
}
|
|
byKey[key] = incident
|
|
}
|
|
incident.AttemptCount++
|
|
if item.CreatedAt.Before(incident.FirstSeenAt) {
|
|
incident.FirstSeenAt = item.CreatedAt
|
|
}
|
|
if item.UpdatedAt.After(incident.LastSeenAt) {
|
|
incident.LastSeenAt = item.UpdatedAt
|
|
incident.GuardSeverity = guardSeverity
|
|
incident.GuardReason = item.GuardReason
|
|
incident.LatestReplacementRouteID = item.ReplacementRouteID
|
|
incident.LatestRebuildStatus = item.RebuildStatus
|
|
incident.LatestOutcome = item.Outcome
|
|
}
|
|
incident.AlertSilenced = incident.AlertSilenced || item.AlertSilenced
|
|
if item.AlertResurfaced {
|
|
incident.AlertResurfaced = true
|
|
incident.AlertResurfacedFromSilenceID = item.AlertResurfacedFromSilenceID
|
|
incident.AlertResurfacedCause = item.AlertResurfacedCause
|
|
incident.AlertResurfacedPreviousRouteID = item.AlertResurfacedPreviousRouteID
|
|
incident.AlertResurfacedPreviousChannelID = item.AlertResurfacedPreviousChannelID
|
|
incident.AlertResurfacedPreviousGeneration = item.AlertResurfacedPreviousGeneration
|
|
incident.AlertResurfacedPreviousUntil = item.AlertResurfacedPreviousUntil
|
|
}
|
|
}
|
|
out := make([]FabricServiceChannelRouteRebuildIncident, 0, len(byKey))
|
|
for _, incident := range byKey {
|
|
incident.RecommendedOperatorAction = fabricServiceChannelRebuildIncidentRecommendedAction(*incident)
|
|
out = append(out, *incident)
|
|
}
|
|
for idx := range out {
|
|
out[idx].RecommendedOperatorAction = fabricServiceChannelRebuildIncidentRecommendedAction(out[idx])
|
|
}
|
|
fabricServiceChannelSortRouteRebuildIncidents(out)
|
|
return out
|
|
}
|
|
|
|
func fabricServiceChannelSortRouteRebuildIncidents(out []FabricServiceChannelRouteRebuildIncident) {
|
|
sort.SliceStable(out, func(i, j int) bool {
|
|
leftRank := fabricServiceChannelRebuildIncidentSeverityRank(out[i])
|
|
rightRank := fabricServiceChannelRebuildIncidentSeverityRank(out[j])
|
|
if leftRank != rightRank {
|
|
return leftRank > rightRank
|
|
}
|
|
return out[i].LastSeenAt.After(out[j].LastSeenAt)
|
|
})
|
|
}
|
|
|
|
func fabricServiceChannelAccessDecisionIncidents(clusterID string, telemetry FabricServiceChannelAccessTelemetry) []FabricServiceChannelRouteRebuildIncident {
|
|
out := []FabricServiceChannelRouteRebuildIncident{}
|
|
for _, channel := range telemetry.ActiveChannels {
|
|
if channel.RouteDecisionSource == "" {
|
|
continue
|
|
}
|
|
status, severity, reason := fabricServiceChannelAccessDecisionIncidentState(channel)
|
|
if status == "" {
|
|
continue
|
|
}
|
|
key := strings.Join([]string{"access_decision", channel.ChannelID, channel.RouteDecisionRouteID, status, channel.RouteDecisionGeneration}, "|")
|
|
out = append(out, FabricServiceChannelRouteRebuildIncident{
|
|
Fingerprint: hashStringHex(key),
|
|
ClusterID: clusterID,
|
|
ReporterNodeID: channel.SelectedEntryNodeID,
|
|
RouteID: firstNonEmptyString(channel.RouteDecisionRouteID, channel.PrimaryRouteID),
|
|
ServiceClass: channel.ServiceClass,
|
|
Generation: channel.RouteDecisionGeneration,
|
|
IncidentSource: "access_decision",
|
|
ChannelID: channel.ChannelID,
|
|
GuardStatus: status,
|
|
GuardSeverity: severity,
|
|
GuardReason: reason,
|
|
AttemptCount: 1,
|
|
FirstSeenAt: telemetry.ObservedAt,
|
|
LastSeenAt: telemetry.ObservedAt,
|
|
LatestReplacementRouteID: channel.RouteDecisionReplacementRouteID,
|
|
LatestRebuildStatus: channel.RouteDecisionRebuildStatus,
|
|
LatestOutcome: channel.RouteDecisionSource,
|
|
})
|
|
}
|
|
for idx := range out {
|
|
out[idx].RecommendedOperatorAction = fabricServiceChannelRebuildIncidentRecommendedAction(out[idx])
|
|
}
|
|
fabricServiceChannelSortRouteRebuildIncidents(out)
|
|
return out
|
|
}
|
|
|
|
func fabricServiceChannelDataPlaneContractIncidents(clusterID string, telemetry FabricServiceChannelAccessTelemetry) []FabricServiceChannelRouteRebuildIncident {
|
|
out := []FabricServiceChannelRouteRebuildIncident{}
|
|
for _, channel := range telemetry.ActiveChannels {
|
|
status, severity, reason := fabricServiceChannelDataPlaneContractIncidentState(channel)
|
|
if status == "" {
|
|
continue
|
|
}
|
|
routeID := firstNonEmptyString(channel.RouteDecisionRouteID, channel.PrimaryRouteID, "data_plane")
|
|
generation := firstNonEmptyString(channel.RouteDecisionGeneration, channel.PrimaryRouteID, channel.DataPlane.BackendRelayPolicy, channel.ChannelID)
|
|
key := strings.Join([]string{"data_plane_contract", channel.ChannelID, routeID, status, generation}, "|")
|
|
out = append(out, FabricServiceChannelRouteRebuildIncident{
|
|
Fingerprint: hashStringHex(key),
|
|
ClusterID: clusterID,
|
|
ReporterNodeID: channel.SelectedEntryNodeID,
|
|
RouteID: routeID,
|
|
ServiceClass: channel.ServiceClass,
|
|
Generation: generation,
|
|
IncidentSource: "data_plane_contract",
|
|
ChannelID: channel.ChannelID,
|
|
GuardStatus: status,
|
|
GuardSeverity: severity,
|
|
GuardReason: reason,
|
|
AttemptCount: 1,
|
|
FirstSeenAt: telemetry.ObservedAt,
|
|
LastSeenAt: telemetry.ObservedAt,
|
|
LatestOutcome: firstNonEmptyString(channel.EntryNodeLastWorkingDataTransport, channel.DataPlane.WorkingDataTransport, "unknown"),
|
|
LatestRebuildStatus: firstNonEmptyString(
|
|
channel.EntryNodeLastBackendRelayPolicy,
|
|
channel.DataPlane.BackendRelayPolicy,
|
|
),
|
|
})
|
|
}
|
|
for idx := range out {
|
|
out[idx].RecommendedOperatorAction = fabricServiceChannelRebuildIncidentRecommendedAction(out[idx])
|
|
}
|
|
fabricServiceChannelSortRouteRebuildIncidents(out)
|
|
return out
|
|
}
|
|
|
|
func applyFabricServiceChannelAccessDecisionIncidentSilences(items []FabricServiceChannelRouteRebuildIncident, silences []FabricServiceChannelRouteRebuildAlertSilence) []FabricServiceChannelRouteRebuildIncident {
|
|
if len(items) == 0 || len(silences) == 0 {
|
|
return items
|
|
}
|
|
byKey := map[string]FabricServiceChannelRouteRebuildAlertSilence{}
|
|
byResurfaceKey := map[string]FabricServiceChannelRouteRebuildAlertSilence{}
|
|
byGeneralResurfaceKey := map[string]FabricServiceChannelRouteRebuildAlertSilence{}
|
|
byAccessReporterGuard := map[string]FabricServiceChannelRouteRebuildAlertSilence{}
|
|
for _, silence := range silences {
|
|
byKey[fabricServiceChannelRebuildAlertSilenceKey(silence.ReporterNodeID, silence.RouteID, silence.GuardStatus, silence.Generation)] = silence
|
|
resurfaceKey := fabricServiceChannelRebuildAlertResurfaceKey(silence.ReporterNodeID, silence.RouteID, silence.GuardStatus)
|
|
current, ok := byResurfaceKey[resurfaceKey]
|
|
if !ok || silence.CreatedAt.After(current.CreatedAt) {
|
|
byResurfaceKey[resurfaceKey] = silence
|
|
}
|
|
if channelID, routeID, ok := fabricServiceChannelParseAccessDecisionSilenceRouteID(silence.RouteID); ok {
|
|
_ = channelID
|
|
generalKey := fabricServiceChannelRebuildAlertResurfaceKey(silence.ReporterNodeID, routeID, silence.GuardStatus)
|
|
current, ok := byGeneralResurfaceKey[generalKey]
|
|
if !ok || silence.CreatedAt.After(current.CreatedAt) {
|
|
byGeneralResurfaceKey[generalKey] = silence
|
|
}
|
|
accessKey := fabricServiceChannelRebuildAlertResurfaceKey(silence.ReporterNodeID, "access_decision", silence.GuardStatus)
|
|
current, ok = byAccessReporterGuard[accessKey]
|
|
if !ok || silence.CreatedAt.After(current.CreatedAt) {
|
|
byAccessReporterGuard[accessKey] = silence
|
|
}
|
|
}
|
|
}
|
|
for idx := range items {
|
|
item := &items[idx]
|
|
silenceRouteID := fabricServiceChannelAccessDecisionSilenceRouteID(item.ChannelID, item.RouteID)
|
|
silence, ok := byKey[fabricServiceChannelRebuildAlertSilenceKey(item.ReporterNodeID, silenceRouteID, item.GuardStatus, item.Generation)]
|
|
if ok {
|
|
item.AlertSilenced = true
|
|
continue
|
|
}
|
|
if item.GuardSeverity != "bad" && item.GuardSeverity != "warn" {
|
|
continue
|
|
}
|
|
silence, ok = byResurfaceKey[fabricServiceChannelRebuildAlertResurfaceKey(item.ReporterNodeID, silenceRouteID, item.GuardStatus)]
|
|
if !ok || strings.TrimSpace(silence.Generation) == strings.TrimSpace(item.Generation) {
|
|
generalSilence, generalOK := byGeneralResurfaceKey[fabricServiceChannelRebuildAlertResurfaceKey(item.ReporterNodeID, item.RouteID, item.GuardStatus)]
|
|
if !generalOK || strings.TrimSpace(generalSilence.Generation) == strings.TrimSpace(item.Generation) {
|
|
accessSilence, accessOK := byAccessReporterGuard[fabricServiceChannelRebuildAlertResurfaceKey(item.ReporterNodeID, "access_decision", item.GuardStatus)]
|
|
if !accessOK || !fabricServiceChannelAccessDecisionSilenceDiffers(*item, accessSilence) {
|
|
continue
|
|
}
|
|
generalSilence = accessSilence
|
|
}
|
|
silence = generalSilence
|
|
}
|
|
item.AlertResurfaced = true
|
|
item.AlertResurfacedFromSilenceID = silence.ID
|
|
item.AlertResurfacedCause = fabricServiceChannelAccessDecisionResurfaceCause(*item, silence)
|
|
item.AlertResurfacedPreviousRouteID = silence.DisplayRouteID
|
|
item.AlertResurfacedPreviousChannelID = silence.ChannelID
|
|
item.AlertResurfacedPreviousGeneration = silence.Generation
|
|
item.AlertResurfacedPreviousUntil = &silence.ExpiresAt
|
|
}
|
|
return items
|
|
}
|
|
|
|
func fabricServiceChannelAccessDecisionSilenceDiffers(item FabricServiceChannelRouteRebuildIncident, silence FabricServiceChannelRouteRebuildAlertSilence) bool {
|
|
return strings.TrimSpace(silence.ChannelID) != strings.TrimSpace(item.ChannelID) ||
|
|
strings.TrimSpace(silence.DisplayRouteID) != strings.TrimSpace(item.RouteID) ||
|
|
strings.TrimSpace(silence.Generation) != strings.TrimSpace(item.Generation)
|
|
}
|
|
|
|
func fabricServiceChannelAccessDecisionResurfaceCause(item FabricServiceChannelRouteRebuildIncident, silence FabricServiceChannelRouteRebuildAlertSilence) string {
|
|
if strings.TrimSpace(silence.ChannelID) != "" && strings.TrimSpace(silence.ChannelID) != strings.TrimSpace(item.ChannelID) {
|
|
return "channel_changed"
|
|
}
|
|
if strings.TrimSpace(silence.DisplayRouteID) != "" && strings.TrimSpace(silence.DisplayRouteID) != strings.TrimSpace(item.RouteID) {
|
|
return "route_changed"
|
|
}
|
|
if strings.TrimSpace(silence.Generation) != strings.TrimSpace(item.Generation) {
|
|
return "generation_changed"
|
|
}
|
|
return "resurfaced"
|
|
}
|
|
|
|
// fabricServiceChannelAccessDecisionSilenceRouteID composes the route ID under
// which an access-decision silence is keyed: "access:<channelID>:<routeID>",
// with both parts stripped of surrounding whitespace.
func fabricServiceChannelAccessDecisionSilenceRouteID(channelID string, routeID string) string {
	segments := []string{"access", strings.TrimSpace(channelID), strings.TrimSpace(routeID)}
	return strings.Join(segments, ":")
}
|
|
|
|
// fabricServiceChannelParseAccessDecisionSilenceRouteID splits a route ID of
// the form "access:<channelID>:<routeID>" into its channel and route parts.
//
// The value is split at the first colon after the "access:" prefix, so the
// route part may itself contain colons. Both parts are whitespace-trimmed. The
// boolean is false, and both strings empty, when the prefix is missing, there
// is no separating colon, or either part is blank.
func fabricServiceChannelParseAccessDecisionSilenceRouteID(value string) (string, string, bool) {
	const prefix = "access:"
	trimmed := strings.TrimSpace(value)
	if !strings.HasPrefix(trimmed, prefix) {
		return "", "", false
	}
	channelID, routeID, found := strings.Cut(trimmed[len(prefix):], ":")
	channelID, routeID = strings.TrimSpace(channelID), strings.TrimSpace(routeID)
	if !found || channelID == "" || routeID == "" {
		return "", "", false
	}
	return channelID, routeID, true
}
|
|
|
|
func fabricServiceChannelAccessDecisionIncidentState(channel FabricServiceChannelAccessTelemetryChannel) (string, string, string) {
|
|
switch {
|
|
case fabricServiceChannelRouteDecisionIsNoSafeRecovery(channel):
|
|
return "access_no_safe_recovery", "bad", firstNonEmptyString(channel.RouteDecisionRebuildReason, "no_unfenced_alternate_route")
|
|
case fabricServiceChannelRouteDecisionIsRecovery(channel):
|
|
return "access_recovery_selected", "warn", firstNonEmptyString(channel.RouteDecisionRebuildReason, "recovery_route_selected")
|
|
case channel.RouteDecisionRebuildStatus == "applied" || containsString(channel.RouteDecisionScoreReasons, "service_channel_rebuild_applied"):
|
|
return "access_rebuild_applied", "good", firstNonEmptyString(channel.RouteDecisionRebuildReason, "planner_applied_rebuild")
|
|
case fabricServiceChannelRouteDecisionIsReplacement(channel):
|
|
return "access_replacement_selected", "warn", firstNonEmptyString(channel.RouteDecisionRebuildReason, "replacement_route_selected")
|
|
default:
|
|
return "", "", ""
|
|
}
|
|
}
|
|
|
|
func fabricServiceChannelDataPlaneContractIncidentState(channel FabricServiceChannelAccessTelemetryChannel) (string, string, string) {
|
|
accepted := channel.EntryNodeTotalAccepted > 0 || channel.EntryNodeIntrospectionAccepted > 0 || channel.EntryNodeBackendFallbackCount > 0
|
|
if accepted && channel.EntryNodeDataPlaneContractCount == 0 {
|
|
return "data_plane_contract_not_reported", "bad", "entry_node_accepted_service_channel_without_reporting_data_plane_contract"
|
|
}
|
|
workingTransport := firstNonEmptyString(channel.EntryNodeLastWorkingDataTransport, channel.DataPlane.WorkingDataTransport)
|
|
if workingTransport != "" && workingTransport != "fabric_service_channel" {
|
|
return "data_plane_working_transport_violation", "bad", "working_data_transport_must_be_fabric_service_channel"
|
|
}
|
|
steadyTransport := firstNonEmptyString(channel.EntryNodeLastSteadyStateTransport, channel.DataPlane.SteadyStateTransport)
|
|
if steadyTransport != "" && steadyTransport != "fabric_route" {
|
|
return "data_plane_steady_state_transport_violation", "bad", "steady_state_transport_must_be_fabric_route"
|
|
}
|
|
logicalFlowMode := firstNonEmptyString(channel.EntryNodeLastLogicalFlowMode, channel.DataPlane.LogicalFlowMode)
|
|
if logicalFlowMode != "" && logicalFlowMode != "multi_flow_isolated" {
|
|
return "data_plane_logical_flow_violation", "bad", "logical_flow_mode_must_be_multi_flow_isolated"
|
|
}
|
|
backendRelayPolicy := firstNonEmptyString(channel.EntryNodeLastBackendRelayPolicy, channel.DataPlane.BackendRelayPolicy)
|
|
if channel.EntryNodeBackendFallbackBlockedCount > 0 {
|
|
return firstNonEmptyString(channel.EntryNodeLastDataPlaneViolationStatus, "data_plane_backend_fallback_blocked"), "bad", firstNonEmptyString(channel.EntryNodeLastDataPlaneViolationReason, "backend_fallback_blocked_by_data_plane_policy")
|
|
}
|
|
if channel.EntryNodeFabricRouteSendFailureCount > 0 {
|
|
return firstNonEmptyString(channel.EntryNodeLastDataPlaneViolationStatus, "data_plane_fabric_route_send_failed"), "bad", firstNonEmptyString(channel.EntryNodeLastDataPlaneViolationReason, "fabric_route_send_failed")
|
|
}
|
|
if backendRelayPolicy == "disabled" && (channel.EntryNodeBackendFallbackCount > 0 || channel.ForceBackendFallback) {
|
|
return "data_plane_disabled_backend_relay_observed", "bad", "backend_relay_policy_disabled_but_backend_fallback_was_observed"
|
|
}
|
|
if backendRelayPolicy == "degraded_fallback_only" && channel.EntryNodeBackendFallbackCount > 0 {
|
|
return "data_plane_degraded_backend_relay_observed", "warn", "backend_relay_used_as_degraded_fallback_for_working_data"
|
|
}
|
|
return "", "", ""
|
|
}
|
|
|
|
// hashStringHex returns the SHA-256 digest of value as a lowercase
// hexadecimal string.
func hashStringHex(value string) string {
	hasher := sha256.New()
	hasher.Write([]byte(value)) // hash.Hash documents that Write never returns an error.
	return hex.EncodeToString(hasher.Sum(nil))
}
|
|
|
|
func fabricServiceChannelRebuildIncidentSeverityRank(item FabricServiceChannelRouteRebuildIncident) int {
|
|
if item.AlertResurfaced {
|
|
return 4
|
|
}
|
|
if item.IncidentSource == "access_decision" && item.GuardStatus == "access_no_safe_recovery" {
|
|
return 4
|
|
}
|
|
switch item.GuardSeverity {
|
|
case "bad":
|
|
return 3
|
|
case "warn":
|
|
return 2
|
|
case "good":
|
|
return 1
|
|
default:
|
|
return 0
|
|
}
|
|
}
|
|
|
|
func fabricServiceChannelRebuildIncidentRecommendedAction(item FabricServiceChannelRouteRebuildIncident) string {
|
|
if item.AlertSilenced && !item.AlertResurfaced {
|
|
return "silenced_rebuild_incident_under_observation"
|
|
}
|
|
if item.AlertResurfaced {
|
|
return "open_deep_ledger_for_resurfaced_generation"
|
|
}
|
|
if item.IncidentSource == "access_decision" {
|
|
switch item.GuardStatus {
|
|
case "access_no_safe_recovery":
|
|
return "inspect_access_no_safe_recovery_route_pool_and_signed_policy"
|
|
case "access_recovery_selected":
|
|
return "watch_recovery_route_quality_and_confirm_post_recovery_traffic"
|
|
case "access_rebuild_applied":
|
|
return "confirm_applied_rebuild_runtime_traffic_stays_on_replacement"
|
|
case "access_replacement_selected":
|
|
return "watch_replacement_route_quality_until_applied_or_recovered"
|
|
}
|
|
}
|
|
if item.IncidentSource == "data_plane_contract" {
|
|
switch item.GuardStatus {
|
|
case "data_plane_contract_not_reported":
|
|
return "upgrade_or_restart_entry_node_until_data_plane_contract_is_reported"
|
|
case "data_plane_working_transport_violation", "data_plane_steady_state_transport_violation", "data_plane_logical_flow_violation":
|
|
return "inspect_signed_data_plane_contract_and_node_agent_runtime_path"
|
|
case "data_plane_disabled_backend_relay_observed":
|
|
return "stop_backend_relay_usage_and_restore_fabric_route_before_service_traffic"
|
|
case "data_plane_degraded_backend_relay_observed":
|
|
return "restore_fabric_route_and_treat_backend_relay_as_degraded_only"
|
|
case "backend_fallback_blocked_by_policy", "fabric_route_send_failed_backend_fallback_blocked", "data_plane_backend_fallback_blocked":
|
|
return "restore_fabric_route_or_change_signed_backend_relay_policy_before_retry"
|
|
case "data_plane_fabric_route_send_failed":
|
|
return "inspect_entry_route_runtime_and_restore_fabric_route_delivery"
|
|
}
|
|
}
|
|
switch item.GuardStatus {
|
|
case "missing_node_transition":
|
|
return "open_deep_ledger_check_reporter_heartbeats_and_route_manager_transition"
|
|
case "missing_route_generation":
|
|
return "open_deep_ledger_check_route_generation_apply_or_withdraw"
|
|
case "missing_post_rebuild_traffic":
|
|
return "open_deep_ledger_check_post_rebuild_traffic_and_selected_route"
|
|
case "unexpected_post_rebuild_route":
|
|
return "open_deep_ledger_check_selected_route_vs_replacement"
|
|
case "post_rebuild_degraded":
|
|
return "inspect_post_rebuild_drops_failures_and_route_quality"
|
|
case "ok":
|
|
return "no_operator_action_required"
|
|
default:
|
|
if item.GuardSeverity == "bad" || item.GuardSeverity == "warn" {
|
|
return "open_deep_ledger_for_rebuild_incident"
|
|
}
|
|
return "no_operator_action_required"
|
|
}
|
|
}
|
|
|
|
func transitionMatchesRebuildAttempt(transition map[string]any, item FabricServiceChannelRouteRebuildAttempt) bool {
|
|
if len(transition) == 0 {
|
|
return false
|
|
}
|
|
generation := jsonString(transition, "generation")
|
|
if item.Generation != "" {
|
|
return generation != "" && generation == item.Generation
|
|
}
|
|
status := jsonString(transition, "status")
|
|
return (status == "applied_rebuild" && item.RebuildStatus == "applied") ||
|
|
(status == "pending_degraded_fallback" && item.RebuildStatus == "pending_degraded_fallback")
|
|
}
|
|
|
|
func routeGenerationDecisionForAttempt(report map[string]any, item FabricServiceChannelRouteRebuildAttempt) (map[string]any, bool) {
|
|
for _, key := range []string{"active_decisions", "withdrawn_decisions"} {
|
|
for _, raw := range jsonArray(report, key) {
|
|
decision, ok := raw.(map[string]any)
|
|
if !ok {
|
|
continue
|
|
}
|
|
if jsonString(decision, "route_id") != item.RouteID {
|
|
continue
|
|
}
|
|
generation := jsonString(decision, "generation")
|
|
if item.Generation == "" || generation == "" || generation == item.Generation {
|
|
return decision, true
|
|
}
|
|
}
|
|
}
|
|
return nil, false
|
|
}
|
|
|
|
// jsonObject decodes raw as a JSON object and returns it as a map.
//
// It always returns a non-nil map: empty input, malformed JSON, a non-object
// document (array, string, number, ...) and a JSON null all yield an empty
// map, so callers can read from and write to the result without a nil check.
func jsonObject(raw json.RawMessage) map[string]any {
	if len(raw) == 0 {
		return map[string]any{}
	}
	var out map[string]any
	// Unmarshal validates the document itself, so no separate json.Valid pass
	// is needed. A JSON null decodes without error but leaves out nil, which
	// is normalised to an empty map like every other "no object" case.
	if err := json.Unmarshal(raw, &out); err != nil || out == nil {
		return map[string]any{}
	}
	return out
}
|
|
|
|
// jsonMapPath descends through nested JSON objects following path and returns
// the object found at the end. If any step is missing or is not an object, an
// empty map is returned. With an empty path, raw itself is returned.
func jsonMapPath(raw map[string]any, path ...string) map[string]any {
	node := raw
	for depth := 0; depth < len(path); depth++ {
		child, isObject := node[path[depth]].(map[string]any)
		if !isObject {
			return map[string]any{}
		}
		node = child
	}
	return node
}
|
|
|
|
// jsonArray returns raw[key] when it holds a JSON array, and nil otherwise
// (including when raw is nil or the key is absent).
func jsonArray(raw map[string]any, key string) []any {
	// Indexing a nil map is safe and simply yields no value.
	if list, isArray := raw[key].([]any); isArray {
		return list
	}
	return nil
}
|
|
|
|
// jsonString returns raw[key] with surrounding whitespace removed when it
// holds a string, and "" otherwise (including when raw is nil or the key is
// absent).
func jsonString(raw map[string]any, key string) string {
	text, isString := raw[key].(string)
	if !isString {
		return ""
	}
	return strings.TrimSpace(text)
}
|
|
|
|
func jsonStringArray(raw map[string]any, key string) []string {
|
|
items := jsonArray(raw, key)
|
|
if len(items) == 0 {
|
|
return nil
|
|
}
|
|
out := make([]string, 0, len(items))
|
|
for _, item := range items {
|
|
value, ok := item.(string)
|
|
if !ok {
|
|
continue
|
|
}
|
|
value = strings.TrimSpace(value)
|
|
if value != "" {
|
|
out = append(out, value)
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// jsonInt returns raw[key] as an int. It accepts float64 (truncated toward
// zero), int, int64 and json.Number values; any other type, a missing key or
// a nil map yields 0. A json.Number that does not parse as an integer also
// yields 0.
func jsonInt(raw map[string]any, key string) int {
	field := raw[key] // indexing a nil map is safe and yields nil
	if number, ok := field.(float64); ok {
		return int(number)
	}
	if number, ok := field.(int); ok {
		return number
	}
	if number, ok := field.(int64); ok {
		return int(number)
	}
	if number, ok := field.(json.Number); ok {
		// A parse failure leaves parsed at zero, which is the fallback anyway.
		parsed, _ := number.Int64()
		return int(parsed)
	}
	return 0
}
|
|
|
|
// jsonBool returns raw[key] when it holds a boolean, and false otherwise
// (including when raw is nil or the key is absent).
func jsonBool(raw map[string]any, key string) bool {
	flag, isBool := raw[key].(bool)
	return isBool && flag
}
|
|
|
|
// jsonStringIntMap reads the JSON object at raw[key] as a name -> int map.
//
// Names are whitespace-trimmed and blank names are dropped. Values may be
// float64 (truncated), int, int64 or json.Number (0 when it does not parse as
// an integer); entries of any other type are dropped. It returns nil when the
// key holds no object, the object is empty, or no entry survives.
func jsonStringIntMap(raw map[string]any, key string) map[string]int {
	nested, isObject := raw[key].(map[string]any) // safe on a nil map
	if !isObject || len(nested) == 0 {
		return nil
	}
	counts := make(map[string]int, len(nested))
	for rawName, rawValue := range nested {
		name := strings.TrimSpace(rawName)
		if name == "" {
			continue
		}
		if number, ok := rawValue.(float64); ok {
			counts[name] = int(number)
			continue
		}
		if number, ok := rawValue.(int); ok {
			counts[name] = number
			continue
		}
		if number, ok := rawValue.(int64); ok {
			counts[name] = int(number)
			continue
		}
		if number, ok := rawValue.(json.Number); ok {
			parsed, _ := number.Int64()
			counts[name] = int(parsed)
		}
	}
	if len(counts) == 0 {
		return nil
	}
	return counts
}
|
|
|
|
// copyStringIntMap returns an independent copy of values, or nil when values
// is nil or empty.
func copyStringIntMap(values map[string]int) map[string]int {
	size := len(values)
	if size == 0 {
		return nil
	}
	clone := make(map[string]int, size)
	for name := range values {
		clone[name] = values[name]
	}
	return clone
}
|
|
|
|
// mergeStringIntMap adds every value of source onto the entry with the same
// key in target. It does nothing when target is nil.
func mergeStringIntMap(target map[string]int, source map[string]int) {
	if target == nil {
		return
	}
	// Ranging over a nil or empty source is a no-op.
	for name, delta := range source {
		target[name] += delta
	}
}
|
|
|
|
// mergeMinStringIntMap folds source into target keeping, per key, the smallest
// positive value seen. Source entries with a blank key or a value <= 0 are
// ignored. It does nothing when target is nil.
func mergeMinStringIntMap(target map[string]int, source map[string]int) {
	if target == nil {
		return
	}
	for name, candidate := range source {
		if candidate <= 0 || strings.TrimSpace(name) == "" {
			continue
		}
		if existing, present := target[name]; present && existing <= candidate {
			// The value already stored is at least as small.
			continue
		}
		target[name] = candidate
	}
}
|
|
|
|
// jsonUint64 returns raw[key] as a uint64.
//
// Accepted value types are float64, int, int64, uint64 and json.Number.
// Non-positive numbers, NaN, values of any other type, a missing key and a nil
// map all yield 0. A float64 is truncated toward zero, and one that is too
// large for uint64 is clamped to the maximum uint64 instead of relying on the
// implementation-dependent result of an out-of-range conversion. A json.Number
// must be a non-negative integer literal that fits in uint64; otherwise 0 is
// returned.
func jsonUint64(raw map[string]any, key string) uint64 {
	// 2^64, the first float64 value that no longer fits in a uint64.
	const uint64Limit = float64(1 << 64)

	if raw == nil {
		return 0
	}
	switch value := raw[key].(type) {
	case float64:
		if value >= uint64Limit {
			return ^uint64(0)
		}
		if value > 0 { // false for NaN as well
			return uint64(value)
		}
	case int:
		if value > 0 {
			return uint64(value)
		}
	case int64:
		if value > 0 {
			return uint64(value)
		}
	case uint64:
		return value
	case json.Number:
		// Decoders configured with UseNumber produce json.Number, which the
		// sibling jsonInt helper already understands. Decoding the literal
		// straight into a uint64 covers the full unsigned range and rejects
		// negative or fractional literals.
		var parsed uint64
		if err := json.Unmarshal([]byte(value), &parsed); err == nil {
			return parsed
		}
	}
	return 0
}
|
|
|
|
func (s *Service) ExpireFabricServiceChannelRouteFeedback(ctx context.Context, input ExpireFabricServiceChannelRouteFeedbackInput) (ExpireFabricServiceChannelRouteFeedbackResult, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return ExpireFabricServiceChannelRouteFeedbackResult{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return ExpireFabricServiceChannelRouteFeedbackResult{}, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.ReporterNodeID = strings.TrimSpace(input.ReporterNodeID)
|
|
input.RouteID = strings.TrimSpace(input.RouteID)
|
|
input.ServiceClass = strings.TrimSpace(input.ServiceClass)
|
|
input.Reason = strings.TrimSpace(input.Reason)
|
|
if input.ClusterID == "" || input.RouteID == "" {
|
|
return ExpireFabricServiceChannelRouteFeedbackResult{}, ErrInvalidPayload
|
|
}
|
|
if input.Now.IsZero() {
|
|
input.Now = s.now()
|
|
}
|
|
result, err := s.store.ExpireFabricServiceChannelRouteFeedback(ctx, input)
|
|
if err != nil {
|
|
return ExpireFabricServiceChannelRouteFeedbackResult{}, err
|
|
}
|
|
payload, _ := json.Marshal(map[string]any{
|
|
"reporter_node_id": input.ReporterNodeID,
|
|
"route_id": input.RouteID,
|
|
"service_class": input.ServiceClass,
|
|
"reason": input.Reason,
|
|
"expired_count": result.ExpiredCount,
|
|
"expired_at": result.ExpiredAt,
|
|
"cooldown_until": result.CooldownUntil,
|
|
})
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "fabric.service_channel_route_feedback.expired",
|
|
TargetType: "fabric_service_channel_route",
|
|
TargetID: &input.RouteID,
|
|
Payload: payload,
|
|
CreatedAt: input.Now.UTC(),
|
|
})
|
|
return result, nil
|
|
}
|
|
|
|
func (s *Service) CreateReleaseVersion(ctx context.Context, input CreateReleaseVersionInput) (ReleaseVersion, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return ReleaseVersion{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return ReleaseVersion{}, err
|
|
}
|
|
input.Product = normalizeUpdateToken(input.Product)
|
|
input.Version = strings.TrimSpace(input.Version)
|
|
input.Channel = normalizeUpdateToken(firstNonEmptyString(input.Channel, "dev"))
|
|
input.Status = normalizeUpdateToken(firstNonEmptyString(input.Status, "active"))
|
|
if input.ClusterID == "" || input.Product == "" || input.Version == "" || len(input.Artifacts) == 0 {
|
|
return ReleaseVersion{}, ErrInvalidPayload
|
|
}
|
|
if input.Status != "active" && input.Status != "draft" && input.Status != "revoked" {
|
|
return ReleaseVersion{}, ErrInvalidPayload
|
|
}
|
|
input.Compatibility = defaultJSON(input.Compatibility, `{}`)
|
|
if !json.Valid(input.Compatibility) {
|
|
return ReleaseVersion{}, ErrInvalidPayload
|
|
}
|
|
for i := range input.Artifacts {
|
|
input.Artifacts[i].OS = normalizeUpdateToken(input.Artifacts[i].OS)
|
|
input.Artifacts[i].Arch = normalizeUpdateToken(input.Artifacts[i].Arch)
|
|
input.Artifacts[i].InstallType = normalizeUpdateToken(input.Artifacts[i].InstallType)
|
|
input.Artifacts[i].Kind = normalizeUpdateToken(input.Artifacts[i].Kind)
|
|
input.Artifacts[i].URL = strings.TrimSpace(input.Artifacts[i].URL)
|
|
input.Artifacts[i].SHA256 = strings.TrimSpace(input.Artifacts[i].SHA256)
|
|
input.Artifacts[i].Metadata = defaultJSON(input.Artifacts[i].Metadata, `{}`)
|
|
if input.Artifacts[i].OS == "" || input.Artifacts[i].Arch == "" || input.Artifacts[i].InstallType == "" ||
|
|
input.Artifacts[i].Kind == "" || input.Artifacts[i].URL == "" || input.Artifacts[i].SHA256 == "" ||
|
|
!json.Valid(input.Artifacts[i].Metadata) {
|
|
return ReleaseVersion{}, ErrInvalidPayload
|
|
}
|
|
}
|
|
item, err := s.store.CreateReleaseVersion(ctx, input)
|
|
if err != nil {
|
|
return ReleaseVersion{}, err
|
|
}
|
|
item, err = s.signReleaseVersion(ctx, item, &input.ActorUserID)
|
|
if err != nil {
|
|
return ReleaseVersion{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "release_version.created",
|
|
TargetType: "release_version",
|
|
TargetID: &item.ID,
|
|
Payload: json.RawMessage(`{"production_forwarding":false}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) ListReleaseVersions(ctx context.Context, actorUserID, clusterID, product, channel string) ([]ReleaseVersion, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListReleaseVersions(ctx, clusterID, normalizeUpdateToken(product), normalizeUpdateToken(channel))
|
|
}
|
|
|
|
func (s *Service) UpsertNodeUpdatePolicy(ctx context.Context, input UpsertNodeUpdatePolicyInput) (NodeUpdatePolicy, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return NodeUpdatePolicy{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return NodeUpdatePolicy{}, err
|
|
}
|
|
input.Product = normalizeUpdateToken(input.Product)
|
|
input.Channel = normalizeUpdateToken(firstNonEmptyString(input.Channel, "dev"))
|
|
input.Strategy = normalizeUpdateToken(firstNonEmptyString(input.Strategy, "manual"))
|
|
if input.ClusterID == "" || input.NodeID == "" || input.Product == "" {
|
|
return NodeUpdatePolicy{}, ErrInvalidPayload
|
|
}
|
|
switch input.Strategy {
|
|
case "manual", "canary", "rolling", "pinned":
|
|
default:
|
|
return NodeUpdatePolicy{}, ErrInvalidPayload
|
|
}
|
|
if input.HealthWindowSec <= 0 {
|
|
input.HealthWindowSec = 180
|
|
}
|
|
if input.TargetVersion != nil {
|
|
trimmed := strings.TrimSpace(*input.TargetVersion)
|
|
input.TargetVersion = &trimmed
|
|
}
|
|
item, err := s.store.UpsertNodeUpdatePolicy(ctx, input)
|
|
if err != nil {
|
|
return NodeUpdatePolicy{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "node_update_policy.updated",
|
|
TargetType: "node",
|
|
TargetID: &input.NodeID,
|
|
Payload: json.RawMessage(`{"production_forwarding":false}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) GetNodeUpdatePlan(ctx context.Context, input GetNodeUpdatePlanInput) (NodeUpdatePlan, error) {
|
|
input.Product = normalizeUpdateToken(firstNonEmptyString(input.Product, "rap-node-agent"))
|
|
input.Channel = normalizeUpdateToken(input.Channel)
|
|
input.OS = normalizeUpdateToken(input.OS)
|
|
input.Arch = normalizeUpdateToken(input.Arch)
|
|
input.InstallType = normalizeUpdateToken(input.InstallType)
|
|
input.CurrentVersion = strings.TrimSpace(input.CurrentVersion)
|
|
input.ArtifactOrigin = normalizeArtifactOrigin(input.ArtifactOrigin)
|
|
if input.ClusterID == "" || input.NodeID == "" || input.Product == "" || input.OS == "" || input.Arch == "" || input.InstallType == "" {
|
|
return NodeUpdatePlan{}, ErrInvalidPayload
|
|
}
|
|
policy, err := s.store.GetNodeUpdatePolicy(ctx, input.ClusterID, input.NodeID, input.Product)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return s.signNodeUpdatePlan(ctx, NodeUpdatePlan{
|
|
SchemaVersion: "rap.node_update_plan.v1",
|
|
ClusterID: input.ClusterID,
|
|
NodeID: input.NodeID,
|
|
Product: input.Product,
|
|
CurrentVersion: input.CurrentVersion,
|
|
Action: "none",
|
|
Reason: "no_update_policy",
|
|
ProductionForwarding: false,
|
|
})
|
|
}
|
|
if err != nil {
|
|
return NodeUpdatePlan{}, err
|
|
}
|
|
if input.Channel == "" {
|
|
input.Channel = policy.Channel
|
|
}
|
|
base := NodeUpdatePlan{
|
|
SchemaVersion: "rap.node_update_plan.v1",
|
|
ClusterID: input.ClusterID,
|
|
NodeID: input.NodeID,
|
|
Product: input.Product,
|
|
CurrentVersion: input.CurrentVersion,
|
|
Channel: input.Channel,
|
|
Strategy: policy.Strategy,
|
|
RollbackAllowed: policy.RollbackAllowed,
|
|
HealthWindowSec: policy.HealthWindowSec,
|
|
ProductionForwarding: false,
|
|
}
|
|
if !policy.Enabled {
|
|
base.Action = "none"
|
|
base.Reason = "policy_disabled"
|
|
return s.signNodeUpdatePlan(ctx, base)
|
|
}
|
|
if mismatch, err := s.hostAgentPlatformMismatch(ctx, input); err != nil {
|
|
return NodeUpdatePlan{}, err
|
|
} else if mismatch {
|
|
base.Action = "none"
|
|
base.Reason = "host_agent_artifact_platform_mismatch"
|
|
return s.signNodeUpdatePlan(ctx, base)
|
|
}
|
|
releases, err := s.store.ListReleaseVersions(ctx, input.ClusterID, input.Product, input.Channel)
|
|
if err != nil {
|
|
return NodeUpdatePlan{}, err
|
|
}
|
|
release, artifact, ok := selectReleaseArtifact(releases, input, policy)
|
|
if !ok {
|
|
base.Action = "none"
|
|
base.Reason = "no_matching_artifact"
|
|
return s.signNodeUpdatePlan(ctx, base)
|
|
}
|
|
base.TargetVersion = release.Version
|
|
artifact = absolutizeReleaseArtifact(artifact, input.ArtifactOrigin)
|
|
base.Artifact = &artifact
|
|
if strings.TrimSpace(input.CurrentVersion) == release.Version {
|
|
base.Action = "none"
|
|
base.Reason = "already_current"
|
|
return s.signNodeUpdatePlan(ctx, base)
|
|
}
|
|
base.Action = "update"
|
|
base.Reason = "matching_release_available"
|
|
return s.signNodeUpdatePlan(ctx, base)
|
|
}
|
|
|
|
func (s *Service) ReportNodeUpdateStatus(ctx context.Context, input ReportNodeUpdateStatusInput) (NodeUpdateStatus, error) {
|
|
input.Product = normalizeUpdateToken(firstNonEmptyString(input.Product, "rap-node-agent"))
|
|
input.Phase = normalizeUpdateToken(input.Phase)
|
|
input.Status = normalizeUpdateToken(input.Status)
|
|
if input.ClusterID == "" || input.NodeID == "" || input.Product == "" || input.Phase == "" || input.Status == "" {
|
|
return NodeUpdateStatus{}, ErrInvalidPayload
|
|
}
|
|
input.Payload = defaultJSON(input.Payload, `{}`)
|
|
if !json.Valid(input.Payload) {
|
|
return NodeUpdateStatus{}, ErrInvalidPayload
|
|
}
|
|
if input.ObservedAt.IsZero() {
|
|
input.ObservedAt = s.now()
|
|
}
|
|
return s.store.ReportNodeUpdateStatus(ctx, input)
|
|
}
|
|
|
|
func (s *Service) ListNodeUpdateStatuses(ctx context.Context, actorUserID, clusterID, nodeID string, limit int) ([]NodeUpdateStatus, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
if clusterID == "" || nodeID == "" {
|
|
return nil, ErrInvalidPayload
|
|
}
|
|
return s.store.ListNodeUpdateStatuses(ctx, clusterID, nodeID, limit)
|
|
}
|
|
|
|
func (s *Service) GetNodeUpdateHint(ctx context.Context, clusterID, nodeID string) NodeUpdateHint {
|
|
products := []string{"rap-node-agent", "rap-host-agent"}
|
|
parts := make([]string, 0, len(products))
|
|
activeProducts := make([]string, 0, len(products))
|
|
updateService := s.selectNodeUpdateService(ctx, clusterID, nodeID)
|
|
for _, product := range products {
|
|
policy, err := s.store.GetNodeUpdatePolicy(ctx, clusterID, nodeID, product)
|
|
if err != nil || !policy.Enabled {
|
|
continue
|
|
}
|
|
targetVersion := strings.TrimSpace(updateHintTargetVersion(ctx, s, clusterID, product, policy))
|
|
if targetVersion == "" {
|
|
continue
|
|
}
|
|
activeProducts = append(activeProducts, product)
|
|
parts = append(parts, product+":"+targetVersion+":"+policy.UpdatedAt.UTC().Format(time.RFC3339Nano))
|
|
}
|
|
if len(parts) == 0 {
|
|
return NodeUpdateHint{
|
|
SchemaVersion: "rap.node_update_hint.v1",
|
|
CheckNow: false,
|
|
Reason: "no_enabled_update_policy",
|
|
DeliveryMode: "update_service_subscription",
|
|
SubscriptionStatus: "subscribed",
|
|
UpdateService: updateService,
|
|
FallbackPollSeconds: 21600,
|
|
}
|
|
}
|
|
sort.Strings(parts)
|
|
sort.Strings(activeProducts)
|
|
sum := sha256.Sum256([]byte(strings.Join(parts, "|")))
|
|
return NodeUpdateHint{
|
|
SchemaVersion: "rap.node_update_hint.v1",
|
|
Generation: hex.EncodeToString(sum[:])[:16],
|
|
CheckNow: true,
|
|
Products: activeProducts,
|
|
Reason: "enabled_update_policy",
|
|
DeliveryMode: "update_service_subscription",
|
|
SubscriptionStatus: "subscribed",
|
|
UpdateService: updateService,
|
|
FallbackPollSeconds: 21600,
|
|
}
|
|
}
|
|
|
|
func (s *Service) selectNodeUpdateService(ctx context.Context, clusterID, nodeID string) *NodeUpdateServiceAssignment {
|
|
now := s.now()
|
|
assignment := &NodeUpdateServiceAssignment{
|
|
SchemaVersion: "rap.node_update_service_assignment.v1",
|
|
Status: "control_plane_fallback",
|
|
Reason: "no_healthy_update_cache_service",
|
|
AssignedAt: now,
|
|
ExpiresAt: now.Add(2 * time.Minute),
|
|
}
|
|
candidates, err := s.store.ListNodeUpdateServiceCandidates(ctx, clusterID)
|
|
if err != nil || len(candidates) == 0 {
|
|
return assignment
|
|
}
|
|
selected := candidates[0]
|
|
for _, candidate := range candidates {
|
|
if candidate.NodeID == nodeID {
|
|
selected = candidate
|
|
break
|
|
}
|
|
}
|
|
assignment.NodeID = selected.NodeID
|
|
assignment.NodeName = selected.NodeName
|
|
assignment.Endpoint = selected.Endpoint
|
|
assignment.Region = selected.Region
|
|
assignment.Status = "assigned"
|
|
assignment.Reason = "healthy_update_cache_service"
|
|
assignment.ExpiresAt = now.Add(5 * time.Minute)
|
|
return assignment
|
|
}
|
|
|
|
func updateHintTargetVersion(ctx context.Context, s *Service, clusterID, product string, policy NodeUpdatePolicy) string {
|
|
if policy.TargetVersion != nil {
|
|
return strings.TrimSpace(*policy.TargetVersion)
|
|
}
|
|
releases, err := s.store.ListReleaseVersions(ctx, clusterID, product, policy.Channel)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
for _, release := range releases {
|
|
if release.Status == "active" && strings.TrimSpace(release.Version) != "" {
|
|
return strings.TrimSpace(release.Version)
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func (s *Service) signReleaseVersion(ctx context.Context, item ReleaseVersion, actorUserID *string) (ReleaseVersion, error) {
|
|
authorityKey, err := s.ensureClusterAuthority(ctx, item.ClusterID, actorUserID)
|
|
if err != nil {
|
|
return ReleaseVersion{}, err
|
|
}
|
|
payload := map[string]any{
|
|
"schema_version": "rap.release_version_authority.v1",
|
|
"cluster_id": item.ClusterID,
|
|
"release_id": item.ID,
|
|
"product": item.Product,
|
|
"version": item.Version,
|
|
"channel": item.Channel,
|
|
"artifact_count": len(item.Artifacts),
|
|
"control_plane_only": true,
|
|
"production_forwarding": false,
|
|
}
|
|
rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, s.now())
|
|
if err != nil {
|
|
return ReleaseVersion{}, err
|
|
}
|
|
item.AuthorityPayload = rawPayload
|
|
item.AuthoritySignature = &signature
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) signNodeUpdatePlan(ctx context.Context, plan NodeUpdatePlan) (NodeUpdatePlan, error) {
|
|
authorityKey, err := s.ensureClusterAuthority(ctx, plan.ClusterID, nil)
|
|
if err != nil {
|
|
return NodeUpdatePlan{}, err
|
|
}
|
|
payload := map[string]any{
|
|
"schema_version": "rap.node_update_plan_authority.v1",
|
|
"cluster_id": plan.ClusterID,
|
|
"node_id": plan.NodeID,
|
|
"product": plan.Product,
|
|
"current_version": plan.CurrentVersion,
|
|
"action": plan.Action,
|
|
"target_version": plan.TargetVersion,
|
|
"artifact_sha256": "",
|
|
"control_plane_only": true,
|
|
"production_forwarding": false,
|
|
}
|
|
if plan.Artifact != nil {
|
|
payload["artifact_sha256"] = plan.Artifact.SHA256
|
|
payload["artifact_url"] = plan.Artifact.URL
|
|
}
|
|
rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, s.now())
|
|
if err != nil {
|
|
return NodeUpdatePlan{}, err
|
|
}
|
|
plan.AuthorityPayload = rawPayload
|
|
plan.AuthoritySignature = &signature
|
|
return plan, nil
|
|
}
|
|
|
|
func (s *Service) UpsertFabricTestingFlag(ctx context.Context, input UpsertFabricTestingFlagInput) (FabricTestingFlag, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return FabricTestingFlag{}, err
|
|
}
|
|
input.ScopeType = strings.TrimSpace(input.ScopeType)
|
|
if input.ScopeType == "" {
|
|
return FabricTestingFlag{}, ErrInvalidPayload
|
|
}
|
|
switch input.ScopeType {
|
|
case "platform":
|
|
input.ScopeID = nil
|
|
case "organization", "node":
|
|
if input.ScopeID == nil || strings.TrimSpace(*input.ScopeID) == "" {
|
|
return FabricTestingFlag{}, ErrInvalidPayload
|
|
}
|
|
default:
|
|
return FabricTestingFlag{}, ErrInvalidPayload
|
|
}
|
|
if input.HistoryRetentionHours <= 0 {
|
|
input.HistoryRetentionHours = 24
|
|
}
|
|
input.Metadata = defaultJSON(input.Metadata, `{}`)
|
|
if !json.Valid(input.Metadata) {
|
|
return FabricTestingFlag{}, errors.New("testing flag metadata must be valid json")
|
|
}
|
|
item, err := s.store.UpsertFabricTestingFlag(ctx, input)
|
|
if err != nil {
|
|
return FabricTestingFlag{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "fabric.testing_flag.updated",
|
|
TargetType: input.ScopeType,
|
|
TargetID: input.ScopeID,
|
|
Payload: json.RawMessage(`{"runtime_mesh_enabled":false}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) ListFabricTestingFlags(ctx context.Context, actorUserID string) ([]FabricTestingFlag, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListFabricTestingFlags(ctx)
|
|
}
|
|
|
|
func (s *Service) GetEffectiveNodeTestingFlags(ctx context.Context, clusterID, nodeID string) (EffectiveNodeTestingFlags, error) {
|
|
if clusterID == "" || nodeID == "" {
|
|
return EffectiveNodeTestingFlags{}, ErrInvalidPayload
|
|
}
|
|
return s.store.GetEffectiveNodeTestingFlags(ctx, clusterID, nodeID)
|
|
}
|
|
|
|
func (s *Service) IssueFabricServiceChannelLease(ctx context.Context, input IssueFabricServiceChannelLeaseInput) (FabricServiceChannelLease, error) {
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.OrganizationID = strings.TrimSpace(input.OrganizationID)
|
|
input.UserID = strings.TrimSpace(input.UserID)
|
|
input.ResourceID = strings.TrimSpace(input.ResourceID)
|
|
input.ServiceClass = normalizeFabricServiceClass(input.ServiceClass)
|
|
input.EntryNodeIDs = dedupeStrings(input.EntryNodeIDs)
|
|
input.ExitNodeIDs = dedupeStrings(input.ExitNodeIDs)
|
|
input.PreferredEntryNodeID = strings.TrimSpace(input.PreferredEntryNodeID)
|
|
input.PreferredExitNodeID = strings.TrimSpace(input.PreferredExitNodeID)
|
|
if input.ClusterID == "" || input.OrganizationID == "" || input.UserID == "" || input.ServiceClass == "" || len(input.EntryNodeIDs) == 0 || len(input.ExitNodeIDs) == 0 {
|
|
return FabricServiceChannelLease{}, ErrInvalidPayload
|
|
}
|
|
if !isAllowedFabricServiceClass(input.ServiceClass) {
|
|
return FabricServiceChannelLease{}, ErrInvalidPayload
|
|
}
|
|
ttl := input.TTL
|
|
if ttl <= 0 {
|
|
ttl = time.Minute
|
|
}
|
|
if ttl > 5*time.Minute {
|
|
ttl = 5 * time.Minute
|
|
}
|
|
now := s.now().UTC()
|
|
expiresAt := now.Add(ttl)
|
|
routeGeneration := "fsc-" + now.Format("20060102T150405.000000000Z")
|
|
allowedChannels := normalizeFabricServiceChannels(input.AllowedChannels, input.ServiceClass)
|
|
requiredRoles := normalizeFabricRequiredRoles(input.RequiredRoles, input.ServiceClass)
|
|
cluster, err := s.store.GetCluster(ctx, input.ClusterID)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return FabricServiceChannelLease{}, ErrInvalidCluster
|
|
}
|
|
if err != nil {
|
|
return FabricServiceChannelLease{}, err
|
|
}
|
|
poolPolicy := fabricServiceChannelPoolPolicyFromCluster(cluster)
|
|
entryNodeIDs := fabricServiceChannelEffectivePool(input.EntryNodeIDs, poolPolicy.EntryPoolNodeIDs)
|
|
exitNodeIDs := fabricServiceChannelEffectivePool(input.ExitNodeIDs, poolPolicy.ExitPoolNodeIDs)
|
|
if len(entryNodeIDs) == 0 || len(exitNodeIDs) == 0 {
|
|
return FabricServiceChannelLease{}, ErrInvalidPayload
|
|
}
|
|
selectedEntry := selectFabricServiceChannelPreferredNode(entryNodeIDs, firstNonEmptyString(poolPolicy.PreferredEntryNodeID, input.PreferredEntryNodeID))
|
|
selectedExit := selectFabricServiceChannelPreferredNode(exitNodeIDs, firstNonEmptyString(poolPolicy.PreferredExitNodeID, input.PreferredExitNodeID))
|
|
if selectedEntry == "" || selectedExit == "" {
|
|
return FabricServiceChannelLease{}, ErrInvalidPayload
|
|
}
|
|
intents, err := s.store.ListRouteIntents(ctx, input.ClusterID)
|
|
if err != nil {
|
|
return FabricServiceChannelLease{}, err
|
|
}
|
|
recoveryPolicy := s.fabricServiceChannelRecoveryPolicy(ctx, input.ClusterID)
|
|
routeProvenance := fabricServiceChannelRouteProvenanceFromIntents(intents)
|
|
feedback, err := s.fabricServiceChannelRouteFeedback(ctx, input.ClusterID, entryNodeIDs, now, recoveryPolicy, routeProvenance)
|
|
if err != nil {
|
|
return FabricServiceChannelLease{}, err
|
|
}
|
|
routes := fabricServiceChannelRoutesFromIntents(intents, input.ServiceClass, entryNodeIDs, exitNodeIDs, allowedChannels, routeGeneration, now, expiresAt, feedback, recoveryPolicy)
|
|
primary, alternates := selectFabricServicePrimaryRoute(routes, selectedEntry, selectedExit)
|
|
if primary.RouteID != "" && containsString(entryNodeIDs, primary.SourceNodeID) {
|
|
selectedEntry = primary.SourceNodeID
|
|
}
|
|
if primary.RouteID != "" && containsString(exitNodeIDs, primary.DestinationNodeID) {
|
|
selectedExit = primary.DestinationNodeID
|
|
}
|
|
fallback := FabricServiceChannelFallback{
|
|
Allowed: true,
|
|
Transport: "backend_relay",
|
|
BackendRelay: true,
|
|
Compatibility: true,
|
|
Reason: "compatibility_fallback_available",
|
|
}
|
|
fallback.Allowed = poolPolicy.BackendFallbackAllowed
|
|
fallback.BackendRelay = poolPolicy.BackendFallbackAllowed
|
|
status := FabricServiceChannelStatusReady
|
|
if primary.RouteID == "" {
|
|
if poolPolicy.BackendFallbackAllowed {
|
|
status = FabricServiceChannelStatusDegradedFallback
|
|
fallback.Active = true
|
|
fallback.Degraded = true
|
|
fallback.Reason = "no_authorized_fabric_route_for_selected_entry_exit"
|
|
} else {
|
|
status = "blocked_no_fabric_route"
|
|
fallback.Active = false
|
|
fallback.Degraded = true
|
|
fallback.Reason = "backend_fallback_disabled_by_pool_policy"
|
|
}
|
|
if fabricServiceRoutesFencedForSelectedPair(routes, selectedEntry, selectedExit) {
|
|
fallback.Reason = "fabric_route_rebuild_pending_backend_relay"
|
|
} else if fabricServiceRoutesFencedForPool(routes) {
|
|
fallback.Reason = "fabric_entry_exit_pool_rebuild_pending_backend_relay"
|
|
}
|
|
primary = FabricServiceChannelRoute{
|
|
ClusterID: input.ClusterID,
|
|
ServiceClass: input.ServiceClass,
|
|
SourceNodeID: selectedEntry,
|
|
DestinationNodeID: selectedExit,
|
|
Hops: []string{selectedEntry, selectedExit},
|
|
AllowedChannels: allowedChannels,
|
|
Generation: routeGeneration,
|
|
Status: "missing_route_intent",
|
|
RecoveryPolicy: fabricServiceChannelRecoveryPolicyRef(recoveryPolicy),
|
|
PathScore: 1,
|
|
ScoreReasons: []string{"fallback_until_fabric_route_exists"},
|
|
ExpiresAt: expiresAt,
|
|
}
|
|
} else {
|
|
fallback.Active = false
|
|
fallback.Degraded = false
|
|
}
|
|
channelID := uuidLikeRandom()
|
|
if channelID == "" {
|
|
channelID = "fabric-channel-" + now.Format("20060102T150405.000000000Z")
|
|
}
|
|
token := uuidLikeRandom()
|
|
if token == "" {
|
|
token = channelID
|
|
}
|
|
lease := FabricServiceChannelLease{
|
|
SchemaVersion: "rap.fabric_service_channel_lease.v1",
|
|
ChannelID: channelID,
|
|
ClusterID: input.ClusterID,
|
|
OrganizationID: input.OrganizationID,
|
|
UserID: input.UserID,
|
|
ResourceID: input.ResourceID,
|
|
ServiceClass: input.ServiceClass,
|
|
Status: status,
|
|
SelectedEntryNodeID: selectedEntry,
|
|
SelectedExitNodeID: selectedExit,
|
|
EntryPool: fabricServiceChannelNodePool(entryNodeIDs, "entry", selectedEntry),
|
|
ExitPool: fabricServiceChannelNodePool(exitNodeIDs, "exit", selectedExit),
|
|
RequiredRoles: requiredRoles,
|
|
AllowedChannels: allowedChannels,
|
|
PrimaryRoute: primary,
|
|
AlternateRoutes: alternates,
|
|
RecoveryPolicy: fabricServiceChannelRecoveryPolicyRef(recoveryPolicy),
|
|
PoolPolicy: fabricServiceChannelPoolPolicyRef(poolPolicy),
|
|
DataPlane: fabricServiceChannelDataPlaneContract(input.ServiceClass, poolPolicy, fallback),
|
|
QoS: defaultJSON(input.QoS, defaultFabricServiceQoS(input.ServiceClass)),
|
|
Failover: defaultJSON(input.Failover, fabricServiceFailoverFromPoolPolicy(poolPolicy)),
|
|
Fallback: fallback,
|
|
Token: FabricServiceChannelToken{
|
|
Type: "control_plane_issued_bearer",
|
|
Token: "rap_fsc_" + strings.ReplaceAll(token, "-", ""),
|
|
TTLSeconds: int(ttl.Seconds()),
|
|
IntrospectionPath: "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/introspect",
|
|
},
|
|
EntryHTTP: fabricServiceChannelHTTPIngress(input.ServiceClass),
|
|
RouteGeneration: routeGeneration,
|
|
FencingEpoch: now.UnixNano(),
|
|
IssuedAt: now,
|
|
ExpiresAt: expiresAt,
|
|
Metadata: defaultJSON(input.Metadata, `{}`),
|
|
}
|
|
if signed, err := s.signFabricServiceChannelLease(ctx, lease); err == nil {
|
|
lease = signed
|
|
}
|
|
s.rememberFabricServiceChannelLease(lease)
|
|
if _, err := s.store.StoreFabricServiceChannelLease(ctx, StoreFabricServiceChannelLeaseInput{
|
|
Lease: lease,
|
|
TokenHash: fabricServiceChannelTokenHash(lease.Token.Token),
|
|
}); err != nil {
|
|
return FabricServiceChannelLease{}, err
|
|
}
|
|
return lease, nil
|
|
}
|
|
|
|
func (s *Service) rememberFabricServiceChannelLease(lease FabricServiceChannelLease) {
|
|
if strings.TrimSpace(lease.ClusterID) == "" || strings.TrimSpace(lease.ChannelID) == "" || strings.TrimSpace(lease.Token.Token) == "" {
|
|
return
|
|
}
|
|
now := s.now()
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
s.fabricServiceChannelLeaseMu.Lock()
|
|
defer s.fabricServiceChannelLeaseMu.Unlock()
|
|
if s.fabricServiceChannelLeaseCache == nil {
|
|
s.fabricServiceChannelLeaseCache = map[string]FabricServiceChannelLease{}
|
|
}
|
|
for key, item := range s.fabricServiceChannelLeaseCache {
|
|
if !item.ExpiresAt.IsZero() && !item.ExpiresAt.After(now) {
|
|
delete(s.fabricServiceChannelLeaseCache, key)
|
|
}
|
|
}
|
|
s.fabricServiceChannelLeaseCache[fabricServiceChannelLeaseCacheKey(lease.ClusterID, lease.ChannelID)] = lease
|
|
}
|
|
|
|
func (s *Service) IntrospectFabricServiceChannelLease(ctx context.Context, input IntrospectFabricServiceChannelLeaseInput) (FabricServiceChannelLeaseIntrospection, error) {
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.ChannelID = strings.TrimSpace(input.ChannelID)
|
|
input.ResourceID = strings.TrimSpace(input.ResourceID)
|
|
input.ServiceClass = normalizeFabricServiceClass(input.ServiceClass)
|
|
input.ChannelClass = strings.TrimSpace(strings.ToLower(input.ChannelClass))
|
|
input.Token = strings.TrimSpace(input.Token)
|
|
input.EntryNodeID = strings.TrimSpace(input.EntryNodeID)
|
|
if input.ClusterID == "" || input.ChannelID == "" || input.Token == "" {
|
|
return FabricServiceChannelLeaseIntrospection{}, ErrInvalidPayload
|
|
}
|
|
now := s.now()
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
s.fabricServiceChannelLeaseMu.Lock()
|
|
lease, ok := s.fabricServiceChannelLeaseCache[fabricServiceChannelLeaseCacheKey(input.ClusterID, input.ChannelID)]
|
|
tokenHash := ""
|
|
if ok && !lease.ExpiresAt.IsZero() && !lease.ExpiresAt.After(now) {
|
|
delete(s.fabricServiceChannelLeaseCache, fabricServiceChannelLeaseCacheKey(input.ClusterID, input.ChannelID))
|
|
ok = false
|
|
}
|
|
if ok {
|
|
tokenHash = fabricServiceChannelTokenHash(lease.Token.Token)
|
|
}
|
|
s.fabricServiceChannelLeaseMu.Unlock()
|
|
if !ok {
|
|
record, err := s.store.GetFabricServiceChannelLease(ctx, input.ClusterID, input.ChannelID)
|
|
if err != nil && !errors.Is(err, pgx.ErrNoRows) {
|
|
return FabricServiceChannelLeaseIntrospection{}, err
|
|
}
|
|
if err == nil {
|
|
lease = record.Lease
|
|
tokenHash = strings.TrimSpace(record.TokenHash)
|
|
if !lease.ExpiresAt.IsZero() && !lease.ExpiresAt.After(now) {
|
|
ok = false
|
|
} else {
|
|
ok = true
|
|
s.rememberFabricServiceChannelLease(lease)
|
|
}
|
|
}
|
|
}
|
|
out := FabricServiceChannelLeaseIntrospection{
|
|
SchemaVersion: "rap.fabric_service_channel_introspection.v1",
|
|
ClusterID: input.ClusterID,
|
|
ChannelID: input.ChannelID,
|
|
ResourceID: input.ResourceID,
|
|
ServiceClass: input.ServiceClass,
|
|
AcceptedBy: "introspection",
|
|
Status: "denied",
|
|
Reason: "lease_not_found",
|
|
}
|
|
if !ok {
|
|
return out, nil
|
|
}
|
|
out.ResourceID = lease.ResourceID
|
|
out.ServiceClass = lease.ServiceClass
|
|
out.SelectedEntryNodeID = lease.SelectedEntryNodeID
|
|
out.SelectedExitNodeID = lease.SelectedExitNodeID
|
|
out.AllowedChannels = append([]string{}, lease.AllowedChannels...)
|
|
out.LeaseStatus = lease.Status
|
|
out.PrimaryRoute = lease.PrimaryRoute
|
|
out.DataPlane = lease.DataPlane
|
|
out.RouteGeneration = lease.RouteGeneration
|
|
out.FencingEpoch = lease.FencingEpoch
|
|
out.ExpiresAt = lease.ExpiresAt
|
|
if lease.ClusterID != input.ClusterID ||
|
|
lease.ChannelID != input.ChannelID ||
|
|
tokenHash == "" ||
|
|
tokenHash != fabricServiceChannelTokenHash(input.Token) {
|
|
out.Reason = "lease_token_mismatch"
|
|
return out, nil
|
|
}
|
|
if lease.ResourceID != "" && input.ResourceID != "" && lease.ResourceID != input.ResourceID {
|
|
out.Reason = "resource_mismatch"
|
|
return out, nil
|
|
}
|
|
if input.ServiceClass != "" && lease.ServiceClass != input.ServiceClass {
|
|
out.Reason = "service_class_mismatch"
|
|
return out, nil
|
|
}
|
|
if input.ChannelClass != "" && !containsString(lease.AllowedChannels, input.ChannelClass) {
|
|
out.Reason = "channel_class_not_allowed"
|
|
return out, nil
|
|
}
|
|
if input.EntryNodeID != "" && lease.SelectedEntryNodeID != "" && lease.SelectedEntryNodeID != input.EntryNodeID {
|
|
out.Reason = "entry_node_mismatch"
|
|
return out, nil
|
|
}
|
|
out.Allowed = true
|
|
out.Status = "allowed"
|
|
out.Reason = "lease_introspection_allowed"
|
|
if lease.Status == FabricServiceChannelStatusDegradedFallback || lease.PrimaryRoute.Status == "missing_route_intent" {
|
|
out.ForceBackendFallback = true
|
|
} else {
|
|
out.PreferredRouteID = strings.TrimSpace(lease.PrimaryRoute.RouteID)
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
func (s *Service) ListFabricServiceChannelLeases(ctx context.Context, actorUserID string, input ListFabricServiceChannelLeasesInput) (FabricServiceChannelLeaseMaintenance, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return FabricServiceChannelLeaseMaintenance{}, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.ServiceClass = normalizeFabricServiceClass(input.ServiceClass)
|
|
input.EntryNodeID = strings.TrimSpace(input.EntryNodeID)
|
|
input.ResourceID = strings.TrimSpace(input.ResourceID)
|
|
if input.ClusterID == "" {
|
|
return FabricServiceChannelLeaseMaintenance{}, ErrInvalidPayload
|
|
}
|
|
if input.Limit <= 0 || input.Limit > 500 {
|
|
input.Limit = 100
|
|
}
|
|
now := input.Now
|
|
if now.IsZero() {
|
|
now = s.now()
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
records, err := s.store.ListFabricServiceChannelLeases(ctx, input)
|
|
if err != nil {
|
|
return FabricServiceChannelLeaseMaintenance{}, err
|
|
}
|
|
out := FabricServiceChannelLeaseMaintenance{
|
|
SchemaVersion: "rap.fabric_service_channel_lease_maintenance.v1",
|
|
ClusterID: input.ClusterID,
|
|
Status: "ready",
|
|
Reason: "lease_maintenance_ready",
|
|
ObservedAt: now.UTC(),
|
|
WindowLimit: input.Limit,
|
|
}
|
|
for _, record := range records {
|
|
summary := fabricServiceChannelLeaseSummaryFromRecord(record, now)
|
|
if summary.Expired {
|
|
out.ExpiredCount++
|
|
} else {
|
|
out.ActiveCount++
|
|
}
|
|
out.Leases = append(out.Leases, summary)
|
|
}
|
|
out.ScannedCount = len(out.Leases)
|
|
if out.ExpiredCount > 0 {
|
|
out.Status = "degraded"
|
|
out.Reason = "expired_leases_pending_cleanup"
|
|
out.RecommendedOperatorAction = "Run service-channel lease cleanup to remove expired compatibility lease records."
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
func (s *Service) CleanupFabricServiceChannelLeases(ctx context.Context, input CleanupFabricServiceChannelLeasesInput) (FabricServiceChannelLeaseMaintenance, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return FabricServiceChannelLeaseMaintenance{}, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
if input.ClusterID == "" {
|
|
return FabricServiceChannelLeaseMaintenance{}, ErrInvalidPayload
|
|
}
|
|
if input.Limit <= 0 || input.Limit > 1000 {
|
|
input.Limit = 100
|
|
}
|
|
now := input.Now
|
|
if now.IsZero() {
|
|
now = s.now()
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
deleted, err := s.store.CleanupExpiredFabricServiceChannelLeases(ctx, input.ClusterID, now.UTC(), input.Limit)
|
|
if err != nil {
|
|
return FabricServiceChannelLeaseMaintenance{}, err
|
|
}
|
|
out, err := s.ListFabricServiceChannelLeases(ctx, input.ActorUserID, ListFabricServiceChannelLeasesInput{
|
|
ClusterID: input.ClusterID,
|
|
IncludeExpired: true,
|
|
Limit: input.Limit,
|
|
Now: now,
|
|
})
|
|
if err != nil {
|
|
return FabricServiceChannelLeaseMaintenance{}, err
|
|
}
|
|
out.DeletedExpiredCount = deleted
|
|
out.Status = "ready"
|
|
out.Reason = "expired_leases_cleaned"
|
|
out.RecommendedOperatorAction = ""
|
|
if out.ExpiredCount > 0 {
|
|
out.Status = "degraded"
|
|
out.Reason = "expired_leases_remaining"
|
|
out.RecommendedOperatorAction = "Run cleanup again; expired leases remain beyond the bounded cleanup window."
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
func (s *Service) GetFabricServiceChannelAccessTelemetry(ctx context.Context, actorUserID string, input GetFabricServiceChannelAccessTelemetryInput) (FabricServiceChannelAccessTelemetry, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return FabricServiceChannelAccessTelemetry{}, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
if input.ClusterID == "" {
|
|
return FabricServiceChannelAccessTelemetry{}, ErrInvalidPayload
|
|
}
|
|
if input.Limit <= 0 || input.Limit > 200 {
|
|
input.Limit = 100
|
|
}
|
|
now := input.Now
|
|
if now.IsZero() {
|
|
now = s.now()
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
nodes, err := s.store.ListClusterNodes(ctx, input.ClusterID)
|
|
if err != nil {
|
|
return FabricServiceChannelAccessTelemetry{}, err
|
|
}
|
|
out := FabricServiceChannelAccessTelemetry{
|
|
SchemaVersion: "rap.fabric_service_channel_access_telemetry.v1",
|
|
ClusterID: input.ClusterID,
|
|
Status: "ready",
|
|
Reason: "access_telemetry_ready",
|
|
ObservedAt: now.UTC(),
|
|
NodeCount: len(nodes),
|
|
TrafficClassCounts: map[string]int{},
|
|
RecommendedParallelWindows: map[string]int{},
|
|
}
|
|
for _, node := range nodes {
|
|
if len(out.Nodes) >= input.Limit {
|
|
break
|
|
}
|
|
items, err := s.store.ListNodeTelemetry(ctx, input.ClusterID, node.ID, 5)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
report := map[string]any{}
|
|
var observedAt time.Time
|
|
for _, item := range items {
|
|
payload := jsonObject(item.Payload)
|
|
report = jsonMapPath(payload, "fabric_service_channel_access_report")
|
|
if len(report) > 0 {
|
|
observedAt = item.ObservedAt
|
|
break
|
|
}
|
|
}
|
|
if len(report) == 0 {
|
|
heartbeats, err := s.store.ListNodeHeartbeats(ctx, input.ClusterID, node.ID, 5)
|
|
if err == nil {
|
|
for _, heartbeat := range heartbeats {
|
|
payload := jsonObject(heartbeat.Metadata)
|
|
report = jsonMapPath(payload, "fabric_service_channel_access_report")
|
|
if len(report) > 0 {
|
|
observedAt = heartbeat.ObservedAt
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if len(report) == 0 {
|
|
continue
|
|
}
|
|
nodeReport := FabricServiceChannelAccessTelemetryNode{
|
|
NodeID: node.ID,
|
|
NodeName: node.Name,
|
|
ObservedAt: observedAt,
|
|
TotalAccepted: jsonInt(report, "total"),
|
|
SignedAccepted: jsonInt(report, "signed"),
|
|
IntrospectionAccepted: jsonInt(report, "introspection"),
|
|
LegacyUnsignedAccepted: jsonInt(report, "legacy_unsigned"),
|
|
BackendFallbackCount: jsonInt(report, "backend_fallback"),
|
|
BackendFallbackBlockedCount: jsonInt(report, "backend_fallback_blocked"),
|
|
FabricRouteSendFailureCount: jsonInt(report, "fabric_route_send_failure"),
|
|
DataPlaneContractCount: jsonInt(report, "data_plane_contract"),
|
|
LastDataPlaneMode: jsonString(report, "last_data_plane_mode"),
|
|
LastWorkingDataTransport: jsonString(report, "last_working_data_transport"),
|
|
LastSteadyStateTransport: jsonString(report, "last_steady_state_transport"),
|
|
LastBackendRelayPolicy: jsonString(report, "last_backend_relay_policy"),
|
|
LastLogicalFlowMode: jsonString(report, "last_logical_flow_mode"),
|
|
LastDataPlaneViolationStatus: jsonString(report, "last_data_plane_violation_status"),
|
|
LastDataPlaneViolationReason: jsonString(report, "last_data_plane_violation_reason"),
|
|
}
|
|
if nodeReport.SignedAccepted == 0 {
|
|
nodeReport.SignedAccepted = jsonInt(report, "accepted_by_signed")
|
|
}
|
|
if nodeReport.IntrospectionAccepted == 0 {
|
|
nodeReport.IntrospectionAccepted = jsonInt(report, "accepted_by_introspection")
|
|
}
|
|
if nodeReport.LegacyUnsignedAccepted == 0 {
|
|
nodeReport.LegacyUnsignedAccepted = jsonInt(report, "accepted_by_legacy_unsigned")
|
|
}
|
|
if value := jsonString(report, "last_accepted_at"); value != "" {
|
|
if parsed, err := time.Parse(time.RFC3339Nano, value); err == nil {
|
|
nodeReport.LastAcceptedAt = &parsed
|
|
if out.LatestAcceptedAt == nil || parsed.After(*out.LatestAcceptedAt) {
|
|
latest := parsed
|
|
out.LatestAcceptedAt = &latest
|
|
}
|
|
}
|
|
}
|
|
if heartbeats, err := s.store.ListNodeHeartbeats(ctx, input.ClusterID, node.ID, 1); err == nil && len(heartbeats) > 0 {
|
|
flowScheduler := fabricServiceChannelFlowSchedulerFromHeartbeat(heartbeats[0])
|
|
nodeReport.TrafficClassCounts = jsonStringIntMap(flowScheduler, "traffic_class_counts")
|
|
nodeReport.FlowChannelCount = jsonInt(flowScheduler, "channel_count")
|
|
nodeReport.FlowDropped = jsonInt(flowScheduler, "dropped")
|
|
nodeReport.FlowHighWatermark = jsonInt(flowScheduler, "high_watermark")
|
|
nodeReport.FlowMaxInFlight = jsonInt(flowScheduler, "max_in_flight")
|
|
nodeReport.RecommendedParallelWindows = jsonStringIntMap(flowScheduler, "recommended_parallel_windows")
|
|
nodeReport.AdaptiveBackpressureActive = jsonBool(flowScheduler, "adaptive_backpressure_active")
|
|
nodeReport.AdaptiveBackpressureReason = jsonString(flowScheduler, "adaptive_backpressure_reason")
|
|
nodeReport.AdaptivePolicyFingerprint = jsonString(flowScheduler, "adaptive_policy_fingerprint")
|
|
}
|
|
nodeReport.FlowHealthStatus, nodeReport.FlowHealthReason, _ = fabricServiceChannelFlowHealth(
|
|
nodeReport.TrafficClassCounts,
|
|
nodeReport.FlowDropped,
|
|
nodeReport.FlowHighWatermark,
|
|
nodeReport.FlowMaxInFlight,
|
|
nodeReport.BackendFallbackCount,
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
)
|
|
out.ReportingNodeCount++
|
|
out.TotalAccepted += nodeReport.TotalAccepted
|
|
out.SignedAccepted += nodeReport.SignedAccepted
|
|
out.IntrospectionAccepted += nodeReport.IntrospectionAccepted
|
|
out.LegacyUnsignedAccepted += nodeReport.LegacyUnsignedAccepted
|
|
out.BackendFallbackCount += nodeReport.BackendFallbackCount
|
|
out.BackendFallbackBlockedCount += nodeReport.BackendFallbackBlockedCount
|
|
out.FabricRouteSendFailureCount += nodeReport.FabricRouteSendFailureCount
|
|
out.DataPlaneContractCount += nodeReport.DataPlaneContractCount
|
|
if out.LastDataPlaneMode == "" {
|
|
out.LastDataPlaneMode = nodeReport.LastDataPlaneMode
|
|
}
|
|
if out.LastWorkingDataTransport == "" {
|
|
out.LastWorkingDataTransport = nodeReport.LastWorkingDataTransport
|
|
}
|
|
if out.LastSteadyStateTransport == "" {
|
|
out.LastSteadyStateTransport = nodeReport.LastSteadyStateTransport
|
|
}
|
|
if out.LastBackendRelayPolicy == "" {
|
|
out.LastBackendRelayPolicy = nodeReport.LastBackendRelayPolicy
|
|
}
|
|
if out.LastLogicalFlowMode == "" {
|
|
out.LastLogicalFlowMode = nodeReport.LastLogicalFlowMode
|
|
}
|
|
if out.LastDataPlaneViolationStatus == "" {
|
|
out.LastDataPlaneViolationStatus = nodeReport.LastDataPlaneViolationStatus
|
|
}
|
|
if out.LastDataPlaneViolationReason == "" {
|
|
out.LastDataPlaneViolationReason = nodeReport.LastDataPlaneViolationReason
|
|
}
|
|
mergeStringIntMap(out.TrafficClassCounts, nodeReport.TrafficClassCounts)
|
|
mergeMinStringIntMap(out.RecommendedParallelWindows, nodeReport.RecommendedParallelWindows)
|
|
if nodeReport.AdaptiveBackpressureActive {
|
|
out.AdaptiveBackpressureActive = true
|
|
if out.AdaptiveBackpressureReason == "" {
|
|
out.AdaptiveBackpressureReason = nodeReport.AdaptiveBackpressureReason
|
|
}
|
|
}
|
|
if out.AdaptivePolicyFingerprint == "" {
|
|
out.AdaptivePolicyFingerprint = nodeReport.AdaptivePolicyFingerprint
|
|
}
|
|
out.FlowChannelCount += nodeReport.FlowChannelCount
|
|
out.FlowDropped += nodeReport.FlowDropped
|
|
if nodeReport.FlowHighWatermark > out.FlowHighWatermark {
|
|
out.FlowHighWatermark = nodeReport.FlowHighWatermark
|
|
}
|
|
if nodeReport.FlowMaxInFlight > out.FlowMaxInFlight {
|
|
out.FlowMaxInFlight = nodeReport.FlowMaxInFlight
|
|
}
|
|
out.Nodes = append(out.Nodes, nodeReport)
|
|
}
|
|
if len(out.TrafficClassCounts) == 0 {
|
|
out.TrafficClassCounts = nil
|
|
}
|
|
if len(out.RecommendedParallelWindows) == 0 {
|
|
out.RecommendedParallelWindows = nil
|
|
}
|
|
nodeReportsByID := map[string]FabricServiceChannelAccessTelemetryNode{}
|
|
for _, node := range out.Nodes {
|
|
nodeReportsByID[node.NodeID] = node
|
|
}
|
|
routeManagerByNodeID := map[string]map[string]any{}
|
|
routeManagerTransitionByNodeID := map[string]map[string]any{}
|
|
for _, node := range nodes {
|
|
heartbeats, err := s.store.ListNodeHeartbeats(ctx, input.ClusterID, node.ID, 1)
|
|
if err != nil || len(heartbeats) == 0 {
|
|
continue
|
|
}
|
|
metadata := jsonObject(heartbeats[0].Metadata)
|
|
runtime := jsonMapPath(metadata, "fabric_service_channel_runtime_report")
|
|
ingress := jsonMapPath(runtime, "ingress")
|
|
routeManager := jsonMapPath(ingress, "route_manager")
|
|
if len(routeManager) > 0 {
|
|
routeManagerByNodeID[node.ID] = routeManager
|
|
}
|
|
transition := jsonMapPath(ingress, "route_manager_transition")
|
|
if len(transition) > 0 {
|
|
routeManagerTransitionByNodeID[node.ID] = transition
|
|
}
|
|
}
|
|
feedbackItems, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, ListFabricServiceChannelRouteFeedbackInput{
|
|
ClusterID: input.ClusterID,
|
|
ServiceClass: FabricServiceClassVPNPackets,
|
|
Now: now.UTC(),
|
|
IncludeExpired: false,
|
|
})
|
|
if err != nil {
|
|
return FabricServiceChannelAccessTelemetry{}, err
|
|
}
|
|
feedbackByRouteID := map[string]FabricServiceChannelRouteFeedbackObservation{}
|
|
for _, item := range feedbackItems {
|
|
if strings.TrimSpace(item.RouteID) == "" {
|
|
continue
|
|
}
|
|
current, ok := feedbackByRouteID[item.RouteID]
|
|
if !ok || item.ObservedAt.After(current.ObservedAt) {
|
|
feedbackByRouteID[item.RouteID] = item
|
|
}
|
|
}
|
|
leaseRecords, err := s.store.ListFabricServiceChannelLeases(ctx, ListFabricServiceChannelLeasesInput{
|
|
ClusterID: input.ClusterID,
|
|
IncludeExpired: false,
|
|
Limit: input.Limit,
|
|
Now: now.UTC(),
|
|
})
|
|
if err != nil {
|
|
return FabricServiceChannelAccessTelemetry{}, err
|
|
}
|
|
for _, record := range leaseRecords {
|
|
summary := fabricServiceChannelLeaseSummaryFromRecord(record, now)
|
|
channel := FabricServiceChannelAccessTelemetryChannel{
|
|
ChannelID: summary.ChannelID,
|
|
ResourceID: summary.ResourceID,
|
|
ServiceClass: summary.ServiceClass,
|
|
Status: summary.Status,
|
|
SelectedEntryNodeID: summary.SelectedEntryNodeID,
|
|
SelectedExitNodeID: summary.SelectedExitNodeID,
|
|
PrimaryRouteID: summary.PrimaryRouteID,
|
|
PrimaryRouteStatus: summary.PrimaryRouteStatus,
|
|
ForceBackendFallback: summary.ForceBackendFallback,
|
|
DataPlane: summary.DataPlane,
|
|
ExpiresAt: summary.ExpiresAt,
|
|
}
|
|
if record.Lease.PoolPolicy != nil {
|
|
channel.PoolPolicyFingerprint = record.Lease.PoolPolicy.Fingerprint
|
|
}
|
|
if entryReport, ok := nodeReportsByID[channel.SelectedEntryNodeID]; ok {
|
|
channel.EntryNodeTotalAccepted = entryReport.TotalAccepted
|
|
channel.EntryNodeIntrospectionAccepted = entryReport.IntrospectionAccepted
|
|
channel.EntryNodeBackendFallbackCount = entryReport.BackendFallbackCount
|
|
channel.EntryNodeBackendFallbackBlockedCount = entryReport.BackendFallbackBlockedCount
|
|
channel.EntryNodeFabricRouteSendFailureCount = entryReport.FabricRouteSendFailureCount
|
|
channel.EntryNodeDataPlaneContractCount = entryReport.DataPlaneContractCount
|
|
channel.EntryNodeLastDataPlaneMode = entryReport.LastDataPlaneMode
|
|
channel.EntryNodeLastWorkingDataTransport = entryReport.LastWorkingDataTransport
|
|
channel.EntryNodeLastSteadyStateTransport = entryReport.LastSteadyStateTransport
|
|
channel.EntryNodeLastBackendRelayPolicy = entryReport.LastBackendRelayPolicy
|
|
channel.EntryNodeLastLogicalFlowMode = entryReport.LastLogicalFlowMode
|
|
channel.EntryNodeLastDataPlaneViolationStatus = entryReport.LastDataPlaneViolationStatus
|
|
channel.EntryNodeLastDataPlaneViolationReason = entryReport.LastDataPlaneViolationReason
|
|
channel.EntryNodeTrafficClassCounts = copyStringIntMap(entryReport.TrafficClassCounts)
|
|
channel.EntryNodeFlowChannelCount = entryReport.FlowChannelCount
|
|
channel.EntryNodeFlowDropped = entryReport.FlowDropped
|
|
channel.EntryNodeFlowHighWatermark = entryReport.FlowHighWatermark
|
|
channel.EntryNodeFlowMaxInFlight = entryReport.FlowMaxInFlight
|
|
channel.EntryNodeFlowHealthStatus = entryReport.FlowHealthStatus
|
|
channel.EntryNodeFlowHealthReason = entryReport.FlowHealthReason
|
|
channel.EntryNodeRecommendedParallelWindows = copyStringIntMap(entryReport.RecommendedParallelWindows)
|
|
channel.EntryNodeAdaptiveBackpressureActive = entryReport.AdaptiveBackpressureActive
|
|
channel.EntryNodeAdaptiveBackpressureReason = entryReport.AdaptiveBackpressureReason
|
|
channel.EntryNodeAdaptivePolicyFingerprint = entryReport.AdaptivePolicyFingerprint
|
|
}
|
|
if feedback, ok := feedbackByRouteID[channel.PrimaryRouteID]; ok {
|
|
observedAt := feedback.ObservedAt
|
|
channel.RouteFeedbackStatus = feedback.FeedbackStatus
|
|
channel.RouteFeedbackObservedAt = &observedAt
|
|
channel.RouteFeedbackScoreAdjustment = feedback.ScoreAdjustment
|
|
channel.RouteFeedbackEffectiveScoreAdjustment = feedback.EffectiveScoreAdjustment
|
|
channel.RouteFeedbackReasons = append([]string{}, feedback.Reasons...)
|
|
channel.RouteQualityWindowSampleCount = fabricServiceChannelFeedbackPayloadInt(feedback.Payload, "quality_window_sample_count")
|
|
channel.RouteQualityWindowFailureCount = fabricServiceChannelFeedbackPayloadInt(feedback.Payload, "quality_window_failure_count")
|
|
channel.RouteQualityWindowDropCount = fabricServiceChannelFeedbackPayloadInt(feedback.Payload, "quality_window_drop_count")
|
|
channel.RouteQualityWindowSlowCount = fabricServiceChannelFeedbackPayloadInt(feedback.Payload, "quality_window_slow_count")
|
|
channel.LastSendDurationMs = feedback.LastSendDurationMs
|
|
channel.EntryNodeFlowHealthStatus, channel.EntryNodeFlowHealthReason, _ = fabricServiceChannelFlowHealth(
|
|
channel.EntryNodeTrafficClassCounts,
|
|
channel.EntryNodeFlowDropped,
|
|
channel.EntryNodeFlowHighWatermark,
|
|
channel.EntryNodeFlowMaxInFlight,
|
|
channel.EntryNodeBackendFallbackCount,
|
|
channel.LastSendDurationMs,
|
|
channel.RouteQualityWindowFailureCount,
|
|
channel.RouteQualityWindowDropCount,
|
|
channel.RouteQualityWindowSlowCount,
|
|
)
|
|
out.CorrelatedRouteCount++
|
|
if feedback.FeedbackStatus == "degraded" || feedback.FeedbackStatus == "fenced" || feedback.EffectiveScoreAdjustment < 0 || feedback.ScoreAdjustment < 0 {
|
|
out.DegradedRouteCount++
|
|
}
|
|
}
|
|
channel = fabricServiceChannelAccessRemediation(channel, record.Lease, now)
|
|
channel = fabricServiceChannelAccessRouteDecisionTelemetry(channel, routeManagerByNodeID[channel.SelectedEntryNodeID], routeManagerTransitionByNodeID[channel.SelectedEntryNodeID])
|
|
channel = fabricServiceChannelAccessRemediationExecution(channel, routeManagerByNodeID[channel.SelectedEntryNodeID], routeManagerTransitionByNodeID[channel.SelectedEntryNodeID], now)
|
|
channel = s.fabricServiceChannelAccessRemediationLedgerExecution(ctx, input.ClusterID, channel)
|
|
fabricServiceChannelAccumulateRouteDecisionTelemetry(&out, channel)
|
|
if channel.ForceBackendFallback {
|
|
out.DegradedFallbackChannelCount++
|
|
}
|
|
out.ActiveChannels = append(out.ActiveChannels, channel)
|
|
}
|
|
out.ActiveChannelCount = len(out.ActiveChannels)
|
|
sort.Slice(out.Nodes, func(i, j int) bool {
|
|
if out.Nodes[i].TotalAccepted != out.Nodes[j].TotalAccepted {
|
|
return out.Nodes[i].TotalAccepted > out.Nodes[j].TotalAccepted
|
|
}
|
|
return out.Nodes[i].NodeName < out.Nodes[j].NodeName
|
|
})
|
|
sort.Slice(out.ActiveChannels, func(i, j int) bool {
|
|
if out.ActiveChannels[i].ForceBackendFallback != out.ActiveChannels[j].ForceBackendFallback {
|
|
return out.ActiveChannels[i].ForceBackendFallback
|
|
}
|
|
if out.ActiveChannels[i].RouteFeedbackStatus != out.ActiveChannels[j].RouteFeedbackStatus {
|
|
return out.ActiveChannels[i].RouteFeedbackStatus > out.ActiveChannels[j].RouteFeedbackStatus
|
|
}
|
|
return out.ActiveChannels[i].ExpiresAt.Before(out.ActiveChannels[j].ExpiresAt)
|
|
})
|
|
if out.NoSafeRecoveryDecisionCount > 0 {
|
|
out.Status = "degraded"
|
|
out.Reason = "active_channels_no_safe_recovery"
|
|
out.RecommendedOperatorAction = "Inspect active service-channel route decisions; at least one channel has no safe recovery route."
|
|
} else if out.ReportingNodeCount == 0 {
|
|
out.Status = "degraded"
|
|
out.Reason = "no_access_telemetry_reported"
|
|
out.RecommendedOperatorAction = "Wait for node telemetry or verify fabric_service_channel_access_telemetry capability on node-agent."
|
|
} else if out.DegradedFallbackChannelCount > 0 || out.DegradedRouteCount > 0 {
|
|
out.Status = "degraded"
|
|
out.Reason = "active_channels_degraded"
|
|
out.RecommendedOperatorAction = "Inspect active service-channel routes with backend fallback or degraded route-quality feedback."
|
|
}
|
|
out.FlowHealthStatus, out.FlowHealthReason, _ = fabricServiceChannelFlowHealth(
|
|
out.TrafficClassCounts,
|
|
out.FlowDropped,
|
|
out.FlowHighWatermark,
|
|
out.FlowMaxInFlight,
|
|
out.BackendFallbackCount,
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
)
|
|
for _, channel := range out.ActiveChannels {
|
|
out.FlowHealthStatus, out.FlowHealthReason = fabricServiceChannelWorseFlowHealth(out.FlowHealthStatus, out.FlowHealthReason, channel.EntryNodeFlowHealthStatus, channel.EntryNodeFlowHealthReason)
|
|
}
|
|
if out.FlowHealthStatus == "critical" || out.FlowHealthStatus == "degraded" {
|
|
out.Status = "degraded"
|
|
if out.Reason == "access_telemetry_ready" {
|
|
out.Reason = "flow_health_degraded"
|
|
}
|
|
if out.RecommendedOperatorAction == "" {
|
|
out.RecommendedOperatorAction = fabricServiceChannelFlowHealthAction(out.FlowHealthStatus, out.FlowHealthReason)
|
|
}
|
|
} else if out.FlowHealthStatus == "watch" && out.RecommendedOperatorAction == "" {
|
|
out.RecommendedOperatorAction = fabricServiceChannelFlowHealthAction(out.FlowHealthStatus, out.FlowHealthReason)
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
func fabricServiceChannelFlowHealth(trafficClassCounts map[string]int, flowDropped, flowHighWatermark, flowMaxInFlight, backendFallbackCount int, lastSendDurationMs int64, routeFailureCount, routeDropCount, routeSlowCount int) (string, string, string) {
|
|
switch {
|
|
case flowDropped > 0:
|
|
return "critical", "flow_drops_reported", fabricServiceChannelFlowHealthAction("critical", "flow_drops_reported")
|
|
case routeDropCount > 0:
|
|
return "critical", "route_quality_window_drops_reported", fabricServiceChannelFlowHealthAction("critical", "route_quality_window_drops_reported")
|
|
case backendFallbackCount > 0:
|
|
return "degraded", "backend_fallback_observed", fabricServiceChannelFlowHealthAction("degraded", "backend_fallback_observed")
|
|
case routeFailureCount > 0:
|
|
return "degraded", "route_quality_window_failures_reported", fabricServiceChannelFlowHealthAction("degraded", "route_quality_window_failures_reported")
|
|
case routeSlowCount > 0:
|
|
return "degraded", "route_quality_window_slow_samples_reported", fabricServiceChannelFlowHealthAction("degraded", "route_quality_window_slow_samples_reported")
|
|
case lastSendDurationMs >= 1000:
|
|
return "degraded", "route_send_latency_high", fabricServiceChannelFlowHealthAction("degraded", "route_send_latency_high")
|
|
}
|
|
bulk := trafficClassCounts["bulk"]
|
|
interactive := trafficClassCounts["interactive"] + trafficClassCounts["control"]
|
|
switch {
|
|
case flowHighWatermark >= 64 || flowMaxInFlight >= 16:
|
|
return "degraded", "flow_queue_pressure_high", fabricServiceChannelFlowHealthAction("degraded", "flow_queue_pressure_high")
|
|
case bulk >= 16 && interactive > 0:
|
|
return "watch", "bulk_pressure_with_interactive_qos_observed", fabricServiceChannelFlowHealthAction("watch", "bulk_pressure_with_interactive_qos_observed")
|
|
case bulk >= 16:
|
|
return "watch", "bulk_pressure_observed", fabricServiceChannelFlowHealthAction("watch", "bulk_pressure_observed")
|
|
case flowHighWatermark >= 16 || flowMaxInFlight >= 4:
|
|
return "watch", "flow_queue_pressure_observed", fabricServiceChannelFlowHealthAction("watch", "flow_queue_pressure_observed")
|
|
default:
|
|
return "healthy", "flow_health_ready", fabricServiceChannelFlowHealthAction("healthy", "flow_health_ready")
|
|
}
|
|
}
|
|
|
|
func fabricServiceChannelWorseFlowHealth(currentStatus, currentReason, candidateStatus, candidateReason string) (string, string) {
|
|
if candidateStatus == "" {
|
|
return currentStatus, currentReason
|
|
}
|
|
if fabricServiceChannelFlowHealthRank(candidateStatus) > fabricServiceChannelFlowHealthRank(currentStatus) {
|
|
return candidateStatus, candidateReason
|
|
}
|
|
return currentStatus, currentReason
|
|
}
|
|
|
|
// fabricServiceChannelFlowHealthRank maps a flow-health status to a numeric
// severity: "healthy" is 1, "watch" is 2, "degraded" is 3, and "critical" is 4.
// Any other status, including the empty string, ranks 0.
func fabricServiceChannelFlowHealthRank(status string) int {
	// Listed from least to most severe; the rank is the 1-based position.
	severityOrder := [...]string{"healthy", "watch", "degraded", "critical"}
	for position, known := range severityOrder {
		if status == known {
			return position + 1
		}
	}
	return 0
}
|
|
|
|
// fabricServiceChannelFlowHealthAction returns the operator guidance text for
// a flow-health status.
//
// The reason only affects the "watch" status, where
// "bulk_pressure_with_interactive_qos_observed" selects a more specific
// message. Statuses other than "critical", "degraded", and "watch" receive the
// within-policy message.
func fabricServiceChannelFlowHealthAction(status, reason string) string {
	if status == "critical" {
		return "Reduce or reroute service-channel pressure immediately; inspect flow drops, route drops, and backend fallback before adding user traffic."
	}
	if status == "degraded" {
		return "Inspect service-channel route quality and active entry-node pressure; prefer alternate route or rebuild when degraded evidence persists."
	}
	if status != "watch" {
		return "Flow health is within the current service-channel guard policy."
	}
	if reason == "bulk_pressure_with_interactive_qos_observed" {
		return "Bulk pressure is active while interactive/control remains observable; keep watching latency and drops before increasing load."
	}
	return "Bulk or queue pressure is visible; monitor interactive/control traffic before increasing production load."
}
|
|
|
|
func fabricServiceChannelAccessRemediation(channel FabricServiceChannelAccessTelemetryChannel, lease FabricServiceChannelLease, now time.Time) FabricServiceChannelAccessTelemetryChannel {
|
|
if channel.ForceBackendFallback {
|
|
channel.RemediationAction = "use_backend_fallback"
|
|
channel.RemediationReason = "explicit_backend_fallback_active"
|
|
channel.RecommendedOperatorAction = "Inspect missing/fenced fabric route and keep backend fallback visible until a normal route is available."
|
|
channel.RemediationCommand = fabricServiceChannelAccessRemediationCommand(channel, lease, now)
|
|
return channel
|
|
}
|
|
degraded := channel.RouteFeedbackStatus == "degraded" || channel.RouteFeedbackStatus == "fenced" ||
|
|
channel.RouteFeedbackScoreAdjustment < 0 || channel.RouteFeedbackEffectiveScoreAdjustment < 0
|
|
if !degraded {
|
|
channel.RemediationAction = "none"
|
|
channel.RemediationReason = "active_route_quality_acceptable"
|
|
channel.RecommendedOperatorAction = "No route remediation required."
|
|
return channel
|
|
}
|
|
if containsString(channel.RouteFeedbackReasons, "service_channel_degraded_fallback_recommended") {
|
|
channel.RemediationAction = "use_backend_fallback"
|
|
channel.RemediationReason = "route_feedback_recommends_degraded_fallback"
|
|
channel.RecommendedOperatorAction = "Use explicit degraded backend fallback while route rebuild catches up."
|
|
channel.RemediationCommand = fabricServiceChannelAccessRemediationCommand(channel, lease, now)
|
|
return channel
|
|
}
|
|
if alternate, ok := fabricServiceChannelFirstAuthorizedAlternate(lease.AlternateRoutes, channel.PrimaryRouteID); ok {
|
|
guardStatus, guardReason := fabricServiceChannelRouteAllowedByLeasePool(lease, alternate)
|
|
if guardStatus != "allowed" {
|
|
channel.RemediationAction = "rebuild_route"
|
|
channel.RemediationReason = "alternate_route_rejected_by_pool_policy"
|
|
channel.RemediationRouteID = alternate.RouteID
|
|
channel.RemediationRouteStatus = alternate.Status
|
|
channel.RemediationGuardStatus = guardStatus
|
|
channel.RemediationGuardReason = guardReason
|
|
channel.RecommendedOperatorAction = "Reject the alternate route and rebuild within the signed entry/exit pool policy."
|
|
channel.RemediationCommand = fabricServiceChannelAccessRemediationCommand(channel, lease, now)
|
|
return channel
|
|
}
|
|
channel.RemediationAction = "prefer_alternate_route"
|
|
channel.RemediationReason = "authorized_alternate_route_available"
|
|
channel.RemediationRouteID = alternate.RouteID
|
|
channel.RemediationRouteStatus = alternate.Status
|
|
channel.RemediationGuardStatus = guardStatus
|
|
channel.RemediationGuardReason = guardReason
|
|
channel.RecommendedOperatorAction = "Prefer the authorized alternate route for this active service channel."
|
|
channel.RemediationCommand = fabricServiceChannelAccessRemediationCommand(channel, lease, now)
|
|
return channel
|
|
}
|
|
if containsString(channel.RouteFeedbackReasons, "service_channel_route_rebuild_recommended") || channel.RouteFeedbackStatus == "fenced" {
|
|
channel.RemediationAction = "rebuild_route"
|
|
channel.RemediationReason = "route_feedback_recommends_rebuild"
|
|
channel.RecommendedOperatorAction = "Trigger or wait for route rebuild; keep this distinct from backend fallback."
|
|
channel.RemediationCommand = fabricServiceChannelAccessRemediationCommand(channel, lease, now)
|
|
return channel
|
|
}
|
|
channel.RemediationAction = "inspect_route_quality"
|
|
channel.RemediationReason = "degraded_route_quality_without_replacement"
|
|
channel.RecommendedOperatorAction = "Inspect rolling route quality counters and route feedback provenance."
|
|
channel.RemediationCommand = fabricServiceChannelAccessRemediationCommand(channel, lease, now)
|
|
return channel
|
|
}
|
|
|
|
func fabricServiceChannelAccessRemediationCommand(channel FabricServiceChannelAccessTelemetryChannel, lease FabricServiceChannelLease, now time.Time) *FabricServiceChannelAccessRemediationCommand {
|
|
action := strings.TrimSpace(channel.RemediationAction)
|
|
if action == "" || action == "none" {
|
|
return nil
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
issuedAt := now.UTC()
|
|
expiresAt := issuedAt.Add(60 * time.Second)
|
|
if !channel.ExpiresAt.IsZero() && channel.ExpiresAt.Before(expiresAt) {
|
|
expiresAt = channel.ExpiresAt.UTC()
|
|
}
|
|
routeComponent := firstNonEmptyString(channel.RemediationRouteID, channel.PrimaryRouteID, "no-route")
|
|
return &FabricServiceChannelAccessRemediationCommand{
|
|
SchemaVersion: "rap.fabric_service_channel_access_remediation_command.v1",
|
|
CommandID: "fsc-remediation:" + channel.ChannelID + ":" + action + ":" + routeComponent,
|
|
Action: action,
|
|
ClusterID: lease.ClusterID,
|
|
ChannelID: channel.ChannelID,
|
|
ResourceID: channel.ResourceID,
|
|
ServiceClass: channel.ServiceClass,
|
|
EntryNodeID: channel.SelectedEntryNodeID,
|
|
ExitNodeID: channel.SelectedExitNodeID,
|
|
PrimaryRouteID: channel.PrimaryRouteID,
|
|
ReplacementRouteID: channel.RemediationRouteID,
|
|
ReplacementRouteStatus: channel.RemediationRouteStatus,
|
|
PoolPolicyFingerprint: channel.PoolPolicyFingerprint,
|
|
GuardStatus: firstNonEmptyString(channel.RemediationGuardStatus, "allowed"),
|
|
GuardReason: firstNonEmptyString(channel.RemediationGuardReason, "lease_pool_policy_allows_route"),
|
|
ExecutionStatus: channel.RemediationExecutionStatus,
|
|
ExecutionReason: channel.RemediationExecutionReason,
|
|
ExecutionGeneration: channel.RemediationExecutionGeneration,
|
|
ExecutionObservedAt: channel.RemediationExecutionObservedAt,
|
|
Reason: channel.RemediationReason,
|
|
OperatorAction: channel.RecommendedOperatorAction,
|
|
IssuedAt: issuedAt,
|
|
ExpiresAt: expiresAt,
|
|
}
|
|
}
|
|
|
|
func fabricServiceChannelAccessRemediationExecution(channel FabricServiceChannelAccessTelemetryChannel, routeManager map[string]any, transition map[string]any, now time.Time) FabricServiceChannelAccessTelemetryChannel {
|
|
if channel.RemediationCommand == nil {
|
|
return channel
|
|
}
|
|
if !channel.RemediationCommand.ExpiresAt.IsZero() && !now.IsZero() && !channel.RemediationCommand.ExpiresAt.After(now.UTC()) {
|
|
channel.RemediationExecutionStatus = "expired"
|
|
channel.RemediationExecutionReason = "remediation_command_ttl_expired"
|
|
return fabricServiceChannelSyncRemediationCommandExecution(channel)
|
|
}
|
|
if channel.RemediationGuardStatus == "rejected" || channel.RemediationCommand.GuardStatus == "rejected" {
|
|
channel.RemediationExecutionStatus = "rejected_by_policy_guard"
|
|
channel.RemediationExecutionReason = firstNonEmptyString(channel.RemediationGuardReason, channel.RemediationCommand.GuardReason, "remediation_guard_rejected")
|
|
return fabricServiceChannelSyncRemediationCommandExecution(channel)
|
|
}
|
|
switch channel.RemediationCommand.Action {
|
|
case "prefer_alternate_route":
|
|
if decision, ok := fabricServiceChannelRouteManagerDecisionForCommand(routeManager, *channel.RemediationCommand); ok {
|
|
channel.RemediationExecutionStatus = firstNonEmptyString(jsonString(decision, "rebuild_status"), "observed")
|
|
channel.RemediationExecutionReason = firstNonEmptyString(jsonString(decision, "rebuild_reason"), jsonString(decision, "decision_source"), "route_manager_decision_observed")
|
|
channel.RemediationExecutionGeneration = jsonString(decision, "generation")
|
|
channel.RemediationExecutionObservedAt = firstNonEmptyString(jsonString(routeManager, "last_applied_at"), jsonString(transition, "observed_at"))
|
|
return fabricServiceChannelSyncRemediationCommandExecution(channel)
|
|
}
|
|
channel.RemediationExecutionStatus = "waiting_node_apply"
|
|
channel.RemediationExecutionReason = "route_manager_has_not_reported_command"
|
|
channel.RemediationExecutionObservedAt = jsonString(transition, "observed_at")
|
|
case "rebuild_route":
|
|
if decision, ok := fabricServiceChannelRouteManagerDecisionForCommand(routeManager, *channel.RemediationCommand); ok {
|
|
channel.RemediationExecutionStatus = firstNonEmptyString(jsonString(decision, "rebuild_status"), "pending_rebuild_request")
|
|
channel.RemediationExecutionReason = firstNonEmptyString(jsonString(decision, "rebuild_reason"), jsonString(decision, "decision_source"), "route_manager_rebuild_decision_observed")
|
|
channel.RemediationExecutionGeneration = jsonString(decision, "generation")
|
|
channel.RemediationExecutionObservedAt = firstNonEmptyString(jsonString(routeManager, "last_applied_at"), jsonString(transition, "observed_at"))
|
|
return fabricServiceChannelSyncRemediationCommandExecution(channel)
|
|
}
|
|
channel.RemediationExecutionStatus = "pending_rebuild_request"
|
|
channel.RemediationExecutionReason = "bounded_rebuild_route_command_visible"
|
|
channel.RemediationExecutionObservedAt = jsonString(transition, "observed_at")
|
|
case "use_backend_fallback":
|
|
channel.RemediationExecutionStatus = "degraded_fallback_visible"
|
|
channel.RemediationExecutionReason = "backend_fallback_command_visible"
|
|
default:
|
|
channel.RemediationExecutionStatus = "visible"
|
|
channel.RemediationExecutionReason = "remediation_command_visible"
|
|
}
|
|
return fabricServiceChannelSyncRemediationCommandExecution(channel)
|
|
}
|
|
|
|
func fabricServiceChannelAccessRouteDecisionTelemetry(channel FabricServiceChannelAccessTelemetryChannel, routeManager map[string]any, transition map[string]any) FabricServiceChannelAccessTelemetryChannel {
|
|
decision, ok := fabricServiceChannelRouteManagerDecisionForChannel(routeManager, channel)
|
|
if !ok {
|
|
return channel
|
|
}
|
|
channel.RouteDecisionSource = jsonString(decision, "decision_source")
|
|
channel.RouteDecisionRouteID = jsonString(decision, "route_id")
|
|
channel.RouteDecisionReplacementRouteID = jsonString(decision, "replacement_route_id")
|
|
channel.RouteDecisionRebuildStatus = jsonString(decision, "rebuild_status")
|
|
channel.RouteDecisionRebuildReason = jsonString(decision, "rebuild_reason")
|
|
channel.RouteDecisionGeneration = firstNonEmptyString(jsonString(decision, "generation"), jsonString(decision, "rebuild_request_id"))
|
|
channel.RouteDecisionScoreReasons = jsonStringArray(decision, "score_reasons")
|
|
if channel.RemediationExecutionObservedAt == "" {
|
|
channel.RemediationExecutionObservedAt = firstNonEmptyString(jsonString(routeManager, "last_applied_at"), jsonString(transition, "observed_at"))
|
|
}
|
|
if channel.RouteDecisionSource == "service_channel_feedback_no_alternate" ||
|
|
channel.RouteDecisionRebuildStatus == "pending_degraded_fallback" ||
|
|
containsString(channel.RouteDecisionScoreReasons, "no_unfenced_alternate_route") {
|
|
channel.RemediationAction = firstNonEmptyString(channel.RemediationAction, "use_backend_fallback")
|
|
if channel.RemediationAction == "none" {
|
|
channel.RemediationAction = "use_backend_fallback"
|
|
}
|
|
channel.RemediationReason = "route_decision_no_safe_recovery"
|
|
channel.RemediationExecutionStatus = "route_rebuild_no_safe_recovery"
|
|
channel.RemediationExecutionReason = firstNonEmptyString(channel.RouteDecisionRebuildReason, "no_unfenced_alternate_route")
|
|
channel.RemediationExecutionGeneration = channel.RouteDecisionGeneration
|
|
channel.RecommendedOperatorAction = "No safe recovery route is available; keep degraded fallback visible and rebuild the route pool."
|
|
}
|
|
return channel
|
|
}
|
|
|
|
func fabricServiceChannelRouteManagerDecisionForChannel(routeManager map[string]any, channel FabricServiceChannelAccessTelemetryChannel) (map[string]any, bool) {
|
|
decisionsRaw := jsonArray(routeManager, "decisions")
|
|
if len(decisionsRaw) == 0 {
|
|
return nil, false
|
|
}
|
|
var selected map[string]any
|
|
selectedRank := 0
|
|
for _, raw := range decisionsRaw {
|
|
decision, ok := raw.(map[string]any)
|
|
if !ok || !fabricServiceChannelRouteManagerDecisionMatchesChannel(decision, channel) {
|
|
continue
|
|
}
|
|
rank := fabricServiceChannelRouteManagerDecisionTelemetryRank(decision)
|
|
if rank > selectedRank {
|
|
selected = decision
|
|
selectedRank = rank
|
|
}
|
|
}
|
|
if selected == nil {
|
|
return nil, false
|
|
}
|
|
return selected, true
|
|
}
|
|
|
|
func fabricServiceChannelRouteManagerDecisionMatchesChannel(decision map[string]any, channel FabricServiceChannelAccessTelemetryChannel) bool {
|
|
routeID := jsonString(decision, "route_id")
|
|
replacementRouteID := jsonString(decision, "replacement_route_id")
|
|
if routeID != "" && routeID == channel.PrimaryRouteID {
|
|
return true
|
|
}
|
|
if replacementRouteID != "" && replacementRouteID == channel.PrimaryRouteID {
|
|
return true
|
|
}
|
|
sourceNodeID := jsonString(decision, "source_node_id")
|
|
destinationNodeID := jsonString(decision, "destination_node_id")
|
|
localNodeID := jsonString(decision, "local_node_id")
|
|
return sourceNodeID != "" &&
|
|
destinationNodeID != "" &&
|
|
sourceNodeID == channel.SelectedEntryNodeID &&
|
|
destinationNodeID == channel.SelectedExitNodeID &&
|
|
(localNodeID == "" || localNodeID == channel.SelectedEntryNodeID)
|
|
}
|
|
|
|
func fabricServiceChannelRouteManagerDecisionTelemetryRank(decision map[string]any) int {
|
|
source := jsonString(decision, "decision_source")
|
|
status := jsonString(decision, "rebuild_status")
|
|
reasons := jsonStringArray(decision, "score_reasons")
|
|
switch {
|
|
case source == "service_channel_feedback_no_alternate" ||
|
|
status == "pending_degraded_fallback" ||
|
|
containsString(reasons, "no_unfenced_alternate_route"):
|
|
return 50
|
|
case status == "applied" || containsString(reasons, "service_channel_rebuild_applied"):
|
|
return 40
|
|
case strings.Contains(source, "replacement"):
|
|
return 30
|
|
case status != "":
|
|
return 20
|
|
default:
|
|
return 10
|
|
}
|
|
}
|
|
|
|
func fabricServiceChannelAccumulateRouteDecisionTelemetry(out *FabricServiceChannelAccessTelemetry, channel FabricServiceChannelAccessTelemetryChannel) {
|
|
if out == nil || channel.RouteDecisionSource == "" {
|
|
return
|
|
}
|
|
out.RouteDecisionChannelCount++
|
|
if fabricServiceChannelRouteDecisionIsReplacement(channel) {
|
|
out.ReplacementDecisionCount++
|
|
}
|
|
if channel.RouteDecisionRebuildStatus == "applied" || containsString(channel.RouteDecisionScoreReasons, "service_channel_rebuild_applied") {
|
|
out.AppliedRebuildDecisionCount++
|
|
}
|
|
if fabricServiceChannelRouteDecisionIsRecovery(channel) {
|
|
out.RecoveryDecisionCount++
|
|
}
|
|
if fabricServiceChannelRouteDecisionIsNoSafeRecovery(channel) {
|
|
out.NoSafeRecoveryDecisionCount++
|
|
}
|
|
}
|
|
|
|
func fabricServiceChannelRouteDecisionIsReplacement(channel FabricServiceChannelAccessTelemetryChannel) bool {
|
|
return strings.Contains(channel.RouteDecisionSource, "replacement") ||
|
|
strings.TrimSpace(channel.RouteDecisionReplacementRouteID) != ""
|
|
}
|
|
|
|
func fabricServiceChannelRouteDecisionIsRecovery(channel FabricServiceChannelAccessTelemetryChannel) bool {
|
|
return containsString(channel.RouteDecisionScoreReasons, "service_channel_recovery_promoted") ||
|
|
containsString(channel.RouteDecisionScoreReasons, "service_channel_recovery_hysteresis") ||
|
|
strings.Contains(channel.RouteDecisionRebuildReason, "recovery")
|
|
}
|
|
|
|
func fabricServiceChannelRouteDecisionIsNoSafeRecovery(channel FabricServiceChannelAccessTelemetryChannel) bool {
|
|
return channel.RouteDecisionSource == "service_channel_feedback_no_alternate" ||
|
|
channel.RouteDecisionRebuildStatus == "pending_degraded_fallback" ||
|
|
containsString(channel.RouteDecisionScoreReasons, "no_unfenced_alternate_route")
|
|
}
|
|
|
|
func fabricServiceChannelSyncRemediationCommandExecution(channel FabricServiceChannelAccessTelemetryChannel) FabricServiceChannelAccessTelemetryChannel {
|
|
if channel.RemediationCommand == nil {
|
|
return channel
|
|
}
|
|
channel.RemediationCommand.ExecutionStatus = channel.RemediationExecutionStatus
|
|
channel.RemediationCommand.ExecutionReason = channel.RemediationExecutionReason
|
|
channel.RemediationCommand.ExecutionGeneration = channel.RemediationExecutionGeneration
|
|
channel.RemediationCommand.ExecutionObservedAt = channel.RemediationExecutionObservedAt
|
|
return channel
|
|
}
|
|
|
|
func fabricServiceChannelRouteManagerDecisionForCommand(routeManager map[string]any, command FabricServiceChannelAccessRemediationCommand) (map[string]any, bool) {
|
|
decisionsRaw, ok := routeManager["decisions"].([]any)
|
|
if !ok {
|
|
return nil, false
|
|
}
|
|
for _, raw := range decisionsRaw {
|
|
decision, ok := raw.(map[string]any)
|
|
if !ok {
|
|
continue
|
|
}
|
|
if command.CommandID != "" && jsonString(decision, "rebuild_request_id") == command.CommandID {
|
|
return decision, true
|
|
}
|
|
if jsonString(decision, "route_id") == command.PrimaryRouteID &&
|
|
jsonString(decision, "replacement_route_id") == command.ReplacementRouteID &&
|
|
jsonString(decision, "decision_source") == "service_channel_remediation_command" {
|
|
return decision, true
|
|
}
|
|
}
|
|
return nil, false
|
|
}
|
|
|
|
func (s *Service) fabricServiceChannelAccessRemediationLedgerExecution(ctx context.Context, clusterID string, channel FabricServiceChannelAccessTelemetryChannel) FabricServiceChannelAccessTelemetryChannel {
|
|
if channel.RemediationCommand == nil || channel.RemediationCommand.Action != "rebuild_route" {
|
|
return channel
|
|
}
|
|
attempts, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, ListFabricServiceChannelRouteRebuildAttemptsInput{
|
|
ClusterID: clusterID,
|
|
ReporterNodeID: channel.SelectedEntryNodeID,
|
|
RouteID: channel.PrimaryRouteID,
|
|
ServiceClass: channel.ServiceClass,
|
|
RebuildRequestID: channel.RemediationCommand.CommandID,
|
|
Limit: 1,
|
|
})
|
|
if err != nil || len(attempts) == 0 {
|
|
return channel
|
|
}
|
|
attempt := attempts[0]
|
|
switch attempt.RebuildStatus {
|
|
case "requested":
|
|
if channel.RemediationExecutionStatus == "pending_degraded_fallback" {
|
|
channel.RemediationExecutionStatus = "rebuild_request_recorded_node_pending"
|
|
channel.RemediationExecutionReason = firstNonEmptyString(channel.RemediationExecutionReason, attempt.RebuildReason, "durable_rebuild_route_request_recorded_and_node_pending")
|
|
} else {
|
|
channel.RemediationExecutionStatus = "rebuild_request_recorded"
|
|
channel.RemediationExecutionReason = firstNonEmptyString(attempt.RebuildReason, "durable_rebuild_route_request_recorded")
|
|
}
|
|
case "rejected":
|
|
channel.RemediationExecutionStatus = "rebuild_request_rejected"
|
|
channel.RemediationExecutionReason = firstNonEmptyString(attempt.RebuildReason, "durable_rebuild_route_request_rejected")
|
|
case "applied":
|
|
channel.RemediationExecutionStatus = "rebuild_request_applied"
|
|
channel.RemediationExecutionReason = firstNonEmptyString(attempt.RebuildReason, "durable_rebuild_route_request_applied")
|
|
case "no_alternate":
|
|
channel.RemediationExecutionStatus = "rebuild_request_no_alternate"
|
|
channel.RemediationExecutionReason = firstNonEmptyString(attempt.RebuildReason, "durable_rebuild_route_no_alternate")
|
|
case "deferred_by_policy":
|
|
channel.RemediationExecutionStatus = "rebuild_request_deferred_by_policy"
|
|
channel.RemediationExecutionReason = firstNonEmptyString(attempt.RebuildReason, "durable_rebuild_route_deferred_by_policy")
|
|
case "expired":
|
|
channel.RemediationExecutionStatus = "rebuild_request_expired"
|
|
channel.RemediationExecutionReason = firstNonEmptyString(attempt.RebuildReason, "durable_rebuild_route_expired")
|
|
default:
|
|
channel.RemediationExecutionStatus = firstNonEmptyString(attempt.RebuildStatus, channel.RemediationExecutionStatus)
|
|
channel.RemediationExecutionReason = firstNonEmptyString(attempt.RebuildReason, channel.RemediationExecutionReason)
|
|
}
|
|
channel.RemediationExecutionGeneration = firstNonEmptyString(attempt.Generation, channel.RemediationExecutionGeneration)
|
|
if !attempt.UpdatedAt.IsZero() {
|
|
channel.RemediationExecutionObservedAt = attempt.UpdatedAt.UTC().Format(time.RFC3339Nano)
|
|
}
|
|
return fabricServiceChannelSyncRemediationCommandExecution(channel)
|
|
}
|
|
|
|
func (s *Service) fabricServiceChannelRemediationCommandsForNode(ctx context.Context, clusterID string, nodeID string, feedback map[string]fabricServiceChannelRouteFeedback, now time.Time) ([]FabricServiceChannelAccessRemediationCommand, error) {
|
|
records, err := s.store.ListFabricServiceChannelLeases(ctx, ListFabricServiceChannelLeasesInput{
|
|
ClusterID: clusterID,
|
|
EntryNodeID: nodeID,
|
|
ServiceClass: FabricServiceClassVPNPackets,
|
|
IncludeExpired: false,
|
|
Limit: 100,
|
|
Now: now.UTC(),
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
commands := make([]FabricServiceChannelAccessRemediationCommand, 0, len(records))
|
|
for _, record := range records {
|
|
summary := fabricServiceChannelLeaseSummaryFromRecord(record, now)
|
|
if summary.Expired || strings.TrimSpace(summary.PrimaryRouteID) == "" {
|
|
continue
|
|
}
|
|
channel := FabricServiceChannelAccessTelemetryChannel{
|
|
ChannelID: summary.ChannelID,
|
|
ResourceID: summary.ResourceID,
|
|
ServiceClass: summary.ServiceClass,
|
|
Status: summary.Status,
|
|
SelectedEntryNodeID: summary.SelectedEntryNodeID,
|
|
SelectedExitNodeID: summary.SelectedExitNodeID,
|
|
PrimaryRouteID: summary.PrimaryRouteID,
|
|
PrimaryRouteStatus: summary.PrimaryRouteStatus,
|
|
ForceBackendFallback: summary.ForceBackendFallback,
|
|
ExpiresAt: summary.ExpiresAt,
|
|
}
|
|
if record.Lease.PoolPolicy != nil {
|
|
channel.PoolPolicyFingerprint = record.Lease.PoolPolicy.Fingerprint
|
|
}
|
|
if item, ok := feedback[channel.PrimaryRouteID]; ok {
|
|
observedAt := item.ObservedAt
|
|
channel.RouteFeedbackObservedAt = &observedAt
|
|
if item.Fenced {
|
|
channel.RouteFeedbackStatus = "fenced"
|
|
} else if item.ScoreAdjustment < 0 {
|
|
channel.RouteFeedbackStatus = "degraded"
|
|
} else if item.RouteID != "" {
|
|
channel.RouteFeedbackStatus = "healthy"
|
|
}
|
|
channel.RouteFeedbackScoreAdjustment = item.ScoreAdjustment
|
|
channel.RouteFeedbackEffectiveScoreAdjustment = item.ScoreAdjustment
|
|
channel.RouteFeedbackReasons = append([]string{}, item.Reasons...)
|
|
channel.RouteQualityWindowSampleCount = item.QualityWindowSampleCount
|
|
channel.RouteQualityWindowFailureCount = item.QualityWindowFailureCount
|
|
channel.RouteQualityWindowDropCount = item.QualityWindowDropCount
|
|
channel.RouteQualityWindowSlowCount = item.QualityWindowSlowCount
|
|
channel.LastSendDurationMs = item.LastSendDurationMs
|
|
}
|
|
channel = fabricServiceChannelAccessRemediation(channel, record.Lease, now)
|
|
if channel.RemediationCommand != nil {
|
|
commands = append(commands, *channel.RemediationCommand)
|
|
}
|
|
}
|
|
sort.SliceStable(commands, func(i, j int) bool {
|
|
if commands[i].Action != commands[j].Action {
|
|
return commands[i].Action < commands[j].Action
|
|
}
|
|
return commands[i].CommandID < commands[j].CommandID
|
|
})
|
|
return commands, nil
|
|
}
|
|
|
|
func (s *Service) recordFabricServiceChannelRemediationRebuildIntents(ctx context.Context, clusterID string, nodeID string, commands []FabricServiceChannelAccessRemediationCommand, now time.Time) error {
|
|
if len(commands) == 0 {
|
|
return nil
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
for _, command := range commands {
|
|
if command.Action != "rebuild_route" || strings.TrimSpace(command.CommandID) == "" || strings.TrimSpace(command.PrimaryRouteID) == "" {
|
|
continue
|
|
}
|
|
rebuildStatus := "requested"
|
|
outcome := "rebuild_requested"
|
|
if command.GuardStatus == "rejected" {
|
|
rebuildStatus = "rejected"
|
|
outcome = "policy_guard_rejected"
|
|
}
|
|
payload := mustJSONRaw(map[string]any{
|
|
"schema_version": "c18z75.service_channel_remediation_rebuild_intent.v1",
|
|
"command_id": command.CommandID,
|
|
"channel_id": command.ChannelID,
|
|
"resource_id": command.ResourceID,
|
|
"entry_node_id": command.EntryNodeID,
|
|
"exit_node_id": command.ExitNodeID,
|
|
"pool_policy_fingerprint": command.PoolPolicyFingerprint,
|
|
"guard_status": command.GuardStatus,
|
|
"guard_reason": command.GuardReason,
|
|
"command_expires_at": command.ExpiresAt.UTC().Format(time.RFC3339Nano),
|
|
"recorded_at": now.UTC().Format(time.RFC3339Nano),
|
|
})
|
|
_, err := s.store.RecordFabricServiceChannelRouteRebuildAttempt(ctx, RecordFabricServiceChannelRouteRebuildAttemptInput{
|
|
ClusterID: clusterID,
|
|
ReporterNodeID: nodeID,
|
|
ServiceClass: firstNonEmptyString(command.ServiceClass, FabricServiceClassVPNPackets),
|
|
RouteID: command.PrimaryRouteID,
|
|
ReplacementRouteID: command.ReplacementRouteID,
|
|
RebuildRequestID: command.CommandID,
|
|
RebuildStatus: rebuildStatus,
|
|
RebuildReason: firstNonEmptyString(command.Reason, command.GuardReason, "service_channel_remediation_rebuild_route_requested"),
|
|
DecisionSource: "service_channel_remediation_command",
|
|
Outcome: outcome,
|
|
Generation: command.ExecutionGeneration,
|
|
PolicyFingerprint: command.PoolPolicyFingerprint,
|
|
ObservedPolicyFingerprint: command.PoolPolicyFingerprint,
|
|
FeedbackReasons: []string{firstNonEmptyString(command.Reason, command.GuardReason, "service_channel_remediation_rebuild_route_requested")},
|
|
OldHops: []string{},
|
|
ReplacementHops: []string{},
|
|
Payload: payload,
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s *Service) resolveFabricServiceChannelRemediationRebuildIntents(ctx context.Context, input GetNodeSyntheticMeshConfigInput, commands []FabricServiceChannelAccessRemediationCommand, intents []MeshRouteIntent, feedback map[string]fabricServiceChannelRouteFeedback, generation string, now time.Time) ([]RoutePathDecision, error) {
|
|
if len(commands) == 0 {
|
|
return nil, nil
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
decisions := []RoutePathDecision{}
|
|
for _, command := range commands {
|
|
if command.Action != "rebuild_route" || strings.TrimSpace(command.CommandID) == "" || strings.TrimSpace(command.PrimaryRouteID) == "" {
|
|
continue
|
|
}
|
|
lease, leaseOK, err := s.fabricServiceChannelLeaseForRemediationCommand(ctx, input.ClusterID, input.NodeID, command, now)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
status := "no_alternate"
|
|
outcome := "no_alternate"
|
|
reason := "no_unfenced_alternate_route"
|
|
var primary SyntheticMeshRouteConfig
|
|
var replacement SyntheticMeshRouteConfig
|
|
if command.GuardStatus == "rejected" {
|
|
status = "deferred_by_policy"
|
|
outcome = "deferred_by_policy"
|
|
reason = firstNonEmptyString(command.GuardReason, "remediation_guard_rejected")
|
|
} else if !command.ExpiresAt.IsZero() && !command.ExpiresAt.After(now.UTC()) {
|
|
status = "expired"
|
|
outcome = "expired"
|
|
reason = "remediation_command_ttl_expired"
|
|
} else if !leaseOK {
|
|
status = "deferred_by_policy"
|
|
outcome = "deferred_by_policy"
|
|
reason = "active_lease_not_found_for_rebuild_resolution"
|
|
} else {
|
|
var ok bool
|
|
primary, ok = s.syntheticRouteByID(input, intents, command.PrimaryRouteID)
|
|
if !ok {
|
|
reason = "primary_route_not_available_for_rebuild"
|
|
} else if selected, _, ok := s.selectServiceChannelRouteReplacement(input, primary, intents, feedback); ok {
|
|
if guardStatus, guardReason := fabricServiceChannelRouteAllowedByLeasePool(lease, FabricServiceChannelRoute{
|
|
RouteID: selected.RouteID,
|
|
ClusterID: selected.ClusterID,
|
|
ServiceClass: firstNonEmptyString(command.ServiceClass, FabricServiceClassVPNPackets),
|
|
SourceNodeID: selected.SourceNodeID,
|
|
DestinationNodeID: selected.DestinationNodeID,
|
|
Status: "authorized",
|
|
}); guardStatus != "allowed" {
|
|
status = "deferred_by_policy"
|
|
outcome = "deferred_by_policy"
|
|
reason = guardReason
|
|
} else {
|
|
replacement = selected
|
|
status = "applied"
|
|
outcome = "replacement_selected"
|
|
reason = "remediation_rebuild_applied_to_alternate"
|
|
}
|
|
}
|
|
}
|
|
feedbackItem := feedback[command.PrimaryRouteID]
|
|
feedbackStatus := ""
|
|
if feedbackItem.Fenced {
|
|
feedbackStatus = "fenced"
|
|
} else if feedbackItem.ScoreAdjustment < 0 {
|
|
feedbackStatus = "degraded"
|
|
} else if feedbackItem.RouteID != "" {
|
|
feedbackStatus = "healthy"
|
|
}
|
|
payload := mustJSONRaw(map[string]any{
|
|
"schema_version": "c18z77.service_channel_remediation_rebuild_resolution.v1",
|
|
"command_id": command.CommandID,
|
|
"channel_id": command.ChannelID,
|
|
"resource_id": command.ResourceID,
|
|
"entry_node_id": command.EntryNodeID,
|
|
"exit_node_id": command.ExitNodeID,
|
|
"pool_policy_fingerprint": command.PoolPolicyFingerprint,
|
|
"guard_status": command.GuardStatus,
|
|
"guard_reason": command.GuardReason,
|
|
"resolution_status": status,
|
|
"resolution_outcome": outcome,
|
|
"resolution_reason": reason,
|
|
"resolved_at": now.UTC().Format(time.RFC3339Nano),
|
|
})
|
|
_, err = s.store.RecordFabricServiceChannelRouteRebuildAttempt(ctx, RecordFabricServiceChannelRouteRebuildAttemptInput{
|
|
ClusterID: input.ClusterID,
|
|
ReporterNodeID: input.NodeID,
|
|
ServiceClass: firstNonEmptyString(command.ServiceClass, FabricServiceClassVPNPackets),
|
|
RouteID: command.PrimaryRouteID,
|
|
ReplacementRouteID: replacement.RouteID,
|
|
RebuildRequestID: command.CommandID,
|
|
RebuildStatus: status,
|
|
RebuildReason: reason,
|
|
DecisionSource: "service_channel_remediation_command",
|
|
Outcome: outcome,
|
|
Generation: firstNonEmptyString(generation, command.ExecutionGeneration, command.CommandID),
|
|
PolicyFingerprint: command.PoolPolicyFingerprint,
|
|
ObservedPolicyFingerprint: command.PoolPolicyFingerprint,
|
|
FeedbackStatus: feedbackStatus,
|
|
FeedbackScoreAdjustment: feedbackItem.ScoreAdjustment,
|
|
FeedbackEffectiveScoreAdjustment: feedbackItem.ScoreAdjustment,
|
|
FeedbackReasons: append([]string{reason}, feedbackItem.Reasons...),
|
|
LastError: feedbackItem.LastError,
|
|
ConsecutiveFailures: feedbackItem.ConsecutiveFailures,
|
|
StallCount: feedbackItem.StallCount,
|
|
LastSendDurationMs: feedbackItem.LastSendDurationMs,
|
|
QualityWindowSampleCount: feedbackItem.QualityWindowSampleCount,
|
|
QualityWindowFailureCount: feedbackItem.QualityWindowFailureCount,
|
|
QualityWindowDropCount: feedbackItem.QualityWindowDropCount,
|
|
QualityWindowSlowCount: feedbackItem.QualityWindowSlowCount,
|
|
OldHops: append([]string{}, primary.Hops...),
|
|
ReplacementHops: append([]string{}, replacement.Hops...),
|
|
Payload: payload,
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if status != "applied" {
|
|
continue
|
|
}
|
|
decision := RoutePathDecision{
|
|
DecisionID: command.PrimaryRouteID + "-path-" + input.NodeID + "-service-channel-remediation",
|
|
RouteID: command.PrimaryRouteID,
|
|
ReplacementRouteID: replacement.RouteID,
|
|
RebuildRequestID: command.CommandID,
|
|
RebuildStatus: "applied",
|
|
RebuildReason: reason,
|
|
ClusterID: input.ClusterID,
|
|
LocalNodeID: input.NodeID,
|
|
SourceNodeID: primary.SourceNodeID,
|
|
DestinationNodeID: primary.DestinationNodeID,
|
|
OriginalHops: append([]string{}, primary.Hops...),
|
|
EffectiveHops: append([]string{}, replacement.Hops...),
|
|
DecisionSource: "service_channel_remediation_command",
|
|
Generation: firstNonEmptyString(generation, command.CommandID),
|
|
PathScore: serviceChannelReplacementRouteScore(replacement),
|
|
ScoreReasons: []string{"service_channel_remediation_rebuild_route", "selected_unfenced_alternate_route", "service_channel_rebuild_applied"},
|
|
ControlPlaneOnly: true,
|
|
ProductionForwarding: false,
|
|
ExpiresAt: minNonZeroTime(primary.ExpiresAt, replacement.ExpiresAt, command.ExpiresAt, now.Add(60*time.Second)).UTC(),
|
|
}
|
|
decision.PreviousHopID, decision.NextHopID, decision.LocalRole = routePathLocalPosition(decision.EffectiveHops, input.NodeID, "", "")
|
|
decisions = append(decisions, decision)
|
|
}
|
|
return decisions, nil
|
|
}
|
|
|
|
func (s *Service) fabricServiceChannelLeaseForRemediationCommand(ctx context.Context, clusterID string, nodeID string, command FabricServiceChannelAccessRemediationCommand, now time.Time) (FabricServiceChannelLease, bool, error) {
|
|
records, err := s.store.ListFabricServiceChannelLeases(ctx, ListFabricServiceChannelLeasesInput{
|
|
ClusterID: clusterID,
|
|
ServiceClass: firstNonEmptyString(command.ServiceClass, FabricServiceClassVPNPackets),
|
|
EntryNodeID: nodeID,
|
|
ResourceID: command.ResourceID,
|
|
IncludeExpired: false,
|
|
Limit: 100,
|
|
Now: now.UTC(),
|
|
})
|
|
if err != nil {
|
|
return FabricServiceChannelLease{}, false, err
|
|
}
|
|
for _, record := range records {
|
|
if strings.TrimSpace(record.ChannelID) == strings.TrimSpace(command.ChannelID) {
|
|
return record.Lease, true, nil
|
|
}
|
|
}
|
|
return FabricServiceChannelLease{}, false, nil
|
|
}
|
|
|
|
func (s *Service) syntheticRouteByID(input GetNodeSyntheticMeshConfigInput, intents []MeshRouteIntent, routeID string) (SyntheticMeshRouteConfig, bool) {
|
|
routeID = strings.TrimSpace(routeID)
|
|
if routeID == "" {
|
|
return SyntheticMeshRouteConfig{}, false
|
|
}
|
|
for _, intent := range intents {
|
|
route, _, _, _, _, ok := s.syntheticRouteFromIntent(input, intent)
|
|
if ok && route.RouteID == routeID {
|
|
return route, true
|
|
}
|
|
}
|
|
return SyntheticMeshRouteConfig{}, false
|
|
}
|
|
|
|
// minNonZeroTime returns the earliest of the given instants, ignoring zero
// values. It returns the zero time when no non-zero instant is supplied.
func minNonZeroTime(items ...time.Time) time.Time {
	var earliest time.Time
	for i := range items {
		candidate := items[i]
		switch {
		case candidate.IsZero():
			// Zero instants never participate in the comparison.
		case earliest.IsZero(), candidate.Before(earliest):
			earliest = candidate
		}
	}
	return earliest
}
|
|
|
|
func fabricServiceChannelFirstAuthorizedAlternate(routes []FabricServiceChannelRoute, primaryRouteID string) (FabricServiceChannelRoute, bool) {
|
|
for _, route := range routes {
|
|
if strings.TrimSpace(route.RouteID) == "" || route.RouteID == primaryRouteID {
|
|
continue
|
|
}
|
|
if route.Status == "authorized" {
|
|
return route, true
|
|
}
|
|
}
|
|
return FabricServiceChannelRoute{}, false
|
|
}
|
|
|
|
func fabricServiceChannelRouteAllowedByLeasePool(lease FabricServiceChannelLease, route FabricServiceChannelRoute) (string, string) {
|
|
if strings.TrimSpace(route.RouteID) == "" {
|
|
return "rejected", "replacement_route_missing"
|
|
}
|
|
entryAllowed := len(lease.EntryPool) == 0
|
|
for _, candidate := range lease.EntryPool {
|
|
if candidate.NodeID == route.SourceNodeID {
|
|
entryAllowed = true
|
|
break
|
|
}
|
|
}
|
|
if !entryAllowed {
|
|
return "rejected", "replacement_entry_outside_signed_pool_policy"
|
|
}
|
|
exitAllowed := len(lease.ExitPool) == 0
|
|
for _, candidate := range lease.ExitPool {
|
|
if candidate.NodeID == route.DestinationNodeID {
|
|
exitAllowed = true
|
|
break
|
|
}
|
|
}
|
|
if !exitAllowed {
|
|
return "rejected", "replacement_exit_outside_signed_pool_policy"
|
|
}
|
|
return "allowed", "lease_pool_policy_allows_route"
|
|
}
|
|
|
|
func fabricServiceChannelLeaseSummaryFromRecord(record FabricServiceChannelLeaseRecord, now time.Time) FabricServiceChannelLeaseSummary {
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
lease := record.Lease
|
|
summary := FabricServiceChannelLeaseSummary{
|
|
ClusterID: record.ClusterID,
|
|
ChannelID: record.ChannelID,
|
|
ResourceID: firstNonEmptyString(record.ResourceID, lease.ResourceID),
|
|
ServiceClass: firstNonEmptyString(record.ServiceClass, lease.ServiceClass),
|
|
Status: lease.Status,
|
|
SelectedEntryNodeID: firstNonEmptyString(record.SelectedEntryNodeID, lease.SelectedEntryNodeID),
|
|
SelectedExitNodeID: lease.SelectedExitNodeID,
|
|
AllowedChannels: append([]string{}, lease.AllowedChannels...),
|
|
PrimaryRouteID: strings.TrimSpace(lease.PrimaryRoute.RouteID),
|
|
PrimaryRouteStatus: strings.TrimSpace(lease.PrimaryRoute.Status),
|
|
DataPlane: lease.DataPlane,
|
|
ForceBackendFallback: lease.Status == FabricServiceChannelStatusDegradedFallback || lease.PrimaryRoute.Status == "missing_route_intent",
|
|
IssuedAt: lease.IssuedAt,
|
|
ExpiresAt: record.ExpiresAt,
|
|
CreatedAt: record.CreatedAt,
|
|
UpdatedAt: record.UpdatedAt,
|
|
}
|
|
if summary.ExpiresAt.IsZero() {
|
|
summary.ExpiresAt = lease.ExpiresAt
|
|
}
|
|
summary.Expired = !summary.ExpiresAt.IsZero() && !summary.ExpiresAt.After(now.UTC())
|
|
return summary
|
|
}
|
|
|
|
// fabricServiceChannelLeaseCacheKey builds the "<clusterID>/<channelID>" key,
// trimming surrounding whitespace from both parts.
func fabricServiceChannelLeaseCacheKey(clusterID string, channelID string) string {
	parts := []string{strings.TrimSpace(clusterID), strings.TrimSpace(channelID)}
	return strings.Join(parts, "/")
}
|
|
|
|
func (s *Service) signFabricServiceChannelLease(ctx context.Context, lease FabricServiceChannelLease) (FabricServiceChannelLease, error) {
|
|
authorityKey, err := s.ensureClusterAuthority(ctx, lease.ClusterID, nil)
|
|
if err != nil {
|
|
return lease, err
|
|
}
|
|
payload := FabricServiceChannelLeaseAuthorityPayload{
|
|
SchemaVersion: "rap.fabric_service_channel_lease_authority.v1",
|
|
ChannelID: lease.ChannelID,
|
|
ClusterID: lease.ClusterID,
|
|
OrganizationID: lease.OrganizationID,
|
|
UserID: lease.UserID,
|
|
ResourceID: lease.ResourceID,
|
|
ServiceClass: lease.ServiceClass,
|
|
Status: lease.Status,
|
|
SelectedEntryNodeID: lease.SelectedEntryNodeID,
|
|
SelectedExitNodeID: lease.SelectedExitNodeID,
|
|
EntryPool: append([]FabricServiceChannelNodeCandidate{}, lease.EntryPool...),
|
|
ExitPool: append([]FabricServiceChannelNodeCandidate{}, lease.ExitPool...),
|
|
AllowedChannels: append([]string{}, lease.AllowedChannels...),
|
|
PrimaryRoute: lease.PrimaryRoute,
|
|
RecoveryPolicy: lease.RecoveryPolicy,
|
|
PoolPolicy: lease.PoolPolicy,
|
|
DataPlane: lease.DataPlane,
|
|
RouteGeneration: lease.RouteGeneration,
|
|
FencingEpoch: lease.FencingEpoch,
|
|
TokenHash: fabricServiceChannelTokenHash(lease.Token.Token),
|
|
IssuedAt: lease.IssuedAt,
|
|
ExpiresAt: lease.ExpiresAt,
|
|
}
|
|
rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, s.now())
|
|
if err != nil {
|
|
return lease, err
|
|
}
|
|
lease.AuthorityPayload = rawPayload
|
|
lease.AuthoritySignature = &signature
|
|
return lease, nil
|
|
}
|
|
|
|
// fabricServiceChannelTokenHash returns the lowercase hex SHA-256 digest of the
// token after trimming surrounding whitespace.
func fabricServiceChannelTokenHash(token string) string {
	hasher := sha256.New()
	// hash.Hash.Write is documented to never return an error.
	hasher.Write([]byte(strings.TrimSpace(token)))
	return hex.EncodeToString(hasher.Sum(nil))
}
|
|
|
|
// normalizeFabricServiceClass canonicalises a service class name: surrounding
// whitespace is removed and the result is lower-cased.
func normalizeFabricServiceClass(value string) string {
	trimmed := strings.TrimSpace(value)
	return strings.ToLower(trimmed)
}
|
|
|
|
func isAllowedFabricServiceClass(value string) bool {
|
|
switch value {
|
|
case FabricServiceClassVPNPackets,
|
|
FabricServiceClassRemoteWorkspace,
|
|
FabricServiceClassFileTransfer,
|
|
FabricServiceClassVideo:
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
func normalizeFabricServiceChannels(channels []string, serviceClass string) []string {
|
|
channels = dedupeStrings(channels)
|
|
if len(channels) > 0 {
|
|
return channels
|
|
}
|
|
switch serviceClass {
|
|
case FabricServiceClassVPNPackets:
|
|
return []string{FabricChannelControl, FabricChannelBulk, "vpn_packet"}
|
|
case FabricServiceClassRemoteWorkspace:
|
|
return []string{FabricChannelControl, FabricChannelInteractive, FabricChannelReliable, FabricChannelDroppable}
|
|
case FabricServiceClassVideo:
|
|
return []string{FabricChannelControl, FabricChannelInteractive, FabricChannelDroppable}
|
|
case FabricServiceClassFileTransfer:
|
|
return []string{FabricChannelControl, FabricChannelReliable, FabricChannelBulk}
|
|
default:
|
|
return []string{FabricChannelControl, FabricChannelReliable}
|
|
}
|
|
}
|
|
|
|
func normalizeFabricRequiredRoles(roles []string, serviceClass string) []string {
|
|
roles = dedupeStrings(roles)
|
|
if len(roles) > 0 {
|
|
return roles
|
|
}
|
|
switch serviceClass {
|
|
case FabricServiceClassVPNPackets:
|
|
return []string{"entry-node", "vpn-exit"}
|
|
case FabricServiceClassRemoteWorkspace:
|
|
return []string{"entry-node", "rdp-worker"}
|
|
case FabricServiceClassVideo:
|
|
return []string{"entry-node", "video-relay"}
|
|
case FabricServiceClassFileTransfer:
|
|
return []string{"entry-node", "file-storage-cache"}
|
|
default:
|
|
return []string{"entry-node"}
|
|
}
|
|
}
|
|
|
|
func selectFabricServiceChannelPreferredNode(nodeIDs []string, preferred string) string {
|
|
preferred = strings.TrimSpace(preferred)
|
|
if preferred != "" && containsString(nodeIDs, preferred) {
|
|
return preferred
|
|
}
|
|
if len(nodeIDs) == 0 {
|
|
return ""
|
|
}
|
|
return strings.TrimSpace(nodeIDs[0])
|
|
}
|
|
|
|
func fabricServiceChannelEffectivePool(requested []string, policy []string) []string {
|
|
requested = dedupeStrings(requested)
|
|
policy = dedupeStrings(policy)
|
|
if len(policy) == 0 {
|
|
return requested
|
|
}
|
|
if len(requested) == 0 {
|
|
return policy
|
|
}
|
|
out := []string{}
|
|
for _, nodeID := range requested {
|
|
if containsString(policy, nodeID) {
|
|
out = append(out, nodeID)
|
|
}
|
|
}
|
|
return dedupeStrings(out)
|
|
}
|
|
|
|
func fabricServiceFailoverFromPoolPolicy(policy FabricServiceChannelPoolPolicy) string {
|
|
policy = normalizeFabricServiceChannelPoolPolicy(policy, defaultFabricServiceChannelPoolPolicy())
|
|
raw, err := json.Marshal(map[string]any{
|
|
"route_rebuild": policy.RouteRebuild,
|
|
"entry_failover": policy.EntryFailover,
|
|
"exit_failover": policy.ExitFailover,
|
|
"sticky_session": policy.StickySession,
|
|
"backend_fallback_allowed": policy.BackendFallbackAllowed,
|
|
"selection_strategy": policy.SelectionStrategy,
|
|
"pool_policy_fingerprint": policy.Fingerprint,
|
|
})
|
|
if err != nil {
|
|
return defaultFabricServiceFailover()
|
|
}
|
|
return string(raw)
|
|
}
|
|
|
|
func fabricServiceChannelNodePool(nodeIDs []string, role string, selected string) []FabricServiceChannelNodeCandidate {
|
|
out := make([]FabricServiceChannelNodeCandidate, 0, len(nodeIDs))
|
|
for index, nodeID := range nodeIDs {
|
|
status := "candidate"
|
|
if nodeID == selected {
|
|
status = "selected"
|
|
}
|
|
out = append(out, FabricServiceChannelNodeCandidate{
|
|
NodeID: nodeID,
|
|
Role: role,
|
|
Priority: index + 1,
|
|
Status: status,
|
|
Metadata: json.RawMessage(`{}`),
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
// fabricServiceChannelRouteFeedback is the in-memory, per-route view of route
// feedback used when scoring, fencing and rebuilding service channel routes.
// Field order is kept stable; fields of the same type that sit next to each
// other are declared together.
type fabricServiceChannelRouteFeedback struct {
	// Identity of the route and of the observation the feedback came from.
	RouteID, ObservationID, Source, ChannelID, ResourceID string
	// Data-plane violation carried by the observation, if any.
	ViolationStatus, ViolationReason string

	// Fenced marks the route as fenced; the remaining flags qualify how
	// trustworthy or current the feedback is.
	Fenced, ManualRetry, StalePolicy, StaleGeneration, ProvenanceMissing bool
	StaleReason                                                          string

	// ScoreAdjustment is applied to the route score; negative values are
	// reported as "degraded" by the callers in this file.
	ScoreAdjustment int
	Reasons         []string

	// Most recent runtime failure details.
	LastError           string
	ConsecutiveFailures int
	StallCount          int
	LastSendDurationMs  int64

	// Recommendations derived from the feedback reasons.
	DegradedFallbackRecommended bool
	RouteRebuildRecommended     bool

	// Rolling quality window counters.
	QualityWindowSampleCount  int
	QualityWindowSuccessCount int
	QualityWindowFailureCount int
	QualityWindowSlowCount    int
	QualityWindowDropCount    int

	// Observation lifetime.
	ObservedAt, ExpiresAt time.Time
	RetryCooldownUntil    *time.Time
}
|
|
|
|
// fabricServiceChannelRouteProvenance records which route, policy and
// generation versions a route currently carries, keyed by route ID by its
// producers in this file.
type fabricServiceChannelRouteProvenance struct {
	RouteID         string // ID of the route intent the provenance describes.
	RouteVersion    string // Version of the route itself.
	PolicyVersion   string // Version of the policy attached to the route.
	RouteGeneration string // Generation label of the route.
}
|
|
|
|
func fabricServiceChannelRouteProvenanceFromIntents(intents []MeshRouteIntent) map[string]fabricServiceChannelRouteProvenance {
|
|
out := map[string]fabricServiceChannelRouteProvenance{}
|
|
for _, intent := range intents {
|
|
if strings.TrimSpace(intent.ID) == "" {
|
|
continue
|
|
}
|
|
var policy syntheticRoutePolicy
|
|
_ = json.Unmarshal(intent.Policy, &policy)
|
|
routeVersion := strings.TrimSpace(policy.RouteVersion)
|
|
if routeVersion == "" {
|
|
routeVersion = intent.UpdatedAt.UTC().Format(time.RFC3339)
|
|
}
|
|
policyVersion := strings.TrimSpace(policy.PolicyVersion)
|
|
if policyVersion == "" {
|
|
policyVersion = routeVersion
|
|
}
|
|
out[intent.ID] = fabricServiceChannelRouteProvenance{
|
|
RouteID: intent.ID,
|
|
RouteVersion: routeVersion,
|
|
PolicyVersion: policyVersion,
|
|
RouteGeneration: policyVersion,
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func (s *Service) fabricServiceChannelRouteFeedback(ctx context.Context, clusterID string, entryNodeIDs []string, now time.Time, policy FabricServiceChannelRecoveryPolicy, routeProvenance map[string]fabricServiceChannelRouteProvenance) (map[string]fabricServiceChannelRouteFeedback, error) {
|
|
out := map[string]fabricServiceChannelRouteFeedback{}
|
|
policy = normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy())
|
|
for _, nodeID := range dedupeStrings(entryNodeIDs) {
|
|
if strings.TrimSpace(nodeID) == "" {
|
|
continue
|
|
}
|
|
observations, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, ListFabricServiceChannelRouteFeedbackInput{
|
|
ClusterID: clusterID,
|
|
ReporterNodeID: nodeID,
|
|
ServiceClass: FabricServiceClassVPNPackets,
|
|
Now: now,
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
mergeFabricServiceChannelRouteFeedback(out, fabricServiceChannelRouteFeedbackFromObservationsWithProvenance(observations, now, policy, routeProvenance))
|
|
expiredObservations, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, ListFabricServiceChannelRouteFeedbackInput{
|
|
ClusterID: clusterID,
|
|
ReporterNodeID: nodeID,
|
|
ServiceClass: FabricServiceClassVPNPackets,
|
|
IncludeExpired: true,
|
|
Now: now,
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
mergeFabricServiceChannelRouteFeedback(out, fabricServiceChannelManualRetryFeedbackFromObservationsWithProvenance(expiredObservations, now, policy, routeProvenance))
|
|
if len(observations) > 0 {
|
|
continue
|
|
}
|
|
heartbeats, err := s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, 1)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if len(heartbeats) == 0 || now.Sub(heartbeats[0].ObservedAt.UTC()) > fabricServiceChannelFeedbackMaxAge {
|
|
continue
|
|
}
|
|
mergeFabricServiceChannelRouteFeedback(out, fabricServiceChannelRouteFeedbackFromHeartbeatWithProvenance(heartbeats[0], now, policy, routeProvenance))
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
func (s *Service) fabricServiceChannelRecoveryPolicy(ctx context.Context, clusterID string) FabricServiceChannelRecoveryPolicy {
|
|
cluster, err := s.store.GetCluster(ctx, strings.TrimSpace(clusterID))
|
|
if err != nil {
|
|
return defaultFabricServiceChannelRecoveryPolicy()
|
|
}
|
|
return fabricServiceChannelRecoveryPolicyFromCluster(cluster)
|
|
}
|
|
|
|
func (s *Service) recordFabricServiceChannelRouteFeedback(ctx context.Context, heartbeat NodeHeartbeat) error {
|
|
if strings.TrimSpace(heartbeat.ClusterID) == "" || strings.TrimSpace(heartbeat.NodeID) == "" {
|
|
return nil
|
|
}
|
|
observedAt := heartbeat.ObservedAt.UTC()
|
|
if observedAt.IsZero() {
|
|
observedAt = s.now().UTC()
|
|
}
|
|
expiresAt := observedAt.Add(fabricServiceChannelFeedbackMaxAge)
|
|
for _, input := range fabricServiceChannelRouteFeedbackInputsFromHeartbeat(heartbeat, FabricServiceClassVPNPackets, expiresAt) {
|
|
if _, err := s.store.RecordFabricServiceChannelRouteFeedback(ctx, input); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
for _, input := range s.fabricServiceChannelRouteFeedbackInputsFromAccessReport(ctx, heartbeat, FabricServiceClassVPNPackets, expiresAt) {
|
|
if _, err := s.store.RecordFabricServiceChannelRouteFeedback(ctx, input); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s *Service) fabricServiceChannelRouteFeedbackInputsFromAccessReport(ctx context.Context, heartbeat NodeHeartbeat, serviceClass string, expiresAt time.Time) []RecordFabricServiceChannelRouteFeedbackInput {
|
|
if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) {
|
|
return nil
|
|
}
|
|
report := jsonMapPath(jsonObject(heartbeat.Metadata), "fabric_service_channel_access_report")
|
|
if len(report) == 0 {
|
|
return nil
|
|
}
|
|
if jsonInt(report, "fabric_route_send_failure") <= 0 {
|
|
return nil
|
|
}
|
|
status := jsonString(report, "last_data_plane_violation_status")
|
|
if status != "fabric_route_send_failed_backend_fallback_blocked" {
|
|
return nil
|
|
}
|
|
observedAt := heartbeat.ObservedAt.UTC()
|
|
if observedAt.IsZero() {
|
|
observedAt = time.Now().UTC()
|
|
}
|
|
records, err := s.store.ListFabricServiceChannelLeases(ctx, ListFabricServiceChannelLeasesInput{
|
|
ClusterID: heartbeat.ClusterID,
|
|
EntryNodeID: heartbeat.NodeID,
|
|
ServiceClass: serviceClass,
|
|
IncludeExpired: false,
|
|
Limit: 100,
|
|
Now: observedAt,
|
|
})
|
|
if err != nil || len(records) == 0 {
|
|
return nil
|
|
}
|
|
reason := firstNonEmptyString(jsonString(report, "last_data_plane_violation_reason"), "fabric_route_send_failed_backend_fallback_blocked")
|
|
out := make([]RecordFabricServiceChannelRouteFeedbackInput, 0, len(records))
|
|
for _, record := range records {
|
|
summary := fabricServiceChannelLeaseSummaryFromRecord(record, observedAt)
|
|
routeID := strings.TrimSpace(summary.PrimaryRouteID)
|
|
if summary.Expired || routeID == "" || summary.ForceBackendFallback {
|
|
continue
|
|
}
|
|
if s.fabricServiceChannelHasActiveAccessReportRouteFeedback(ctx, heartbeat.ClusterID, heartbeat.NodeID, routeID, serviceClass, observedAt) {
|
|
continue
|
|
}
|
|
out = append(out, RecordFabricServiceChannelRouteFeedbackInput{
|
|
ClusterID: heartbeat.ClusterID,
|
|
ReporterNodeID: heartbeat.NodeID,
|
|
RouteID: routeID,
|
|
ServiceClass: serviceClass,
|
|
FeedbackStatus: "fenced",
|
|
ScoreAdjustment: -1030,
|
|
Reasons: []string{"service_channel_route_rebuild_recommended", "data_plane_fabric_route_send_failed", "backend_fallback_blocked_by_policy"},
|
|
LastError: reason,
|
|
ConsecutiveFailures: maxInt(1, jsonInt(report, "fabric_route_send_failure")),
|
|
Payload: mustJSONRaw(map[string]any{
|
|
"source": "fabric_service_channel_access_report",
|
|
"channel_id": summary.ChannelID,
|
|
"resource_id": summary.ResourceID,
|
|
"last_data_plane_violation_status": status,
|
|
"last_data_plane_violation_reason": reason,
|
|
"backend_fallback_blocked": jsonInt(report, "backend_fallback_blocked"),
|
|
"fabric_route_send_failure": jsonInt(report, "fabric_route_send_failure"),
|
|
"last_backend_relay_policy": jsonString(report, "last_backend_relay_policy"),
|
|
"last_working_data_transport": jsonString(report, "last_working_data_transport"),
|
|
"last_steady_state_transport": jsonString(report, "last_steady_state_transport"),
|
|
}),
|
|
ObservedAt: observedAt,
|
|
ExpiresAt: expiresAt,
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func (s *Service) fabricServiceChannelHasActiveAccessReportRouteFeedback(ctx context.Context, clusterID, reporterNodeID, routeID, serviceClass string, observedAt time.Time) bool {
|
|
observations, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, ListFabricServiceChannelRouteFeedbackInput{
|
|
ClusterID: clusterID,
|
|
ReporterNodeID: reporterNodeID,
|
|
RouteID: routeID,
|
|
ServiceClass: serviceClass,
|
|
IncludeExpired: false,
|
|
Now: observedAt,
|
|
})
|
|
if err != nil {
|
|
return false
|
|
}
|
|
for _, observation := range observations {
|
|
if observation.FeedbackStatus != "fenced" && observation.FeedbackStatus != "degraded" {
|
|
continue
|
|
}
|
|
if containsString(observation.Reasons, "data_plane_fabric_route_send_failed") ||
|
|
jsonString(jsonObject(observation.Payload), "source") == "fabric_service_channel_access_report" {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
type fabricServiceChannelRuntimeHeartbeat struct {
|
|
SchemaVersion string `json:"schema_version"`
|
|
ConfigVersion string `json:"config_version"`
|
|
Ingress struct {
|
|
FlowScheduler struct {
|
|
ChannelStats map[string]fabricServiceChannelRuntimeChannelStat `json:"channel_stats"`
|
|
} `json:"flow_scheduler"`
|
|
} `json:"ingress"`
|
|
}
|
|
|
|
// fabricServiceChannelRuntimeChannelStat is the JSON shape of one per-channel
// statistics entry inside a runtime heartbeat report.
type fabricServiceChannelRuntimeChannelStat struct {
	// Route most recently used by the channel, with its provenance.
	LastRouteID               string `json:"last_route_id"`
	RoutePolicyVersion        string `json:"route_policy_version,omitempty"`
	RouteGeneration           string `json:"route_generation,omitempty"`
	RecoveryPolicyFingerprint string `json:"recovery_policy_fingerprint,omitempty"`

	// Route that most recently failed, with its provenance.
	LastFailedRouteID            string `json:"last_failed_route_id"`
	LastFailedRoutePolicyVersion string `json:"last_failed_route_policy_version,omitempty"`
	LastFailedRouteGeneration    string `json:"last_failed_route_generation,omitempty"`

	// Failure and latency counters.
	LastError              string `json:"last_error"`
	ConsecutiveFailures    int    `json:"consecutive_failures"`
	StallCount             int    `json:"stall_count"`
	LastSendDurationMillis int64  `json:"last_send_duration_ms"`

	// Recommendations reported by the node runtime.
	RouteRebuildRecommended     bool `json:"route_rebuild_recommended"`
	DegradedFallbackRecommended bool `json:"degraded_fallback_recommended"`

	// Rolling quality window.
	QualityWindowSampleCount   int    `json:"quality_window_sample_count"`
	QualityWindowSuccessCount  int    `json:"quality_window_success_count"`
	QualityWindowFailureCount  int    `json:"quality_window_failure_count"`
	QualityWindowSlowCount     int    `json:"quality_window_slow_count"`
	QualityWindowDropCount     int    `json:"quality_window_drop_count"`
	QualityWindowAvgLatencyMs  int64  `json:"quality_window_avg_latency_ms"`
	QualityWindowLastUpdatedAt string `json:"quality_window_last_updated_at"`
}
|
|
|
|
func fabricServiceChannelRouteFeedbackFromHeartbeat(heartbeat NodeHeartbeat, now time.Time) map[string]fabricServiceChannelRouteFeedback {
|
|
return fabricServiceChannelRouteFeedbackFromHeartbeatWithProvenance(heartbeat, now, defaultFabricServiceChannelRecoveryPolicy(), nil)
|
|
}
|
|
|
|
func fabricServiceChannelRouteFeedbackFromHeartbeatWithProvenance(heartbeat NodeHeartbeat, now time.Time, policy FabricServiceChannelRecoveryPolicy, routeProvenance map[string]fabricServiceChannelRouteProvenance) map[string]fabricServiceChannelRouteFeedback {
|
|
out := map[string]fabricServiceChannelRouteFeedback{}
|
|
for _, input := range fabricServiceChannelRouteFeedbackInputsFromHeartbeat(heartbeat, FabricServiceClassVPNPackets, now.Add(fabricServiceChannelFeedbackMaxAge)) {
|
|
observation := fabricServiceChannelAnnotateFeedbackProvenance(FabricServiceChannelRouteFeedbackObservation{
|
|
ClusterID: input.ClusterID,
|
|
ReporterNodeID: input.ReporterNodeID,
|
|
RouteID: input.RouteID,
|
|
ServiceClass: input.ServiceClass,
|
|
FeedbackStatus: input.FeedbackStatus,
|
|
ScoreAdjustment: input.ScoreAdjustment,
|
|
Reasons: append([]string{}, input.Reasons...),
|
|
LastError: input.LastError,
|
|
ConsecutiveFailures: input.ConsecutiveFailures,
|
|
StallCount: input.StallCount,
|
|
LastSendDurationMs: input.LastSendDurationMs,
|
|
Payload: input.Payload,
|
|
ObservedAt: input.ObservedAt,
|
|
ExpiresAt: input.ExpiresAt,
|
|
}, policy, routeProvenance)
|
|
scoreAdjustment := input.ScoreAdjustment
|
|
fenced := input.FeedbackStatus == "fenced"
|
|
routeRebuildRecommended := containsString(input.Reasons, "service_channel_route_rebuild_recommended")
|
|
degradedFallbackRecommended := containsString(input.Reasons, "service_channel_degraded_fallback_recommended")
|
|
if observation.StalePolicy || observation.StaleGeneration {
|
|
scoreAdjustment = fabricServiceChannelConservativeStaleScore(scoreAdjustment)
|
|
fenced = false
|
|
routeRebuildRecommended = false
|
|
degradedFallbackRecommended = false
|
|
}
|
|
item := fabricServiceChannelRouteFeedback{
|
|
RouteID: input.RouteID,
|
|
Fenced: fenced,
|
|
StalePolicy: observation.StalePolicy,
|
|
StaleGeneration: observation.StaleGeneration,
|
|
ProvenanceMissing: observation.ProvenanceMissing,
|
|
StaleReason: observation.StaleReason,
|
|
ScoreAdjustment: scoreAdjustment,
|
|
Reasons: observation.Reasons,
|
|
LastError: input.LastError,
|
|
ConsecutiveFailures: input.ConsecutiveFailures,
|
|
StallCount: input.StallCount,
|
|
LastSendDurationMs: input.LastSendDurationMs,
|
|
DegradedFallbackRecommended: degradedFallbackRecommended,
|
|
RouteRebuildRecommended: routeRebuildRecommended,
|
|
QualityWindowSampleCount: fabricServiceChannelFeedbackPayloadInt(input.Payload, "quality_window_sample_count"),
|
|
QualityWindowSuccessCount: fabricServiceChannelFeedbackPayloadInt(input.Payload, "quality_window_success_count"),
|
|
QualityWindowFailureCount: fabricServiceChannelFeedbackPayloadInt(input.Payload, "quality_window_failure_count"),
|
|
QualityWindowSlowCount: fabricServiceChannelFeedbackPayloadInt(input.Payload, "quality_window_slow_count"),
|
|
QualityWindowDropCount: fabricServiceChannelFeedbackPayloadInt(input.Payload, "quality_window_drop_count"),
|
|
ObservedAt: input.ObservedAt,
|
|
}
|
|
out[input.RouteID] = item
|
|
}
|
|
return out
|
|
}
|
|
|
|
func fabricServiceChannelRouteFeedbackInputsFromHeartbeat(heartbeat NodeHeartbeat, serviceClass string, expiresAt time.Time) []RecordFabricServiceChannelRouteFeedbackInput {
|
|
if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) {
|
|
return nil
|
|
}
|
|
var metadata struct {
|
|
Report fabricServiceChannelRuntimeHeartbeat `json:"fabric_service_channel_runtime_report"`
|
|
}
|
|
if err := json.Unmarshal(heartbeat.Metadata, &metadata); err != nil {
|
|
return nil
|
|
}
|
|
if metadata.Report.SchemaVersion == "" || len(metadata.Report.Ingress.FlowScheduler.ChannelStats) == 0 {
|
|
return nil
|
|
}
|
|
observedAt := heartbeat.ObservedAt.UTC()
|
|
if observedAt.IsZero() {
|
|
observedAt = time.Now().UTC()
|
|
}
|
|
var out []RecordFabricServiceChannelRouteFeedbackInput
|
|
for _, stat := range metadata.Report.Ingress.FlowScheduler.ChannelStats {
|
|
failedRouteID := strings.TrimSpace(stat.LastFailedRouteID)
|
|
rollingFailureCount := fabricServiceChannelRollingFailureCount(stat)
|
|
rollingStallCount := fabricServiceChannelRollingStallCount(stat)
|
|
rollingLatencyMs := fabricServiceChannelRollingLatencyMs(stat)
|
|
rollingWindowActive := stat.QualityWindowSampleCount > 0
|
|
freshFailureActive := failedRouteID != "" && (!rollingWindowActive || rollingFailureCount > 0)
|
|
if freshFailureActive {
|
|
scoreAdjustment := -30
|
|
reasons := []string{"service_channel_recent_route_failure"}
|
|
if rollingWindowActive {
|
|
reasons = append(reasons, "service_channel_rolling_quality_window")
|
|
}
|
|
status := "degraded"
|
|
if stat.RouteRebuildRecommended || stat.DegradedFallbackRecommended || rollingFailureCount >= 2 {
|
|
status = "fenced"
|
|
scoreAdjustment -= 1000
|
|
reasons = append(reasons, "service_channel_route_rebuild_recommended")
|
|
if stat.DegradedFallbackRecommended {
|
|
reasons = append(reasons, "service_channel_degraded_fallback_recommended")
|
|
}
|
|
}
|
|
out = append(out, RecordFabricServiceChannelRouteFeedbackInput{
|
|
ClusterID: heartbeat.ClusterID,
|
|
ReporterNodeID: heartbeat.NodeID,
|
|
RouteID: failedRouteID,
|
|
ServiceClass: serviceClass,
|
|
FeedbackStatus: status,
|
|
ScoreAdjustment: scoreAdjustment,
|
|
Reasons: dedupeStrings(reasons),
|
|
LastError: strings.TrimSpace(stat.LastError),
|
|
ConsecutiveFailures: rollingFailureCount,
|
|
StallCount: rollingStallCount,
|
|
LastSendDurationMs: rollingLatencyMs,
|
|
Payload: fabricServiceChannelFeedbackPayload(stat, metadata.Report.ConfigVersion),
|
|
ObservedAt: observedAt,
|
|
ExpiresAt: expiresAt,
|
|
})
|
|
}
|
|
successRouteID := strings.TrimSpace(stat.LastRouteID)
|
|
if successRouteID != "" && (!freshFailureActive || successRouteID != failedRouteID) && fabricServiceChannelStatHasFreshSuccess(stat) {
|
|
qualityAdjustment, qualityReasons := fabricServiceChannelRouteQualityScore(rollingLatencyMs, rollingFailureCount, rollingStallCount)
|
|
reasons := append([]string{"service_channel_recent_success"}, qualityReasons...)
|
|
if rollingWindowActive {
|
|
reasons = append(reasons, "service_channel_rolling_quality_window")
|
|
}
|
|
out = append(out, RecordFabricServiceChannelRouteFeedbackInput{
|
|
ClusterID: heartbeat.ClusterID,
|
|
ReporterNodeID: heartbeat.NodeID,
|
|
RouteID: successRouteID,
|
|
ServiceClass: serviceClass,
|
|
FeedbackStatus: "healthy",
|
|
ScoreAdjustment: 10 + qualityAdjustment,
|
|
Reasons: dedupeStrings(reasons),
|
|
ConsecutiveFailures: rollingFailureCount,
|
|
StallCount: rollingStallCount,
|
|
LastSendDurationMs: rollingLatencyMs,
|
|
Payload: fabricServiceChannelFeedbackPayload(stat, metadata.Report.ConfigVersion),
|
|
ObservedAt: observedAt,
|
|
ExpiresAt: expiresAt,
|
|
})
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func fabricServiceChannelFeedbackPayload(stat fabricServiceChannelRuntimeChannelStat, configVersion string) json.RawMessage {
|
|
payload := map[string]any{}
|
|
rawStat, err := json.Marshal(stat)
|
|
if err == nil {
|
|
_ = json.Unmarshal(rawStat, &payload)
|
|
}
|
|
if strings.TrimSpace(configVersion) != "" {
|
|
payload["observed_config_version"] = strings.TrimSpace(configVersion)
|
|
}
|
|
raw, err := json.Marshal(payload)
|
|
if err != nil {
|
|
return json.RawMessage(`{}`)
|
|
}
|
|
return raw
|
|
}
|
|
|
|
func fabricServiceChannelStatHasFreshSuccess(stat fabricServiceChannelRuntimeChannelStat) bool {
|
|
if stat.QualityWindowSampleCount <= 0 {
|
|
return !stat.RouteRebuildRecommended && !stat.DegradedFallbackRecommended
|
|
}
|
|
return stat.QualityWindowSuccessCount > 0 && stat.QualityWindowFailureCount == 0 && stat.QualityWindowDropCount == 0
|
|
}
|
|
|
|
func fabricServiceChannelFlowSchedulerFromHeartbeat(heartbeat NodeHeartbeat) map[string]any {
|
|
if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) {
|
|
return map[string]any{}
|
|
}
|
|
metadata := jsonObject(heartbeat.Metadata)
|
|
return jsonMapPath(metadata, "fabric_service_channel_runtime_report", "ingress", "flow_scheduler")
|
|
}
|
|
|
|
func fabricServiceChannelRollingFailureCount(stat fabricServiceChannelRuntimeChannelStat) int {
|
|
if stat.QualityWindowSampleCount <= 0 {
|
|
return stat.ConsecutiveFailures
|
|
}
|
|
return stat.QualityWindowFailureCount + stat.QualityWindowDropCount
|
|
}
|
|
|
|
func fabricServiceChannelRollingStallCount(stat fabricServiceChannelRuntimeChannelStat) int {
|
|
if stat.QualityWindowSampleCount <= 0 {
|
|
return stat.StallCount
|
|
}
|
|
return stat.QualityWindowSlowCount
|
|
}
|
|
|
|
func fabricServiceChannelRollingLatencyMs(stat fabricServiceChannelRuntimeChannelStat) int64 {
|
|
if stat.QualityWindowSampleCount > 0 && stat.QualityWindowAvgLatencyMs > 0 {
|
|
return stat.QualityWindowAvgLatencyMs
|
|
}
|
|
return stat.LastSendDurationMillis
|
|
}
|
|
|
|
func fabricServiceChannelRouteQualityScore(lastSendDurationMs int64, consecutiveFailures int, stallCount int) (int, []string) {
|
|
score := 0
|
|
reasons := []string{}
|
|
switch {
|
|
case lastSendDurationMs <= 0:
|
|
case lastSendDurationMs <= 10:
|
|
score += 80
|
|
reasons = append(reasons, "service_channel_quality_latency_le_10ms")
|
|
case lastSendDurationMs <= 25:
|
|
score += 60
|
|
reasons = append(reasons, "service_channel_quality_latency_le_25ms")
|
|
case lastSendDurationMs <= 50:
|
|
score += 40
|
|
reasons = append(reasons, "service_channel_quality_latency_le_50ms")
|
|
case lastSendDurationMs <= 100:
|
|
score += 20
|
|
reasons = append(reasons, "service_channel_quality_latency_le_100ms")
|
|
case lastSendDurationMs <= 250:
|
|
score += 5
|
|
reasons = append(reasons, "service_channel_quality_latency_le_250ms")
|
|
case lastSendDurationMs <= 500:
|
|
score -= 10
|
|
reasons = append(reasons, "service_channel_quality_latency_slow")
|
|
case lastSendDurationMs <= 1000:
|
|
score -= 30
|
|
reasons = append(reasons, "service_channel_quality_latency_very_slow")
|
|
default:
|
|
score -= 60
|
|
reasons = append(reasons, "service_channel_quality_latency_unhealthy")
|
|
}
|
|
if consecutiveFailures > 0 {
|
|
penalty := consecutiveFailures * 20
|
|
if penalty > 100 {
|
|
penalty = 100
|
|
}
|
|
score -= penalty
|
|
reasons = append(reasons, "service_channel_quality_recent_failures")
|
|
}
|
|
if stallCount > 0 {
|
|
penalty := stallCount * 5
|
|
if penalty > 50 {
|
|
penalty = 50
|
|
}
|
|
score -= penalty
|
|
reasons = append(reasons, "service_channel_quality_recent_stalls")
|
|
}
|
|
return score, dedupeStrings(reasons)
|
|
}
|
|
|
|
// fabricServiceChannelRetryCooldownUntil reads the
// "operator_retry_cooldown_until" string from a feedback payload and returns
// it as a UTC time.
//
// It returns nil when the payload is empty, is not a JSON object, lacks the
// key, holds a non-string or blank value, or the value does not parse with
// the time.RFC3339Nano layout.
func fabricServiceChannelRetryCooldownUntil(payload json.RawMessage) *time.Time {
	if len(payload) == 0 {
		return nil
	}
	// json.Unmarshal validates the whole document itself, so a separate
	// json.Valid pass would only scan the payload a second time.
	var fields map[string]any
	if err := json.Unmarshal(payload, &fields); err != nil {
		return nil
	}
	text, _ := fields["operator_retry_cooldown_until"].(string)
	text = strings.TrimSpace(text)
	if text == "" {
		return nil
	}
	until, err := time.Parse(time.RFC3339Nano, text)
	if err != nil {
		return nil
	}
	until = until.UTC()
	return &until
}
|
|
|
|
// fabricServiceChannelFeedbackPayloadBool reports whether the top-level key
// of a feedback payload holds the JSON boolean true.
//
// Any other situation yields false: an empty payload, a payload that is not
// a JSON object, a missing key, or a value that is not a boolean.
func fabricServiceChannelFeedbackPayloadBool(payload json.RawMessage, key string) bool {
	if len(payload) == 0 {
		return false
	}
	// json.Unmarshal already rejects invalid JSON, so no separate json.Valid
	// pass is needed.
	var fields map[string]any
	if err := json.Unmarshal(payload, &fields); err != nil {
		return false
	}
	flag, isBool := fields[key].(bool)
	return isBool && flag
}
|
|
|
|
// fabricServiceChannelFeedbackPayloadInt returns the numeric value stored
// under a top-level key of a feedback payload, truncated toward zero.
//
// It returns 0 when the payload is empty, is not a JSON object, lacks the
// key, holds a non-numeric value, or holds a number whose magnitude does not
// fit in an int.
func fabricServiceChannelFeedbackPayloadInt(payload json.RawMessage, key string) int {
	if len(payload) == 0 {
		return 0
	}
	// json.Unmarshal already rejects invalid JSON, so no separate json.Valid
	// pass is needed.
	var fields map[string]any
	if err := json.Unmarshal(payload, &fields); err != nil {
		return 0
	}
	// Decoding into `any` always produces float64 for JSON numbers, so that
	// is the only numeric type that can appear here.
	number, isNumber := fields[key].(float64)
	if !isNumber {
		return 0
	}
	// Converting an out-of-range float to int gives an implementation-
	// dependent result, so such values are treated as unusable.
	const maxInt = int(^uint(0) >> 1)
	if number >= float64(maxInt) || number <= -float64(maxInt) {
		return 0
	}
	return int(number)
}
|
|
|
|
// fabricServiceChannelFeedbackPayloadString returns the first non-blank
// string found under any of keys in a feedback payload, trimmed of
// surrounding whitespace.
//
// Keys are tried in the given order, first at the top level of the payload
// and then inside its "recovery_policy" object, if present. It returns ""
// when the payload is empty, is not a JSON object, or no key yields a
// non-blank string.
func fabricServiceChannelFeedbackPayloadString(payload json.RawMessage, keys ...string) string {
	if len(payload) == 0 {
		return ""
	}
	// json.Unmarshal already rejects invalid JSON, so no separate json.Valid
	// pass is needed.
	var fields map[string]any
	if err := json.Unmarshal(payload, &fields); err != nil {
		return ""
	}
	// firstNonBlank scans one JSON object for the requested keys, in order.
	firstNonBlank := func(object map[string]any) string {
		for _, key := range keys {
			text, isString := object[key].(string)
			if !isString {
				continue
			}
			if text = strings.TrimSpace(text); text != "" {
				return text
			}
		}
		return ""
	}
	if found := firstNonBlank(fields); found != "" {
		return found
	}
	if nested, isObject := fields["recovery_policy"].(map[string]any); isObject {
		return firstNonBlank(nested)
	}
	return ""
}
|
|
|
|
func fabricServiceChannelAnnotateFeedbackProvenance(observation FabricServiceChannelRouteFeedbackObservation, policy FabricServiceChannelRecoveryPolicy, routeProvenance map[string]fabricServiceChannelRouteProvenance) FabricServiceChannelRouteFeedbackObservation {
|
|
policy = normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy())
|
|
observation.EffectivePolicyFingerprint = policy.Fingerprint
|
|
observation.ObservedPolicyFingerprint = fabricServiceChannelFeedbackPayloadString(observation.Payload, "recovery_policy_fingerprint", "policy_fingerprint", "fingerprint")
|
|
provenance := routeProvenance[observation.RouteID]
|
|
observation.EffectiveRouteGeneration = provenance.RouteGeneration
|
|
observation.ObservedRouteGeneration = fabricServiceChannelFeedbackPayloadString(observation.Payload, "route_generation", "route_policy_version", "policy_version")
|
|
missingPolicy := observation.ObservedPolicyFingerprint == ""
|
|
missingGeneration := observation.ObservedRouteGeneration == "" && observation.EffectiveRouteGeneration != ""
|
|
observation.ProvenanceMissing = missingPolicy || missingGeneration
|
|
if observation.ObservedPolicyFingerprint != "" && policy.Fingerprint != "" && observation.ObservedPolicyFingerprint != policy.Fingerprint {
|
|
observation.StalePolicy = true
|
|
}
|
|
if observation.ObservedRouteGeneration != "" && observation.EffectiveRouteGeneration != "" && observation.ObservedRouteGeneration != observation.EffectiveRouteGeneration {
|
|
observation.StaleGeneration = true
|
|
}
|
|
switch {
|
|
case observation.StalePolicy && observation.StaleGeneration:
|
|
observation.StaleReason = "service_channel_feedback_stale_policy_and_generation"
|
|
case observation.StalePolicy:
|
|
observation.StaleReason = "service_channel_feedback_stale_policy"
|
|
case observation.StaleGeneration:
|
|
observation.StaleReason = "service_channel_feedback_stale_generation"
|
|
case observation.ProvenanceMissing:
|
|
observation.StaleReason = "service_channel_feedback_provenance_missing"
|
|
}
|
|
if observation.StaleReason != "" {
|
|
observation.Reasons = dedupeStrings(append(observation.Reasons, observation.StaleReason))
|
|
}
|
|
return observation
|
|
}
|
|
|
|
// fabricServiceChannelConservativeStaleScore limits the score used for stale
// feedback to the range [-10, 0]: positive scores become 0 and scores below
// -10 become -10.
func fabricServiceChannelConservativeStaleScore(score int) int {
	switch {
	case score > 0:
		return 0
	case score < -10:
		return -10
	default:
		return score
	}
}
|
|
|
|
func fabricServiceChannelFeedbackSuppressedByOperatorCooldown(input RecordFabricServiceChannelRouteFeedbackInput, cooldownUntil, observedAt time.Time) RecordFabricServiceChannelRouteFeedbackInput {
|
|
originalStatus := input.FeedbackStatus
|
|
originalScore := input.ScoreAdjustment
|
|
payload := map[string]any{}
|
|
if len(input.Payload) > 0 && json.Valid(input.Payload) {
|
|
_ = json.Unmarshal(input.Payload, &payload)
|
|
}
|
|
payload["operator_feedback_suppressed"] = true
|
|
payload["operator_suppressed_feedback_status"] = originalStatus
|
|
payload["operator_suppressed_score_adjustment"] = originalScore
|
|
payload["operator_retry_cooldown_until"] = cooldownUntil.UTC().Format(time.RFC3339Nano)
|
|
payload["operator_suppressed_at"] = observedAt.UTC().Format(time.RFC3339Nano)
|
|
raw, err := json.Marshal(payload)
|
|
if err != nil {
|
|
raw = []byte(`{}`)
|
|
}
|
|
input.FeedbackStatus = "operator_retry_cooldown"
|
|
input.ScoreAdjustment = 0
|
|
input.Reasons = dedupeStrings(append(input.Reasons, "operator_expired_feedback_retry", "manual_feedback_expired_retry_cooldown", "service_channel_feedback_suppressed_by_operator_expire"))
|
|
input.Payload = raw
|
|
input.ExpiresAt = cooldownUntil.UTC()
|
|
return input
|
|
}
|
|
|
|
func fabricServiceChannelRouteFeedbackFromObservations(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time) map[string]fabricServiceChannelRouteFeedback {
|
|
return fabricServiceChannelRouteFeedbackFromObservationsWithProvenance(observations, now, defaultFabricServiceChannelRecoveryPolicy(), nil)
|
|
}
|
|
|
|
func fabricServiceChannelRouteFeedbackFromObservationsWithProvenance(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time, policy FabricServiceChannelRecoveryPolicy, routeProvenance map[string]fabricServiceChannelRouteProvenance) map[string]fabricServiceChannelRouteFeedback {
|
|
out := map[string]fabricServiceChannelRouteFeedback{}
|
|
for _, observation := range observations {
|
|
observation = fabricServiceChannelAnnotateFeedbackProvenance(observation, policy, routeProvenance)
|
|
if strings.TrimSpace(observation.RouteID) == "" ||
|
|
(!observation.ExpiresAt.IsZero() && !observation.ExpiresAt.After(now.UTC())) {
|
|
continue
|
|
}
|
|
item := out[observation.RouteID]
|
|
item.RouteID = observation.RouteID
|
|
stale := observation.StalePolicy || observation.StaleGeneration
|
|
item.StalePolicy = item.StalePolicy || observation.StalePolicy
|
|
item.StaleGeneration = item.StaleGeneration || observation.StaleGeneration
|
|
item.ProvenanceMissing = item.ProvenanceMissing || observation.ProvenanceMissing
|
|
if observation.StaleReason != "" {
|
|
item.StaleReason = observation.StaleReason
|
|
}
|
|
item.Fenced = item.Fenced || (!stale && observation.FeedbackStatus == "fenced")
|
|
if observation.RetryCooldownUntil != nil && observation.RetryCooldownUntil.After(now.UTC()) {
|
|
item.ManualRetry = true
|
|
}
|
|
scoreAdjustment, ageDecayReasons := fabricServiceChannelFeedbackScoreWithAgeDecay(observation, now)
|
|
if stale {
|
|
scoreAdjustment = fabricServiceChannelConservativeStaleScore(scoreAdjustment)
|
|
}
|
|
item.ScoreAdjustment += scoreAdjustment
|
|
item.Reasons = append(item.Reasons, observation.Reasons...)
|
|
item.Reasons = append(item.Reasons, ageDecayReasons...)
|
|
if observation.LastSendDurationMs > 0 && (item.LastSendDurationMs == 0 || observation.LastSendDurationMs < item.LastSendDurationMs) {
|
|
item.LastSendDurationMs = observation.LastSendDurationMs
|
|
}
|
|
if observation.ConsecutiveFailures > item.ConsecutiveFailures {
|
|
item.ConsecutiveFailures = observation.ConsecutiveFailures
|
|
}
|
|
if observation.StallCount > item.StallCount {
|
|
item.StallCount = observation.StallCount
|
|
}
|
|
item.DegradedFallbackRecommended = item.DegradedFallbackRecommended || (!stale &&
|
|
(containsString(observation.Reasons, "service_channel_degraded_fallback_recommended") ||
|
|
fabricServiceChannelFeedbackPayloadBool(observation.Payload, "degraded_fallback_recommended")))
|
|
item.RouteRebuildRecommended = item.RouteRebuildRecommended || (!stale &&
|
|
(containsString(observation.Reasons, "service_channel_route_rebuild_recommended") ||
|
|
fabricServiceChannelFeedbackPayloadBool(observation.Payload, "route_rebuild_recommended")))
|
|
if sampleCount := fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_sample_count"); sampleCount > item.QualityWindowSampleCount {
|
|
item.QualityWindowSampleCount = sampleCount
|
|
}
|
|
if successCount := fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_success_count"); successCount > item.QualityWindowSuccessCount {
|
|
item.QualityWindowSuccessCount = successCount
|
|
}
|
|
if failureCount := fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_failure_count"); failureCount > item.QualityWindowFailureCount {
|
|
item.QualityWindowFailureCount = failureCount
|
|
}
|
|
if slowCount := fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_slow_count"); slowCount > item.QualityWindowSlowCount {
|
|
item.QualityWindowSlowCount = slowCount
|
|
}
|
|
if dropCount := fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_drop_count"); dropCount > item.QualityWindowDropCount {
|
|
item.QualityWindowDropCount = dropCount
|
|
}
|
|
if observation.LastError != "" {
|
|
item.LastError = observation.LastError
|
|
}
|
|
if observation.ObservedAt.After(item.ObservedAt) {
|
|
item.ObservedAt = observation.ObservedAt
|
|
item.ExpiresAt = observation.ExpiresAt
|
|
item.ObservationID = observation.ID
|
|
item.Source = jsonString(jsonObject(observation.Payload), "source")
|
|
item.ChannelID = jsonString(jsonObject(observation.Payload), "channel_id")
|
|
item.ResourceID = jsonString(jsonObject(observation.Payload), "resource_id")
|
|
item.ViolationStatus = jsonString(jsonObject(observation.Payload), "last_data_plane_violation_status")
|
|
item.ViolationReason = jsonString(jsonObject(observation.Payload), "last_data_plane_violation_reason")
|
|
}
|
|
if observation.RetryCooldownUntil != nil && (item.RetryCooldownUntil == nil || observation.RetryCooldownUntil.After(*item.RetryCooldownUntil)) {
|
|
cooldown := observation.RetryCooldownUntil.UTC()
|
|
item.RetryCooldownUntil = &cooldown
|
|
}
|
|
out[observation.RouteID] = item
|
|
}
|
|
for routeID, item := range out {
|
|
item.Reasons = dedupeStrings(item.Reasons)
|
|
out[routeID] = item
|
|
}
|
|
return out
|
|
}
|
|
|
|
func fabricServiceChannelManualRetryFeedbackFromObservations(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time) map[string]fabricServiceChannelRouteFeedback {
|
|
return fabricServiceChannelManualRetryFeedbackFromObservationsWithProvenance(observations, now, defaultFabricServiceChannelRecoveryPolicy(), nil)
|
|
}
|
|
|
|
func fabricServiceChannelManualRetryFeedbackFromObservationsWithProvenance(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time, policy FabricServiceChannelRecoveryPolicy, routeProvenance map[string]fabricServiceChannelRouteProvenance) map[string]fabricServiceChannelRouteFeedback {
|
|
out := map[string]fabricServiceChannelRouteFeedback{}
|
|
now = now.UTC()
|
|
for _, observation := range observations {
|
|
observation = fabricServiceChannelAnnotateFeedbackProvenance(observation, policy, routeProvenance)
|
|
if strings.TrimSpace(observation.RouteID) == "" || observation.RetryCooldownUntil == nil || !observation.RetryCooldownUntil.After(now) {
|
|
continue
|
|
}
|
|
if observation.FeedbackStatus == "healthy" {
|
|
continue
|
|
}
|
|
item := out[observation.RouteID]
|
|
item.RouteID = observation.RouteID
|
|
item.ManualRetry = true
|
|
item.StalePolicy = item.StalePolicy || observation.StalePolicy
|
|
item.StaleGeneration = item.StaleGeneration || observation.StaleGeneration
|
|
item.ProvenanceMissing = item.ProvenanceMissing || observation.ProvenanceMissing
|
|
if observation.StaleReason != "" {
|
|
item.StaleReason = observation.StaleReason
|
|
}
|
|
item.ScoreAdjustment += 0
|
|
item.Reasons = append(item.Reasons, "operator_expired_feedback_retry", "manual_feedback_expired_retry_cooldown")
|
|
if observation.LastError != "" {
|
|
item.LastError = observation.LastError
|
|
}
|
|
if observation.ObservedAt.After(item.ObservedAt) {
|
|
item.ObservedAt = observation.ObservedAt
|
|
}
|
|
cooldown := observation.RetryCooldownUntil.UTC()
|
|
if item.RetryCooldownUntil == nil || cooldown.After(*item.RetryCooldownUntil) {
|
|
item.RetryCooldownUntil = &cooldown
|
|
}
|
|
out[observation.RouteID] = item
|
|
}
|
|
for routeID, item := range out {
|
|
item.Reasons = dedupeStrings(item.Reasons)
|
|
out[routeID] = item
|
|
}
|
|
return out
|
|
}
|
|
|
|
func fabricServiceChannelFeedbackScoreWithAgeDecay(observation FabricServiceChannelRouteFeedbackObservation, now time.Time) (int, []string) {
|
|
score := observation.ScoreAdjustment
|
|
if score <= 0 || observation.FeedbackStatus != "healthy" || observation.ObservedAt.IsZero() {
|
|
return score, nil
|
|
}
|
|
observedAt := observation.ObservedAt.UTC()
|
|
now = now.UTC()
|
|
if !now.After(observedAt) {
|
|
return score, nil
|
|
}
|
|
maxAge := fabricServiceChannelFeedbackMaxAge
|
|
if !observation.ExpiresAt.IsZero() && observation.ExpiresAt.After(observedAt) {
|
|
maxAge = observation.ExpiresAt.Sub(observedAt)
|
|
}
|
|
if maxAge <= 0 {
|
|
return 0, []string{"service_channel_feedback_age_decay_expired"}
|
|
}
|
|
age := now.Sub(observedAt)
|
|
if age <= 0 {
|
|
return score, nil
|
|
}
|
|
if age >= maxAge {
|
|
return 0, []string{"service_channel_feedback_age_decay_expired"}
|
|
}
|
|
remaining := maxAge - age
|
|
decayed := int((int64(score)*int64(remaining) + int64(maxAge) - 1) / int64(maxAge))
|
|
if decayed < 1 {
|
|
decayed = 1
|
|
}
|
|
if decayed == score {
|
|
return score, nil
|
|
}
|
|
return decayed, []string{"service_channel_feedback_age_decay"}
|
|
}
|
|
|
|
func mergeFabricServiceChannelRouteFeedback(dst map[string]fabricServiceChannelRouteFeedback, src map[string]fabricServiceChannelRouteFeedback) {
|
|
for routeID, incoming := range src {
|
|
existing := dst[routeID]
|
|
existing.RouteID = routeID
|
|
existing.Fenced = existing.Fenced || incoming.Fenced
|
|
existing.ManualRetry = existing.ManualRetry || incoming.ManualRetry
|
|
existing.StalePolicy = existing.StalePolicy || incoming.StalePolicy
|
|
existing.StaleGeneration = existing.StaleGeneration || incoming.StaleGeneration
|
|
existing.ProvenanceMissing = existing.ProvenanceMissing || incoming.ProvenanceMissing
|
|
if incoming.StaleReason != "" {
|
|
existing.StaleReason = incoming.StaleReason
|
|
}
|
|
existing.ScoreAdjustment += incoming.ScoreAdjustment
|
|
existing.Reasons = dedupeStrings(append(existing.Reasons, incoming.Reasons...))
|
|
if incoming.ConsecutiveFailures > existing.ConsecutiveFailures {
|
|
existing.ConsecutiveFailures = incoming.ConsecutiveFailures
|
|
}
|
|
if incoming.StallCount > existing.StallCount {
|
|
existing.StallCount = incoming.StallCount
|
|
}
|
|
if incoming.LastSendDurationMs > 0 && (existing.LastSendDurationMs == 0 || incoming.LastSendDurationMs < existing.LastSendDurationMs) {
|
|
existing.LastSendDurationMs = incoming.LastSendDurationMs
|
|
}
|
|
existing.DegradedFallbackRecommended = existing.DegradedFallbackRecommended || incoming.DegradedFallbackRecommended
|
|
existing.RouteRebuildRecommended = existing.RouteRebuildRecommended || incoming.RouteRebuildRecommended
|
|
if incoming.QualityWindowSampleCount > existing.QualityWindowSampleCount {
|
|
existing.QualityWindowSampleCount = incoming.QualityWindowSampleCount
|
|
}
|
|
if incoming.QualityWindowSuccessCount > existing.QualityWindowSuccessCount {
|
|
existing.QualityWindowSuccessCount = incoming.QualityWindowSuccessCount
|
|
}
|
|
if incoming.QualityWindowFailureCount > existing.QualityWindowFailureCount {
|
|
existing.QualityWindowFailureCount = incoming.QualityWindowFailureCount
|
|
}
|
|
if incoming.QualityWindowSlowCount > existing.QualityWindowSlowCount {
|
|
existing.QualityWindowSlowCount = incoming.QualityWindowSlowCount
|
|
}
|
|
if incoming.QualityWindowDropCount > existing.QualityWindowDropCount {
|
|
existing.QualityWindowDropCount = incoming.QualityWindowDropCount
|
|
}
|
|
if incoming.LastError != "" {
|
|
existing.LastError = incoming.LastError
|
|
}
|
|
if incoming.ObservedAt.After(existing.ObservedAt) {
|
|
existing.ObservedAt = incoming.ObservedAt
|
|
}
|
|
if incoming.RetryCooldownUntil != nil && (existing.RetryCooldownUntil == nil || incoming.RetryCooldownUntil.After(*existing.RetryCooldownUntil)) {
|
|
cooldown := incoming.RetryCooldownUntil.UTC()
|
|
existing.RetryCooldownUntil = &cooldown
|
|
}
|
|
dst[routeID] = existing
|
|
}
|
|
}
|
|
|
|
func serviceChannelRouteFeedbackReport(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time) *FabricServiceChannelRouteFeedbackReport {
|
|
return serviceChannelRouteFeedbackReportWithPolicy(observations, now, defaultFabricServiceChannelRecoveryPolicy())
|
|
}
|
|
|
|
func serviceChannelRouteFeedbackReportWithPolicy(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time, policy FabricServiceChannelRecoveryPolicy) *FabricServiceChannelRouteFeedbackReport {
|
|
return serviceChannelRouteFeedbackReportWithPolicyAndProvenance(observations, now, policy, nil)
|
|
}
|
|
|
|
func serviceChannelRouteFeedbackReportWithPolicyAndProvenance(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time, policy FabricServiceChannelRecoveryPolicy, routeProvenance map[string]fabricServiceChannelRouteProvenance) *FabricServiceChannelRouteFeedbackReport {
|
|
policy = normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy())
|
|
reportObservations := make([]FabricServiceChannelRouteFeedbackObservation, 0, len(observations))
|
|
for _, observation := range observations {
|
|
observation = fabricServiceChannelAnnotateFeedbackProvenance(observation, policy, routeProvenance)
|
|
effectiveScore, ageDecayReasons := fabricServiceChannelFeedbackScoreWithAgeDecay(observation, now)
|
|
if observation.StalePolicy || observation.StaleGeneration {
|
|
effectiveScore = fabricServiceChannelConservativeStaleScore(effectiveScore)
|
|
}
|
|
observation.EffectiveScoreAdjustment = effectiveScore
|
|
observation.Reasons = dedupeStrings(append(observation.Reasons, ageDecayReasons...))
|
|
observation.RecoveryState = fabricServiceChannelFeedbackObservationRecoveryState(observation, now)
|
|
observation.RecoveryPromoted = fabricServiceChannelFeedbackObservationRecoveryPromoted(observation, now, policy)
|
|
if observation.RecoveryPromoted {
|
|
observation.RecoveryState = "healthy"
|
|
}
|
|
observation.RecoveryDemoted, observation.RecoveryReason = fabricServiceChannelFeedbackObservationRecoveryDemotion(observation, now, policy)
|
|
observation.RecoveryHysteresisActive = observation.RecoveryState == "recovered"
|
|
if observation.RecoveryHysteresisActive {
|
|
observation.RecoveryHysteresisPenalty = policy.HysteresisPenalty
|
|
}
|
|
reportObservations = append(reportObservations, observation)
|
|
}
|
|
report := &FabricServiceChannelRouteFeedbackReport{
|
|
SchemaVersion: "rap.fabric_service_channel_route_feedback_report.v1",
|
|
GeneratedAt: now.UTC(),
|
|
FeedbackMaxAgeSeconds: int(fabricServiceChannelFeedbackMaxAge.Seconds()),
|
|
RecoveryPolicy: fabricServiceChannelRecoveryPolicyRef(policy),
|
|
ObservationCount: len(observations),
|
|
Observations: reportObservations,
|
|
}
|
|
for _, observation := range reportObservations {
|
|
switch strings.ToLower(strings.TrimSpace(observation.FeedbackStatus)) {
|
|
case "fenced":
|
|
report.FencedRouteCount++
|
|
case "degraded":
|
|
report.DegradedRouteCount++
|
|
case "healthy":
|
|
report.HealthyRouteCount++
|
|
}
|
|
if observation.RecoveryState == "recovered" {
|
|
report.RecoveredRouteCount++
|
|
}
|
|
if observation.RecoveryHysteresisActive {
|
|
report.RecoveryHysteresisCount++
|
|
}
|
|
if observation.RecoveryPromoted {
|
|
report.RecoveryPromotedCount++
|
|
}
|
|
if observation.RecoveryDemoted {
|
|
report.RecoveryDemotedCount++
|
|
}
|
|
if observation.ProvenanceMissing {
|
|
report.MissingProvenanceCount++
|
|
}
|
|
if observation.StalePolicy {
|
|
report.StalePolicyCount++
|
|
}
|
|
if observation.StaleGeneration {
|
|
report.StaleGenerationCount++
|
|
}
|
|
}
|
|
return report
|
|
}
|
|
|
|
func fabricServiceChannelFeedbackObservationRecoveryState(observation FabricServiceChannelRouteFeedbackObservation, now time.Time) string {
|
|
switch strings.ToLower(strings.TrimSpace(observation.FeedbackStatus)) {
|
|
case "fenced":
|
|
return "fenced"
|
|
case "degraded":
|
|
return "degraded"
|
|
case "healthy":
|
|
if observation.RetryCooldownUntil != nil &&
|
|
observation.RetryCooldownUntil.After(now.UTC()) &&
|
|
containsString(observation.Reasons, "service_channel_rolling_quality_window") {
|
|
return "recovered"
|
|
}
|
|
return "healthy"
|
|
default:
|
|
if observation.RetryCooldownUntil != nil && observation.RetryCooldownUntil.After(now.UTC()) {
|
|
return "cooldown"
|
|
}
|
|
return ""
|
|
}
|
|
}
|
|
|
|
func fabricServiceChannelFeedbackObservationRecoveryPromoted(observation FabricServiceChannelRouteFeedbackObservation, now time.Time, policy FabricServiceChannelRecoveryPolicy) bool {
|
|
if observation.RetryCooldownUntil == nil || !observation.RetryCooldownUntil.After(now.UTC()) {
|
|
return false
|
|
}
|
|
if strings.ToLower(strings.TrimSpace(observation.FeedbackStatus)) != "healthy" ||
|
|
!containsString(observation.Reasons, "service_channel_rolling_quality_window") {
|
|
return false
|
|
}
|
|
return fabricServiceChannelFeedbackCleanRollingSamples(
|
|
fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_sample_count"),
|
|
fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_success_count"),
|
|
fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_failure_count"),
|
|
fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_slow_count"),
|
|
fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_drop_count"),
|
|
policy,
|
|
)
|
|
}
|
|
|
|
func fabricServiceChannelFeedbackObservationRecoveryDemotion(observation FabricServiceChannelRouteFeedbackObservation, now time.Time, policy FabricServiceChannelRecoveryPolicy) (bool, string) {
|
|
if observation.RetryCooldownUntil == nil || !observation.RetryCooldownUntil.After(now.UTC()) {
|
|
return false, ""
|
|
}
|
|
if observation.RecoveryPromoted {
|
|
return false, ""
|
|
}
|
|
if policy.DemotionFencedEnabled && strings.ToLower(strings.TrimSpace(observation.FeedbackStatus)) == "fenced" {
|
|
return true, "service_channel_recovery_demoted_fenced"
|
|
}
|
|
if policy.DemotionRebuildEnabled && (containsString(observation.Reasons, "service_channel_route_rebuild_recommended") ||
|
|
fabricServiceChannelFeedbackPayloadBool(observation.Payload, "route_rebuild_recommended")) {
|
|
return true, "service_channel_recovery_demoted_rebuild"
|
|
}
|
|
if fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_failure_count") >= policy.DemotionFailureThreshold ||
|
|
fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_drop_count") >= policy.DemotionDropThreshold {
|
|
return true, "service_channel_recovery_demoted_failure"
|
|
}
|
|
if fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_slow_count") >= policy.DemotionSlowThreshold {
|
|
return true, "service_channel_recovery_demoted_slow"
|
|
}
|
|
if strings.ToLower(strings.TrimSpace(observation.FeedbackStatus)) == "degraded" {
|
|
return true, "service_channel_recovery_demoted_degraded"
|
|
}
|
|
return false, ""
|
|
}
|
|
|
|
func fabricServiceChannelRoutesFromIntents(intents []MeshRouteIntent, serviceClass string, entryPool, exitPool, allowedChannels []string, generation string, now, defaultExpiresAt time.Time, feedback map[string]fabricServiceChannelRouteFeedback, policy FabricServiceChannelRecoveryPolicy) []FabricServiceChannelRoute {
|
|
policy = normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy())
|
|
routes := []FabricServiceChannelRoute{}
|
|
for _, intent := range intents {
|
|
route, ok := fabricServiceChannelRouteFromIntent(intent, serviceClass, entryPool, exitPool, allowedChannels, generation, now, defaultExpiresAt, feedback, policy)
|
|
if ok {
|
|
routes = append(routes, route)
|
|
}
|
|
}
|
|
sort.SliceStable(routes, func(i, j int) bool {
|
|
if routes[i].Status != routes[j].Status {
|
|
return routes[i].Status == "authorized"
|
|
}
|
|
if routes[i].PathScore != routes[j].PathScore {
|
|
return routes[i].PathScore > routes[j].PathScore
|
|
}
|
|
if len(routes[i].Hops) != len(routes[j].Hops) {
|
|
return len(routes[i].Hops) < len(routes[j].Hops)
|
|
}
|
|
return routes[i].RouteID < routes[j].RouteID
|
|
})
|
|
return routes
|
|
}
|
|
|
|
func fabricServiceChannelRouteFromIntent(intent MeshRouteIntent, serviceClass string, entryPool, exitPool, requestedChannels []string, generation string, now, defaultExpiresAt time.Time, feedback map[string]fabricServiceChannelRouteFeedback, recoveryPolicy FabricServiceChannelRecoveryPolicy) (FabricServiceChannelRoute, bool) {
|
|
recoveryPolicy = normalizeFabricServiceChannelRecoveryPolicy(recoveryPolicy, defaultFabricServiceChannelRecoveryPolicy())
|
|
if intent.Status != "active" || strings.TrimSpace(intent.ServiceClass) != serviceClass {
|
|
return FabricServiceChannelRoute{}, false
|
|
}
|
|
var policy syntheticRoutePolicy
|
|
if err := json.Unmarshal(intent.Policy, &policy); err != nil {
|
|
return FabricServiceChannelRoute{}, false
|
|
}
|
|
if policy.ExpiresAt != nil && !policy.ExpiresAt.After(now.UTC()) {
|
|
return FabricServiceChannelRoute{}, false
|
|
}
|
|
var source nodeSelector
|
|
var destination nodeSelector
|
|
_ = json.Unmarshal(intent.SourceSelector, &source)
|
|
_ = json.Unmarshal(intent.DestinationSelector, &destination)
|
|
sourceNodeID := firstNodeID(source)
|
|
destinationNodeID := firstNodeID(destination)
|
|
hops := append([]string{}, policy.Hops...)
|
|
if len(hops) == 0 && sourceNodeID != "" && destinationNodeID != "" {
|
|
hops = []string{sourceNodeID, destinationNodeID}
|
|
}
|
|
if len(hops) < 2 {
|
|
return FabricServiceChannelRoute{}, false
|
|
}
|
|
if sourceNodeID == "" {
|
|
sourceNodeID = hops[0]
|
|
}
|
|
if destinationNodeID == "" {
|
|
destinationNodeID = hops[len(hops)-1]
|
|
}
|
|
if !containsString(entryPool, sourceNodeID) || !containsString(exitPool, destinationNodeID) {
|
|
return FabricServiceChannelRoute{}, false
|
|
}
|
|
allowedChannels := policy.AllowedChannels
|
|
if len(allowedChannels) == 0 {
|
|
allowedChannels = requestedChannels
|
|
}
|
|
if !fabricChannelsIntersect(allowedChannels, requestedChannels) {
|
|
return FabricServiceChannelRoute{}, false
|
|
}
|
|
expiresAt := defaultExpiresAt
|
|
if policy.ExpiresAt != nil {
|
|
expiresAt = policy.ExpiresAt.UTC()
|
|
}
|
|
routeVersion := policy.RouteVersion
|
|
if routeVersion == "" {
|
|
routeVersion = intent.UpdatedAt.UTC().Format(time.RFC3339)
|
|
}
|
|
policyVersion := policy.PolicyVersion
|
|
if policyVersion == "" {
|
|
policyVersion = routeVersion
|
|
}
|
|
score := 100 - len(hops)*5 + intent.Priority
|
|
if score < 1 {
|
|
score = 1
|
|
}
|
|
status := "authorized"
|
|
recoveryState := ""
|
|
recoveryPenalty := 0
|
|
recoveryPromoted := false
|
|
recoveryDemoted := false
|
|
recoveryReason := ""
|
|
scoreReasons := []string{"active_route_intent", "entry_exit_pool_match"}
|
|
if item, ok := feedback[intent.ID]; ok {
|
|
score += item.ScoreAdjustment
|
|
scoreReasons = append(scoreReasons, item.Reasons...)
|
|
if item.StalePolicy || item.StaleGeneration {
|
|
recoveryReason = item.StaleReason
|
|
if recoveryReason == "" {
|
|
recoveryReason = "service_channel_feedback_stale"
|
|
}
|
|
scoreReasons = append(scoreReasons, "service_channel_feedback_stale", recoveryReason)
|
|
}
|
|
if fabricServiceChannelFeedbackRecoveryDemoted(item, recoveryPolicy) {
|
|
recoveryDemoted = true
|
|
recoveryReason = fabricServiceChannelFeedbackRecoveryDemotionReason(item, recoveryPolicy)
|
|
scoreReasons = append(scoreReasons, "service_channel_recovery_demoted", recoveryReason)
|
|
}
|
|
if item.Fenced {
|
|
status = "fenced_by_service_channel_feedback"
|
|
recoveryState = "fenced"
|
|
score = 0
|
|
} else if score < 1 {
|
|
score = 1
|
|
}
|
|
if status == "authorized" && fabricServiceChannelFeedbackRecoveryPromoted(item, recoveryPolicy) {
|
|
recoveryState = "healthy"
|
|
recoveryPromoted = true
|
|
scoreReasons = append(scoreReasons, "service_channel_recovery_promoted")
|
|
} else if status == "authorized" && fabricServiceChannelFeedbackRecoveryHysteresisActive(item, recoveryPolicy) {
|
|
recoveryState = "recovered"
|
|
recoveryPenalty = recoveryPolicy.HysteresisPenalty
|
|
score -= recoveryPenalty
|
|
if score < 1 {
|
|
score = 1
|
|
}
|
|
scoreReasons = append(scoreReasons, "service_channel_recovery_hysteresis")
|
|
} else if status == "authorized" && item.ScoreAdjustment > 0 {
|
|
recoveryState = "healthy"
|
|
}
|
|
}
|
|
return FabricServiceChannelRoute{
|
|
RouteID: intent.ID,
|
|
ClusterID: intent.ClusterID,
|
|
ServiceClass: serviceClass,
|
|
SourceNodeID: sourceNodeID,
|
|
DestinationNodeID: destinationNodeID,
|
|
Hops: hops,
|
|
AllowedChannels: allowedChannels,
|
|
RouteVersion: routeVersion,
|
|
PolicyVersion: policyVersion,
|
|
Generation: generation,
|
|
Status: status,
|
|
RecoveryState: recoveryState,
|
|
RecoveryPenalty: recoveryPenalty,
|
|
RecoveryPromoted: recoveryPromoted,
|
|
RecoveryDemoted: recoveryDemoted,
|
|
RecoveryReason: recoveryReason,
|
|
RecoveryPolicy: fabricServiceChannelRecoveryPolicyRef(recoveryPolicy),
|
|
PathScore: score,
|
|
ScoreReasons: dedupeStrings(scoreReasons),
|
|
ExpiresAt: expiresAt,
|
|
}, true
|
|
}
|
|
|
|
// Baseline recovery tuning values for fabric service channels.
const (
	// fabricServiceChannelRecoveryHysteresisPenalty is the baseline path-score
	// penalty associated with recovery hysteresis.
	fabricServiceChannelRecoveryHysteresisPenalty = 150

	// fabricServiceChannelRecoveryPromotionMinSamples is the baseline number of
	// rolling-window samples associated with recovery promotion.
	fabricServiceChannelRecoveryPromotionMinSamples = 64
)
|
|
|
|
func fabricServiceChannelFeedbackRecoveryHysteresisActive(item fabricServiceChannelRouteFeedback, policy FabricServiceChannelRecoveryPolicy) bool {
|
|
if item.StalePolicy || item.StaleGeneration {
|
|
return false
|
|
}
|
|
return item.ManualRetry && !item.Fenced && item.ScoreAdjustment > 0 &&
|
|
containsString(item.Reasons, "service_channel_rolling_quality_window") &&
|
|
!fabricServiceChannelFeedbackRecoveryPromoted(item, policy)
|
|
}
|
|
|
|
func fabricServiceChannelFeedbackRecoveryPromoted(item fabricServiceChannelRouteFeedback, policy FabricServiceChannelRecoveryPolicy) bool {
|
|
if item.StalePolicy || item.StaleGeneration {
|
|
return false
|
|
}
|
|
return item.ManualRetry && !item.Fenced && item.ScoreAdjustment > 0 &&
|
|
containsString(item.Reasons, "service_channel_rolling_quality_window") &&
|
|
fabricServiceChannelFeedbackCleanRollingSamples(
|
|
item.QualityWindowSampleCount,
|
|
item.QualityWindowSuccessCount,
|
|
item.QualityWindowFailureCount,
|
|
item.QualityWindowSlowCount,
|
|
item.QualityWindowDropCount,
|
|
policy,
|
|
)
|
|
}
|
|
|
|
func fabricServiceChannelFeedbackRecoveryDemoted(item fabricServiceChannelRouteFeedback, policy FabricServiceChannelRecoveryPolicy) bool {
|
|
if item.StalePolicy || item.StaleGeneration {
|
|
return false
|
|
}
|
|
return item.ManualRetry && !fabricServiceChannelFeedbackRecoveryPromoted(item, policy) &&
|
|
((policy.DemotionFencedEnabled && item.Fenced) ||
|
|
(policy.DemotionRebuildEnabled && item.RouteRebuildRecommended) ||
|
|
item.DegradedFallbackRecommended ||
|
|
item.QualityWindowFailureCount >= policy.DemotionFailureThreshold ||
|
|
item.QualityWindowDropCount >= policy.DemotionDropThreshold ||
|
|
item.QualityWindowSlowCount >= policy.DemotionSlowThreshold ||
|
|
item.ScoreAdjustment < 0)
|
|
}
|
|
|
|
func fabricServiceChannelFeedbackRecoveryDemotionReason(item fabricServiceChannelRouteFeedback, policy FabricServiceChannelRecoveryPolicy) string {
|
|
if policy.DemotionFencedEnabled && item.Fenced {
|
|
return "service_channel_recovery_demoted_fenced"
|
|
}
|
|
if policy.DemotionRebuildEnabled && item.RouteRebuildRecommended {
|
|
return "service_channel_recovery_demoted_rebuild"
|
|
}
|
|
if item.QualityWindowFailureCount >= policy.DemotionFailureThreshold || item.QualityWindowDropCount >= policy.DemotionDropThreshold {
|
|
return "service_channel_recovery_demoted_failure"
|
|
}
|
|
if item.QualityWindowSlowCount >= policy.DemotionSlowThreshold {
|
|
return "service_channel_recovery_demoted_slow"
|
|
}
|
|
if item.DegradedFallbackRecommended {
|
|
return "service_channel_recovery_demoted_degraded_fallback"
|
|
}
|
|
if item.ScoreAdjustment < 0 {
|
|
return "service_channel_recovery_demoted_degraded"
|
|
}
|
|
return "service_channel_recovery_demoted"
|
|
}
|
|
|
|
func fabricServiceChannelFeedbackCleanRollingSamples(sampleCount, successCount, failureCount, slowCount, dropCount int, policy FabricServiceChannelRecoveryPolicy) bool {
|
|
return sampleCount >= policy.PromotionMinSamples &&
|
|
successCount >= policy.PromotionMinSamples &&
|
|
failureCount == 0 &&
|
|
slowCount == 0 &&
|
|
dropCount == 0
|
|
}
|
|
|
|
func fabricChannelsIntersect(a, b []string) bool {
|
|
for _, left := range a {
|
|
if containsString(b, left) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func selectFabricServicePrimaryRoute(routes []FabricServiceChannelRoute, selectedEntry, selectedExit string) (FabricServiceChannelRoute, []FabricServiceChannelRoute) {
|
|
if len(routes) == 0 {
|
|
return FabricServiceChannelRoute{}, nil
|
|
}
|
|
alternates := make([]FabricServiceChannelRoute, 0, len(routes)-1)
|
|
for _, route := range routes {
|
|
if route.Status != "authorized" {
|
|
continue
|
|
}
|
|
if route.SourceNodeID == selectedEntry && route.DestinationNodeID == selectedExit {
|
|
for _, alternate := range routes {
|
|
if alternate.RouteID != route.RouteID && alternate.Status == "authorized" {
|
|
alternates = append(alternates, alternate)
|
|
}
|
|
}
|
|
return route, alternates
|
|
}
|
|
}
|
|
primary := FabricServiceChannelRoute{}
|
|
for _, route := range routes {
|
|
if route.Status != "authorized" {
|
|
continue
|
|
}
|
|
if primary.RouteID == "" {
|
|
primary = route
|
|
continue
|
|
}
|
|
alternates = append(alternates, route)
|
|
}
|
|
return primary, alternates
|
|
}
|
|
|
|
// fabricServiceChannelRouteIntentReplacementScope holds the metadata-derived
// keys used to decide whether one route may stand in for another. Each key is
// either empty or a "<metadata key>:<value>" string.
type fabricServiceChannelRouteIntentReplacementScope struct {
	// EntryPoolKey identifies the entry pool declared in the intent's policy metadata.
	EntryPoolKey string
	// ExitPoolKey identifies the exit pool declared in the intent's policy metadata.
	ExitPoolKey string
	// ResourceKey identifies the service resource declared in the intent's policy metadata.
	ResourceKey string
}
|
|
|
|
func fabricServiceChannelRouteIntentMetadataKey(intent MeshRouteIntent, keys []string) string {
|
|
if len(intent.Policy) == 0 || !json.Valid(intent.Policy) {
|
|
return ""
|
|
}
|
|
var policy syntheticRoutePolicy
|
|
if err := json.Unmarshal(intent.Policy, &policy); err != nil {
|
|
return ""
|
|
}
|
|
for _, key := range keys {
|
|
value, ok := policy.Metadata[key]
|
|
if !ok {
|
|
continue
|
|
}
|
|
switch typed := value.(type) {
|
|
case string:
|
|
if trimmed := strings.TrimSpace(typed); trimmed != "" {
|
|
return key + ":" + trimmed
|
|
}
|
|
case fmt.Stringer:
|
|
if trimmed := strings.TrimSpace(typed.String()); trimmed != "" {
|
|
return key + ":" + trimmed
|
|
}
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
func fabricServiceChannelRouteIntentReplacementScopes(intents []MeshRouteIntent) map[string]fabricServiceChannelRouteIntentReplacementScope {
|
|
out := map[string]fabricServiceChannelRouteIntentReplacementScope{}
|
|
for _, intent := range intents {
|
|
if routeID := strings.TrimSpace(intent.ID); routeID != "" {
|
|
out[routeID] = fabricServiceChannelRouteIntentReplacementScope{
|
|
EntryPoolKey: fabricServiceChannelRouteIntentMetadataKey(intent, []string{"entry_pool_id", "service_entry_pool_id", "fabric_entry_pool_id"}),
|
|
ExitPoolKey: fabricServiceChannelRouteIntentMetadataKey(intent, []string{"exit_pool_id", "service_exit_pool_id", "fabric_exit_pool_id"}),
|
|
ResourceKey: fabricServiceChannelRouteIntentMetadataKey(intent, []string{"service_resource_id", "resource_id", "fabric_service_resource_id"}),
|
|
}
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func fabricServiceChannelRoutesShareReplacementScope(fencedRoute, candidateRoute SyntheticMeshRouteConfig, scopes map[string]fabricServiceChannelRouteIntentReplacementScope) bool {
|
|
if fencedRoute.SourceNodeID == candidateRoute.SourceNodeID && fencedRoute.DestinationNodeID == candidateRoute.DestinationNodeID {
|
|
return true
|
|
}
|
|
fencedScope := scopes[fencedRoute.RouteID]
|
|
candidateScope := scopes[candidateRoute.RouteID]
|
|
sameResource := strings.TrimSpace(fencedScope.ResourceKey) != "" && fencedScope.ResourceKey == strings.TrimSpace(candidateScope.ResourceKey)
|
|
if fencedRoute.SourceNodeID == candidateRoute.SourceNodeID {
|
|
return sameResource || (strings.TrimSpace(fencedScope.ExitPoolKey) != "" && fencedScope.ExitPoolKey == strings.TrimSpace(candidateScope.ExitPoolKey))
|
|
}
|
|
if fencedRoute.DestinationNodeID == candidateRoute.DestinationNodeID {
|
|
return sameResource || (strings.TrimSpace(fencedScope.EntryPoolKey) != "" && fencedScope.EntryPoolKey == strings.TrimSpace(candidateScope.EntryPoolKey))
|
|
}
|
|
if sameResource &&
|
|
strings.TrimSpace(fencedScope.EntryPoolKey) != "" &&
|
|
fencedScope.EntryPoolKey == strings.TrimSpace(candidateScope.EntryPoolKey) &&
|
|
strings.TrimSpace(fencedScope.ExitPoolKey) != "" &&
|
|
fencedScope.ExitPoolKey == strings.TrimSpace(candidateScope.ExitPoolKey) {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func fabricServiceRoutesFencedForSelectedPair(routes []FabricServiceChannelRoute, selectedEntry, selectedExit string) bool {
|
|
for _, route := range routes {
|
|
if route.SourceNodeID == selectedEntry &&
|
|
route.DestinationNodeID == selectedExit &&
|
|
route.Status == "fenced_by_service_channel_feedback" {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func fabricServiceRoutesFencedForPool(routes []FabricServiceChannelRoute) bool {
|
|
for _, route := range routes {
|
|
if route.Status == "fenced_by_service_channel_feedback" {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func defaultFabricServiceQoS(serviceClass string) string {
|
|
switch serviceClass {
|
|
case FabricServiceClassVPNPackets:
|
|
return `{"priority":"bulk","interactive":false,"bulk_limit_mbps":0}`
|
|
case FabricServiceClassRemoteWorkspace:
|
|
return `{"priority":"interactive","interactive":true,"bulk_limit_mbps":0}`
|
|
case FabricServiceClassVideo:
|
|
return `{"priority":"interactive","interactive":true,"adaptive":true}`
|
|
default:
|
|
return `{"priority":"normal","interactive":false,"bulk_limit_mbps":0}`
|
|
}
|
|
}
|
|
|
|
func fabricServiceChannelHTTPIngress(serviceClass string) FabricServiceChannelHTTPIngress {
|
|
ingress := FabricServiceChannelHTTPIngress{
|
|
Type: "entry_direct_http_v1",
|
|
TokenHeader: "X-RAP-Service-Channel-Token",
|
|
ServiceClassHeader: "X-RAP-Service-Class",
|
|
ChannelClassHeader: "X-RAP-Channel-Class",
|
|
SupportedMethods: []string{"POST", "GET", "WEBSOCKET"},
|
|
}
|
|
switch serviceClass {
|
|
case FabricServiceClassRemoteWorkspace:
|
|
ingress.PathTemplate = "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/remote-workspaces/{resource_id}/streams/{channel_class}"
|
|
ingress.WebSocketPathTemplate = "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/remote-workspaces/{resource_id}/streams/ws"
|
|
ingress.PacketBatchFormat = "application/vnd.rap.remote-workspace-frame-batch.v1"
|
|
case FabricServiceClassVideo:
|
|
ingress.PathTemplate = "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/video-sessions/{resource_id}/streams/{channel_class}"
|
|
ingress.WebSocketPathTemplate = "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/video-sessions/{resource_id}/streams/ws"
|
|
ingress.PacketBatchFormat = "application/vnd.rap.video-frame-batch.v1"
|
|
case FabricServiceClassFileTransfer:
|
|
ingress.PathTemplate = "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/file-transfers/{resource_id}/chunks"
|
|
ingress.WebSocketPathTemplate = "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/file-transfers/{resource_id}/chunks/ws"
|
|
ingress.PacketBatchFormat = "application/vnd.rap.file-transfer-chunk-batch.v1"
|
|
default:
|
|
ingress.PathTemplate = "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/vpn-connections/{resource_id}/packets"
|
|
ingress.WebSocketPathTemplate = "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/vpn-connections/{resource_id}/packets/ws"
|
|
ingress.PacketBatchFormat = "application/vnd.rap.vpn-packet-batch.v1"
|
|
}
|
|
return ingress
|
|
}
|
|
|
|
func fabricServiceChannelDataPlaneContract(serviceClass string, poolPolicy FabricServiceChannelPoolPolicy, fallback FabricServiceChannelFallback) FabricServiceChannelDataPlaneContract {
|
|
backendRelayPolicy := "disabled"
|
|
if poolPolicy.BackendFallbackAllowed || fallback.Allowed || fallback.BackendRelay {
|
|
backendRelayPolicy = "degraded_fallback_only"
|
|
}
|
|
entryFailover := firstNonEmptyString(poolPolicy.EntryFailover, "automatic")
|
|
exitFailover := firstNonEmptyString(poolPolicy.ExitFailover, "automatic")
|
|
routeRebuild := firstNonEmptyString(poolPolicy.RouteRebuild, "automatic")
|
|
mode := "fabric_primary"
|
|
if fallback.Active {
|
|
mode = "degraded_backend_fallback"
|
|
}
|
|
return FabricServiceChannelDataPlaneContract{
|
|
SchemaVersion: "rap.fabric_service_channel_data_plane.v1",
|
|
Mode: mode,
|
|
ControlPlaneTransport: "backend_api",
|
|
WorkingDataTransport: "fabric_service_channel",
|
|
SteadyStateTransport: "fabric_route",
|
|
BackendRelayPolicy: backendRelayPolicy,
|
|
ProductionForwardingRequired: true,
|
|
ServiceNeutral: true,
|
|
ProtocolAgnostic: true,
|
|
LogicalFlowMode: "multi_flow_isolated",
|
|
RequiredFlowIsolationClasses: fabricServiceChannelFlowIsolationClasses(serviceClass),
|
|
RouteSelectionStrategy: firstNonEmptyString(poolPolicy.SelectionStrategy, "fastest_healthy"),
|
|
EntryFailoverMode: entryFailover,
|
|
ExitFailoverMode: exitFailover,
|
|
RouteRebuildMode: routeRebuild,
|
|
FailureDetectionSource: "route_quality_feedback_and_runtime_heartbeats",
|
|
DegradedFallbackVisibility: "explicit_access_telemetry_and_rebuild_health",
|
|
StableContractForServiceClass: serviceClass,
|
|
}
|
|
}
|
|
|
|
func fabricServiceChannelFlowIsolationClasses(serviceClass string) []string {
|
|
switch serviceClass {
|
|
case FabricServiceClassVPNPackets:
|
|
return []string{FabricChannelControl, FabricChannelInteractive, FabricChannelReliable, FabricChannelBulk, FabricChannelDroppable, "vpn_packet"}
|
|
case FabricServiceClassRemoteWorkspace:
|
|
return []string{FabricChannelControl, FabricChannelInteractive, FabricChannelReliable, FabricChannelBulk, FabricChannelDroppable}
|
|
case FabricServiceClassVideo:
|
|
return []string{FabricChannelControl, FabricChannelInteractive, FabricChannelDroppable}
|
|
case FabricServiceClassFileTransfer:
|
|
return []string{FabricChannelControl, FabricChannelReliable, FabricChannelBulk}
|
|
default:
|
|
return []string{FabricChannelControl, FabricChannelReliable}
|
|
}
|
|
}
|
|
|
|
// defaultFabricServiceFailover returns the default failover document as a JSON
// string: automatic route rebuild, automatic exit failover, sticky sessions.
func defaultFabricServiceFailover() string {
	const failover = `{"route_rebuild":"automatic","exit_failover":"automatic","sticky_session":true}`
	return failover
}
|
|
|
|
func (s *Service) GetNodeSyntheticMeshConfig(ctx context.Context, input GetNodeSyntheticMeshConfigInput) (NodeSyntheticMeshConfig, error) {
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.NodeID = strings.TrimSpace(input.NodeID)
|
|
if input.ClusterID == "" || input.NodeID == "" {
|
|
return NodeSyntheticMeshConfig{}, ErrInvalidPayload
|
|
}
|
|
cfg := NodeSyntheticMeshConfig{
|
|
Enabled: false,
|
|
SchemaVersion: "c17z18.synthetic.v1",
|
|
ClusterID: input.ClusterID,
|
|
LocalNodeID: input.NodeID,
|
|
AuthorityRequired: true,
|
|
ConfigVersion: "disabled",
|
|
PeerDirectoryVersion: "disabled",
|
|
PolicyVersion: "disabled",
|
|
PeerEndpoints: map[string]string{},
|
|
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{},
|
|
PeerDirectory: []PeerDirectoryEntry{},
|
|
RecoverySeeds: []PeerRecoverySeed{},
|
|
RendezvousLeases: []PeerRendezvousLease{},
|
|
Routes: []SyntheticMeshRouteConfig{},
|
|
ProductionForwarding: false,
|
|
}
|
|
listenerConfig, err := s.nodeMeshListenerConfig(ctx, input)
|
|
if err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
cfg.MeshListener = listenerConfig
|
|
if listenerConfig != nil && listenerConfig.ProductionForwarding {
|
|
cfg.ProductionForwarding = true
|
|
}
|
|
flags, err := s.store.GetEffectiveNodeTestingFlags(ctx, input.ClusterID, input.NodeID)
|
|
if err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
if !flags.Enabled || !flags.SyntheticLinksEnabled {
|
|
return s.signSyntheticMeshConfig(ctx, cfg)
|
|
}
|
|
intents, err := s.store.ListRouteIntents(ctx, input.ClusterID)
|
|
if err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
cfg.Enabled = true
|
|
cfg.ConfigVersion = "c17z18-" + s.now().UTC().Format("20060102T150405Z")
|
|
cfg.PeerDirectoryVersion = cfg.ConfigVersion
|
|
cfg.PolicyVersion = cfg.ConfigVersion
|
|
if cfg.MeshListener != nil && cfg.MeshListener.ConfigVersion == "" {
|
|
cfg.MeshListener.ConfigVersion = cfg.ConfigVersion
|
|
}
|
|
meshLinks, err := s.store.ListMeshLinks(ctx, input.ClusterID)
|
|
if err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
relayPolicy := newRendezvousRelayPolicy(input.NodeID, meshLinks, s.now())
|
|
recoveryPolicy := s.fabricServiceChannelRecoveryPolicy(ctx, input.ClusterID)
|
|
cluster, err := s.store.GetCluster(ctx, input.ClusterID)
|
|
if err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
adaptivePolicy := fabricServiceChannelAdaptivePolicyFromCluster(cluster)
|
|
cfg.ServiceChannelAdaptivePolicy = &adaptivePolicy
|
|
routeProvenance := fabricServiceChannelRouteProvenanceFromIntents(intents)
|
|
serviceChannelFeedbackItems, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, ListFabricServiceChannelRouteFeedbackInput{
|
|
ClusterID: input.ClusterID,
|
|
ReporterNodeID: input.NodeID,
|
|
Now: s.now(),
|
|
})
|
|
if err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
cfg.ServiceChannelFeedback = serviceChannelRouteFeedbackReportWithPolicyAndProvenance(serviceChannelFeedbackItems, s.now(), recoveryPolicy, routeProvenance)
|
|
serviceChannelFeedback := fabricServiceChannelRouteFeedbackFromObservationsWithProvenance(serviceChannelFeedbackItems, s.now(), recoveryPolicy, routeProvenance)
|
|
cfg.ServiceChannelRemediationCommands, err = s.fabricServiceChannelRemediationCommandsForNode(ctx, input.ClusterID, input.NodeID, serviceChannelFeedback, s.now())
|
|
if err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
if err := s.recordFabricServiceChannelRemediationRebuildIntents(ctx, input.ClusterID, input.NodeID, cfg.ServiceChannelRemediationCommands, s.now()); err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
remediationRoutePathDecisions, err := s.resolveFabricServiceChannelRemediationRebuildIntents(ctx, input, cfg.ServiceChannelRemediationCommands, intents, serviceChannelFeedback, cfg.ConfigVersion, s.now())
|
|
if err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
serviceChannelExpiredFeedbackItems, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, ListFabricServiceChannelRouteFeedbackInput{
|
|
ClusterID: input.ClusterID,
|
|
ReporterNodeID: input.NodeID,
|
|
IncludeExpired: true,
|
|
Now: s.now(),
|
|
})
|
|
if err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
mergeFabricServiceChannelRouteFeedback(serviceChannelFeedback, fabricServiceChannelManualRetryFeedbackFromObservationsWithProvenance(serviceChannelExpiredFeedbackItems, s.now(), recoveryPolicy, routeProvenance))
|
|
localPerspective, err := s.localEndpointPerspective(ctx, input.ClusterID, input.NodeID)
|
|
if err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
peerDirectory := map[string]*PeerDirectoryEntry{}
|
|
recoverySeeds := map[string]PeerRecoverySeed{}
|
|
rendezvousLeases := map[string]PeerRendezvousLease{}
|
|
routePathDecisions := append([]RoutePathDecision{}, remediationRoutePathDecisions...)
|
|
for _, intent := range intents {
|
|
route, peers, candidates, seeds, policyLeases, ok := s.syntheticRouteFromIntent(input, intent)
|
|
if !ok {
|
|
continue
|
|
}
|
|
if feedback, ok := serviceChannelFeedback[route.RouteID]; ok && feedback.Fenced {
|
|
replacementDecision := s.serviceChannelRouteReplacementDecision(input, route, intents, serviceChannelFeedback, cfg.ConfigVersion)
|
|
routePathDecisions = append(routePathDecisions, replacementDecision)
|
|
continue
|
|
}
|
|
reportedPeers, reportedCandidates, err := s.reportedEndpointConfig(ctx, input.ClusterID, input.NodeID, route.Hops, localPerspective)
|
|
if err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
feedback, err := s.rendezvousRelayFeedback(ctx, input.ClusterID, route.Hops, s.now())
|
|
if err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
relayPolicy.addFeedback(feedback)
|
|
replacementHints, err := s.rendezvousRelayReplacementHints(ctx, input.ClusterID, route.Hops, s.now())
|
|
if err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
relayPolicy.addReplacementHints(replacementHints)
|
|
relayPolicy.addFeedback(replacementHintFeedback(replacementHints, s.now()))
|
|
relayPolicy.addFeedback(rendezvousRelayRouteHealthFeedback(input.NodeID, route, meshLinks, s.now()))
|
|
for nodeID, endpoint := range reportedPeers {
|
|
peers[nodeID] = endpoint
|
|
}
|
|
for nodeID, items := range reportedCandidates {
|
|
candidates[nodeID] = append(candidates[nodeID], items...)
|
|
}
|
|
routeLeases := scopedRendezvousLeases(policyLeases, route, input.NodeID, relayPolicy, s.now())
|
|
routeLeases = append(routeLeases, derivedRendezvousLeases(route, peers, candidates, input.NodeID, relayPolicy, s.now())...)
|
|
cfg.Routes = append(cfg.Routes, route)
|
|
routePathDecisions = append(routePathDecisions, routePathDecisionForRoute(route, input.NodeID, routeLeases, relayPolicy, cfg.ConfigVersion, serviceChannelFeedback[route.RouteID]))
|
|
mergePeerDirectoryRoute(peerDirectory, route, input.NodeID)
|
|
for nodeID, endpoint := range peers {
|
|
if strings.TrimSpace(nodeID) != "" && strings.TrimSpace(endpoint) != "" {
|
|
cfg.PeerEndpoints[nodeID] = endpoint
|
|
peerDirectoryEntry(peerDirectory, nodeID).EndpointCount++
|
|
}
|
|
}
|
|
for nodeID, nodeCandidates := range candidates {
|
|
if strings.TrimSpace(nodeID) == "" || len(nodeCandidates) == 0 {
|
|
continue
|
|
}
|
|
cfg.PeerEndpointCandidates[nodeID] = append(cfg.PeerEndpointCandidates[nodeID], nodeCandidates...)
|
|
mergePeerDirectoryCandidates(peerDirectory, nodeID, nodeCandidates)
|
|
}
|
|
mergeRecoverySeeds(recoverySeeds, seeds)
|
|
mergeRendezvousLeases(rendezvousLeases, routeLeases)
|
|
}
|
|
if err := s.addCoreMeshBootstrapPeers(ctx, input, &cfg, peerDirectory, recoverySeeds, rendezvousLeases, localPerspective); err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
cfg.RecoverySeeds = sortedRecoverySeeds(recoverySeeds, maxScopedRecoverySeeds)
|
|
cfg.RendezvousLeases = sortedRendezvousLeases(rendezvousLeases, maxScopedRendezvousLeases)
|
|
cfg.RendezvousRelayPolicy = relayPolicy.report()
|
|
cfg.RoutePathDecisions = routePathDecisionReportWithRecoveryPolicy(cfg.ConfigVersion, routePathDecisions, recoveryPolicy)
|
|
_ = s.recordFabricServiceChannelRouteRebuildAttempts(ctx, input, cfg.RoutePathDecisions, cfg.ServiceChannelFeedback)
|
|
markPeerDirectoryRecoverySeeds(peerDirectory, cfg.RecoverySeeds)
|
|
markPeerDirectoryRendezvousLeases(peerDirectory, cfg.RendezvousLeases, input.NodeID)
|
|
cfg.PeerDirectory = sortedPeerDirectory(peerDirectory)
|
|
return s.signSyntheticMeshConfig(ctx, cfg)
|
|
}
|
|
|
|
func (s *Service) recordFabricServiceChannelRouteRebuildAttempts(ctx context.Context, input GetNodeSyntheticMeshConfigInput, report *RoutePathDecisionReport, feedbackReport *FabricServiceChannelRouteFeedbackReport) error {
|
|
if report == nil || len(report.Decisions) == 0 {
|
|
return nil
|
|
}
|
|
feedbackByRoute := map[string]FabricServiceChannelRouteFeedbackObservation{}
|
|
if feedbackReport != nil {
|
|
for _, item := range feedbackReport.Observations {
|
|
if strings.TrimSpace(item.RouteID) != "" {
|
|
feedbackByRoute[item.RouteID] = item
|
|
}
|
|
}
|
|
}
|
|
for _, decision := range report.Decisions {
|
|
if strings.TrimSpace(decision.RebuildRequestID) == "" {
|
|
continue
|
|
}
|
|
feedback := feedbackByRoute[decision.RouteID]
|
|
serviceClass := firstNonEmptyString(feedback.ServiceClass, FabricServiceClassVPNPackets)
|
|
outcome := "degraded_fallback"
|
|
if strings.TrimSpace(decision.ReplacementRouteID) != "" {
|
|
outcome = "replacement_selected"
|
|
} else if decision.DecisionSource == "service_channel_feedback_no_alternate" {
|
|
outcome = "no_alternate"
|
|
}
|
|
payload := mustJSONRaw(map[string]any{
|
|
"schema_version": "c18z98.route_rebuild_attempt_correlation.v1",
|
|
"decision_id": decision.DecisionID,
|
|
"score_reasons": decision.ScoreReasons,
|
|
"path_score": decision.PathScore,
|
|
"local_role": decision.LocalRole,
|
|
"previous_hop_id": decision.PreviousHopID,
|
|
"next_hop_id": decision.NextHopID,
|
|
"control_plane_only": decision.ControlPlaneOnly,
|
|
"production_forwarding": decision.ProductionForwarding,
|
|
"decision_expires_at": decision.ExpiresAt.UTC().Format(time.RFC3339Nano),
|
|
"feedback_observation_id": decision.FeedbackObservationID,
|
|
"feedback_source": decision.FeedbackSource,
|
|
"feedback_observed_at": formatOptionalTime(decision.FeedbackObservedAt),
|
|
"feedback_expires_at": formatOptionalTime(decision.FeedbackExpiresAt),
|
|
"feedback_channel_id": decision.FeedbackChannelID,
|
|
"feedback_resource_id": decision.FeedbackResourceID,
|
|
"feedback_violation_status": decision.FeedbackViolationStatus,
|
|
"feedback_violation_reason": decision.FeedbackViolationReason,
|
|
})
|
|
_, err := s.store.RecordFabricServiceChannelRouteRebuildAttempt(ctx, RecordFabricServiceChannelRouteRebuildAttemptInput{
|
|
ClusterID: input.ClusterID,
|
|
ReporterNodeID: input.NodeID,
|
|
ServiceClass: serviceClass,
|
|
RouteID: decision.RouteID,
|
|
ReplacementRouteID: decision.ReplacementRouteID,
|
|
RebuildRequestID: decision.RebuildRequestID,
|
|
RebuildStatus: decision.RebuildStatus,
|
|
RebuildReason: decision.RebuildReason,
|
|
RebuildAttempt: decision.RebuildAttempt,
|
|
DecisionSource: decision.DecisionSource,
|
|
Outcome: outcome,
|
|
Generation: decision.Generation,
|
|
PolicyFingerprint: feedback.EffectivePolicyFingerprint,
|
|
ObservedPolicyFingerprint: feedback.ObservedPolicyFingerprint,
|
|
ObservedRouteGeneration: feedback.ObservedRouteGeneration,
|
|
EffectiveRouteGeneration: feedback.EffectiveRouteGeneration,
|
|
FeedbackStatus: feedback.FeedbackStatus,
|
|
FeedbackObservationID: decision.FeedbackObservationID,
|
|
FeedbackSource: decision.FeedbackSource,
|
|
FeedbackObservedAt: decision.FeedbackObservedAt,
|
|
FeedbackExpiresAt: decision.FeedbackExpiresAt,
|
|
FeedbackChannelID: decision.FeedbackChannelID,
|
|
FeedbackResourceID: decision.FeedbackResourceID,
|
|
FeedbackViolationStatus: decision.FeedbackViolationStatus,
|
|
FeedbackViolationReason: decision.FeedbackViolationReason,
|
|
FeedbackScoreAdjustment: feedback.ScoreAdjustment,
|
|
FeedbackEffectiveScoreAdjustment: feedback.EffectiveScoreAdjustment,
|
|
FeedbackReasons: append([]string{}, feedback.Reasons...),
|
|
LastError: feedback.LastError,
|
|
ConsecutiveFailures: feedback.ConsecutiveFailures,
|
|
StallCount: feedback.StallCount,
|
|
LastSendDurationMs: feedback.LastSendDurationMs,
|
|
OldHops: append([]string{}, decision.OriginalHops...),
|
|
ReplacementHops: append([]string{}, decision.EffectiveHops...),
|
|
Payload: payload,
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s *Service) autoWarmFabricServiceChannelRouteRebuildAttemptSnapshot(ctx context.Context, clusterID string, attempt FabricServiceChannelRouteRebuildAttempt, now time.Time) (bool, error) {
|
|
if fabricServiceChannelRouteRebuildHasCorrelationSnapshot(attempt) {
|
|
return false, nil
|
|
}
|
|
nodeID := strings.TrimSpace(attempt.ReporterNodeID)
|
|
if nodeID == "" {
|
|
return false, ErrInvalidPayload
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
heartbeats, err := s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, 120)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
attempt = enrichFabricServiceChannelRouteRebuildAttempt(attempt, heartbeats, now)
|
|
if !attempt.NodeTransitionMatched && !attempt.NodeRouteGenerationMatched && attempt.PostRebuildSelectedRouteID == "" && attempt.PostRebuildSendPackets == 0 && attempt.PostRebuildSendFlowPackets == 0 {
|
|
return false, nil
|
|
}
|
|
attempt.CorrelationSnapshotAt = &now
|
|
if err := s.store.UpdateFabricServiceChannelRouteRebuildCorrelationSnapshot(ctx, fabricServiceChannelRouteRebuildCorrelationSnapshotInput(attempt, now)); err != nil {
|
|
return false, err
|
|
}
|
|
return true, nil
|
|
}
|
|
|
|
// formatOptionalTime renders value as an RFC 3339 timestamp with nanosecond
// precision in UTC. A nil pointer or a zero time yields the empty string.
func formatOptionalTime(value *time.Time) string {
	if value != nil && !value.IsZero() {
		return value.UTC().Format(time.RFC3339Nano)
	}
	return ""
}
|
|
|
|
func (s *Service) autoWarmFabricServiceChannelRouteRebuildSnapshotsAfterHeartbeat(ctx context.Context, heartbeat NodeHeartbeat) error {
|
|
clusterID := strings.TrimSpace(heartbeat.ClusterID)
|
|
nodeID := strings.TrimSpace(heartbeat.NodeID)
|
|
if clusterID == "" || nodeID == "" {
|
|
return nil
|
|
}
|
|
now := heartbeat.ObservedAt
|
|
if now.IsZero() {
|
|
now = s.now()
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
attempts, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, ListFabricServiceChannelRouteRebuildAttemptsInput{
|
|
ClusterID: clusterID,
|
|
ReporterNodeID: nodeID,
|
|
Limit: 5,
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
warmedCount := 0
|
|
freshCount := 0
|
|
errorCount := 0
|
|
warmedAttemptIDs := []string{}
|
|
warmedRouteIDs := []string{}
|
|
warmedRebuildRequestIDs := []string{}
|
|
warmedGenerations := []string{}
|
|
for _, attempt := range attempts {
|
|
if fabricServiceChannelRouteRebuildHasCorrelationSnapshot(attempt) {
|
|
freshCount++
|
|
continue
|
|
}
|
|
warmed, err := s.autoWarmFabricServiceChannelRouteRebuildAttemptSnapshot(ctx, clusterID, attempt, now)
|
|
if err != nil {
|
|
errorCount++
|
|
continue
|
|
}
|
|
if warmed {
|
|
warmedCount++
|
|
warmedAttemptIDs = append(warmedAttemptIDs, attempt.ID)
|
|
warmedRouteIDs = append(warmedRouteIDs, attempt.RouteID)
|
|
warmedRebuildRequestIDs = append(warmedRebuildRequestIDs, attempt.RebuildRequestID)
|
|
warmedGenerations = append(warmedGenerations, attempt.Generation)
|
|
} else {
|
|
freshCount++
|
|
}
|
|
}
|
|
if warmedCount == 0 && errorCount == 0 {
|
|
return nil
|
|
}
|
|
targetID := nodeID
|
|
return s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &clusterID,
|
|
EventType: "fabric.service_channel_rebuild_snapshot.auto_warmup",
|
|
TargetType: "fabric_service_channel_route_rebuild_snapshot",
|
|
TargetID: &targetID,
|
|
Payload: mustJSONRaw(map[string]any{
|
|
"schema_version": "c18z45.rebuild_snapshot_auto_warmup.v1",
|
|
"trigger": "node_heartbeat",
|
|
"reporter_node_id": nodeID,
|
|
"heartbeat_id": heartbeat.ID,
|
|
"scanned_count": len(attempts),
|
|
"warmed_count": warmedCount,
|
|
"already_fresh_count": freshCount,
|
|
"error_count": errorCount,
|
|
"warmed_attempt_ids": warmedAttemptIDs,
|
|
"warmed_route_ids": warmedRouteIDs,
|
|
"warmed_rebuild_ids": warmedRebuildRequestIDs,
|
|
"warmed_generations": warmedGenerations,
|
|
}),
|
|
CreatedAt: now.UTC(),
|
|
})
|
|
}
|
|
|
|
func (s *Service) nodeMeshListenerConfig(ctx context.Context, input GetNodeSyntheticMeshConfigInput) (*NodeMeshListenerConfig, error) {
|
|
workloads, err := s.store.ListDesiredWorkloads(ctx, input.ClusterID, input.NodeID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
for _, workload := range workloads {
|
|
if strings.TrimSpace(workload.ServiceType) != "mesh-listener" {
|
|
continue
|
|
}
|
|
cfg, err := nodeMeshListenerConfigFromDesired(workload)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return cfg, nil
|
|
}
|
|
return nil, nil
|
|
}
|
|
|
|
func (s *Service) desiredMeshListenerEndpointConfig(ctx context.Context, clusterID, nodeID string, priority int) (string, []PeerEndpointCandidate, error) {
|
|
listener, err := s.nodeMeshListenerConfig(ctx, GetNodeSyntheticMeshConfigInput{ClusterID: clusterID, NodeID: nodeID})
|
|
if err != nil {
|
|
return "", nil, err
|
|
}
|
|
if listener == nil ||
|
|
strings.TrimSpace(listener.DesiredState) != "enabled" ||
|
|
strings.TrimSpace(listener.AdvertiseEndpoint) == "" {
|
|
return "", nil, nil
|
|
}
|
|
endpoint := strings.TrimRight(strings.TrimSpace(listener.AdvertiseEndpoint), "/")
|
|
if isUnusableLocalPeerEndpoint(endpoint) {
|
|
return "", nil, nil
|
|
}
|
|
transport := firstNonEmptyString(listener.AdvertiseTransport, "direct_http")
|
|
connectivityMode := firstNonEmptyString(listener.ConnectivityMode, "direct")
|
|
natType := firstNonEmptyString(listener.NATType, "unknown")
|
|
metadata, err := json.Marshal(map[string]any{
|
|
"source": "desired_workload.mesh-listener",
|
|
"config_version": listener.ConfigVersion,
|
|
"listen_addr": listener.ListenAddr,
|
|
})
|
|
if err != nil {
|
|
return "", nil, err
|
|
}
|
|
candidate := PeerEndpointCandidate{
|
|
EndpointID: nodeID + "-desired-mesh-listener",
|
|
NodeID: nodeID,
|
|
Transport: transport,
|
|
Address: endpoint,
|
|
Reachability: reachabilityFromConnectivityMode(connectivityMode),
|
|
NATType: natType,
|
|
ConnectivityMode: connectivityMode,
|
|
Region: listener.Region,
|
|
Priority: priority,
|
|
PolicyTags: []string{"operator-configured", "desired-mesh-listener"},
|
|
Metadata: metadata,
|
|
}
|
|
if err := validatePeerEndpointCandidates(map[string][]PeerEndpointCandidate{nodeID: []PeerEndpointCandidate{candidate}}, []string{nodeID}); err != nil {
|
|
return "", nil, err
|
|
}
|
|
return endpoint, []PeerEndpointCandidate{candidate}, nil
|
|
}
|
|
|
|
func nodeMeshListenerConfigFromDesired(workload NodeWorkloadDesiredState) (*NodeMeshListenerConfig, error) {
|
|
var raw map[string]any
|
|
if len(workload.Config) > 0 {
|
|
if err := json.Unmarshal(workload.Config, &raw); err != nil {
|
|
return nil, ErrInvalidPayload
|
|
}
|
|
}
|
|
value := func(key string) string {
|
|
if raw == nil {
|
|
return ""
|
|
}
|
|
if text, ok := raw[key].(string); ok {
|
|
return strings.TrimSpace(text)
|
|
}
|
|
return ""
|
|
}
|
|
intValue := func(key string) int {
|
|
if raw == nil {
|
|
return 0
|
|
}
|
|
switch v := raw[key].(type) {
|
|
case float64:
|
|
return int(v)
|
|
case int:
|
|
return v
|
|
}
|
|
return 0
|
|
}
|
|
boolValue := func(key string) bool {
|
|
if raw == nil {
|
|
return false
|
|
}
|
|
switch v := raw[key].(type) {
|
|
case bool:
|
|
return v
|
|
case string:
|
|
switch strings.ToLower(strings.TrimSpace(v)) {
|
|
case "1", "true", "yes", "enabled":
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
mode := strings.ToLower(value("listen_port_mode"))
|
|
if workload.DesiredState != "enabled" {
|
|
mode = "disabled"
|
|
}
|
|
if mode == "" {
|
|
mode = "manual"
|
|
}
|
|
switch mode {
|
|
case "manual", "auto", "disabled":
|
|
default:
|
|
return nil, ErrInvalidPayload
|
|
}
|
|
listenAddr := value("listen_addr")
|
|
if listenAddr == "" && mode != "disabled" {
|
|
listenAddr = ":19131"
|
|
}
|
|
start := intValue("auto_port_start")
|
|
end := intValue("auto_port_end")
|
|
if start <= 0 {
|
|
start = 19131
|
|
}
|
|
if end <= 0 {
|
|
end = 19231
|
|
}
|
|
if start > end {
|
|
return nil, ErrInvalidPayload
|
|
}
|
|
productionForwarding := boolValue("production_forwarding") || boolValue("production_forwarding_enabled")
|
|
return &NodeMeshListenerConfig{
|
|
SchemaVersion: "c17z23.mesh_listener_config.v1",
|
|
Source: "desired_workload.mesh-listener",
|
|
DesiredState: firstNonEmptyString(workload.DesiredState, "disabled"),
|
|
ListenAddr: listenAddr,
|
|
ListenPortMode: mode,
|
|
AutoPortStart: start,
|
|
AutoPortEnd: end,
|
|
AdvertiseEndpoint: strings.TrimRight(value("advertise_endpoint"), "/"),
|
|
AdvertiseTransport: value("advertise_transport"),
|
|
ConnectivityMode: value("connectivity_mode"),
|
|
NATType: value("nat_type"),
|
|
Region: value("region"),
|
|
ConfigVersion: stringPtrValue(workload.Version),
|
|
UpdatedByUserID: stringPtrValue(workload.UpdatedByUserID),
|
|
UpdatedAt: workload.UpdatedAt.UTC().Format(time.RFC3339Nano),
|
|
ControlPlaneOnly: !productionForwarding,
|
|
ProductionForwarding: productionForwarding,
|
|
}, nil
|
|
}
|
|
|
|
func (s *Service) addCoreMeshBootstrapPeers(ctx context.Context, input GetNodeSyntheticMeshConfigInput, cfg *NodeSyntheticMeshConfig, peerDirectory map[string]*PeerDirectoryEntry, recoverySeeds map[string]PeerRecoverySeed, rendezvousLeases map[string]PeerRendezvousLease, localPerspective endpointPerspective) error {
|
|
roles, err := s.store.ListNodeRoleAssignments(ctx, input.ClusterID, input.NodeID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !hasActiveNodeRole(roles, "core-mesh") {
|
|
return nil
|
|
}
|
|
nodes, err := s.store.ListClusterNodes(ctx, input.ClusterID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
sort.SliceStable(nodes, func(i, j int) bool {
|
|
if nodes[i].HealthStatus != nodes[j].HealthStatus {
|
|
return nodes[i].HealthStatus == "healthy"
|
|
}
|
|
iSeen := nodeLastSeen(nodes[i])
|
|
jSeen := nodeLastSeen(nodes[j])
|
|
if !iSeen.Equal(jSeen) {
|
|
return iSeen.After(jSeen)
|
|
}
|
|
return nodes[i].CreatedAt.Before(nodes[j].CreatedAt)
|
|
})
|
|
added := 0
|
|
for _, node := range nodes {
|
|
if node.ID == input.NodeID ||
|
|
node.ID == "" ||
|
|
node.MembershipStatus != "active" ||
|
|
node.RegistrationStatus != NodeRegistrationActive ||
|
|
node.HealthStatus != "healthy" {
|
|
continue
|
|
}
|
|
desiredEndpoint, desiredCandidates, err := s.desiredMeshListenerEndpointConfig(ctx, input.ClusterID, node.ID, added)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if added >= defaultCoreMeshBootstrapPeerTarget && !hasDirectUsableEndpointCandidate(desiredCandidates) {
|
|
continue
|
|
}
|
|
heartbeats, err := s.store.ListNodeHeartbeats(ctx, input.ClusterID, node.ID, 1)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if len(heartbeats) == 0 && desiredEndpoint == "" && len(desiredCandidates) == 0 {
|
|
continue
|
|
}
|
|
endpoint := desiredEndpoint
|
|
candidates := append([]PeerEndpointCandidate{}, desiredCandidates...)
|
|
if len(heartbeats) > 0 {
|
|
reportedEndpoint, reportedCandidates, ok := endpointReportFromHeartbeat(heartbeats[0])
|
|
if ok {
|
|
if endpoint == "" {
|
|
endpoint = reportedEndpoint
|
|
}
|
|
candidates = append(candidates, reportedCandidates...)
|
|
}
|
|
}
|
|
endpoint, candidates = scopeEndpointReportForLocal(localPerspective, endpoint, candidates)
|
|
if endpoint != "" {
|
|
cfg.PeerEndpoints[node.ID] = endpoint
|
|
peerDirectoryEntry(peerDirectory, node.ID).EndpointCount++
|
|
}
|
|
if len(candidates) > 0 {
|
|
cfg.PeerEndpointCandidates[node.ID] = append(cfg.PeerEndpointCandidates[node.ID], candidates...)
|
|
mergePeerDirectoryCandidates(peerDirectory, node.ID, candidates)
|
|
if lease, ok := controlPlaneBootstrapRendezvousLease(input.ClusterID, node.ID, candidates, localPerspective, s.now()); ok {
|
|
mergeRendezvousLeases(rendezvousLeases, []PeerRendezvousLease{lease})
|
|
}
|
|
}
|
|
seed := recoverySeedFromEndpointReport(node.ID, endpoint, candidates, added)
|
|
if seed.NodeID != "" && !endpointCandidateRequiresRendezvous(PeerEndpointCandidate{
|
|
Address: seed.Endpoint,
|
|
Transport: seed.Transport,
|
|
ConnectivityMode: seed.ConnectivityMode,
|
|
Reachability: reachabilityFromConnectivityMode(seed.ConnectivityMode),
|
|
}) {
|
|
mergeRecoverySeeds(recoverySeeds, []PeerRecoverySeed{seed})
|
|
}
|
|
added++
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func hasDirectUsableEndpointCandidate(candidates []PeerEndpointCandidate) bool {
|
|
for _, candidate := range candidates {
|
|
if strings.TrimSpace(candidate.Address) != "" &&
|
|
!endpointCandidatePrivateForOffsite(candidate) &&
|
|
!endpointCandidateRequiresRendezvous(candidate) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (s *Service) signSyntheticMeshConfig(ctx context.Context, cfg NodeSyntheticMeshConfig) (NodeSyntheticMeshConfig, error) {
|
|
authorityKey, err := s.ensureClusterAuthority(ctx, cfg.ClusterID, nil)
|
|
if err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
cfg.AuthorityRequired = true
|
|
cfg.ClusterAuthority = authorityDescriptor(authorityKey)
|
|
unsigned := cfg
|
|
unsigned.AuthorityPayload = nil
|
|
unsigned.AuthoritySignature = nil
|
|
rawConfig, err := json.Marshal(unsigned)
|
|
if err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
configHash, err := clusterauth.HashRaw(rawConfig)
|
|
if err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
issuedAt := s.now().UTC()
|
|
payload := clusterMeshConfigAuthorityPayload{
|
|
SchemaVersion: clusterMeshConfigAuthoritySchema,
|
|
ClusterID: cfg.ClusterID,
|
|
LocalNodeID: cfg.LocalNodeID,
|
|
ConfigVersion: cfg.ConfigVersion,
|
|
ConfigSHA256: configHash,
|
|
IssuedAt: issuedAt,
|
|
ExpiresAt: issuedAt.Add(5 * time.Minute),
|
|
ControlPlaneOnly: !cfg.ProductionForwarding,
|
|
ProductionForwarding: cfg.ProductionForwarding,
|
|
}
|
|
rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, issuedAt)
|
|
if err != nil {
|
|
return NodeSyntheticMeshConfig{}, err
|
|
}
|
|
cfg.AuthorityPayload = rawPayload
|
|
cfg.AuthoritySignature = &signature
|
|
return cfg, nil
|
|
}
|
|
|
|
func (s *Service) RecordNodeTelemetry(ctx context.Context, input RecordNodeTelemetryInput) (NodeTelemetryObservation, error) {
|
|
if input.ClusterID == "" || input.NodeID == "" {
|
|
return NodeTelemetryObservation{}, ErrInvalidPayload
|
|
}
|
|
input.Payload = defaultJSON(input.Payload, `{}`)
|
|
if !json.Valid(input.Payload) {
|
|
return NodeTelemetryObservation{}, errors.New("telemetry payload must be valid json")
|
|
}
|
|
if input.ObservedAt.IsZero() {
|
|
input.ObservedAt = s.now()
|
|
}
|
|
return s.store.RecordNodeTelemetry(ctx, input)
|
|
}
|
|
|
|
func (s *Service) ListNodeTelemetry(ctx context.Context, actorUserID, clusterID, nodeID string, limit int) ([]NodeTelemetryObservation, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListNodeTelemetry(ctx, clusterID, nodeID, limit)
|
|
}
|
|
|
|
func (s *Service) SetDesiredWorkload(ctx context.Context, input SetDesiredWorkloadInput) (NodeWorkloadDesiredState, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return NodeWorkloadDesiredState{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return NodeWorkloadDesiredState{}, err
|
|
}
|
|
input.ServiceType = strings.TrimSpace(input.ServiceType)
|
|
if input.ClusterID == "" || input.NodeID == "" || input.ServiceType == "" {
|
|
return NodeWorkloadDesiredState{}, ErrInvalidPayload
|
|
}
|
|
if input.DesiredState == "" {
|
|
input.DesiredState = "disabled"
|
|
}
|
|
if input.RuntimeMode == "" {
|
|
input.RuntimeMode = "container"
|
|
}
|
|
input.Config = defaultJSON(input.Config, `{}`)
|
|
input.Environment = defaultJSON(input.Environment, `{}`)
|
|
if !json.Valid(input.Config) || !json.Valid(input.Environment) {
|
|
return NodeWorkloadDesiredState{}, errors.New("config and environment must be valid json")
|
|
}
|
|
item, err := s.store.SetDesiredWorkload(ctx, input)
|
|
if err != nil {
|
|
return NodeWorkloadDesiredState{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "node_workload.desired_state_set",
|
|
TargetType: "node",
|
|
TargetID: &input.NodeID,
|
|
Payload: json.RawMessage(`{"supervision_runtime":"stub_c5"}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) ListDesiredWorkloads(ctx context.Context, actorUserID, clusterID, nodeID string) ([]NodeWorkloadDesiredState, error) {
|
|
actorUserID = strings.TrimSpace(actorUserID)
|
|
if actorUserID != "" {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
if clusterID == "" || nodeID == "" {
|
|
return nil, ErrInvalidPayload
|
|
}
|
|
return s.store.ListDesiredWorkloads(ctx, clusterID, nodeID)
|
|
}
|
|
|
|
func (s *Service) ReportWorkloadStatus(ctx context.Context, input ReportWorkloadStatusInput) (NodeWorkloadStatus, error) {
|
|
input.ServiceType = strings.TrimSpace(input.ServiceType)
|
|
if input.ClusterID == "" || input.NodeID == "" || input.ServiceType == "" {
|
|
return NodeWorkloadStatus{}, ErrInvalidPayload
|
|
}
|
|
if input.ReportedState == "" {
|
|
input.ReportedState = "unknown"
|
|
}
|
|
if input.RuntimeMode == "" {
|
|
input.RuntimeMode = "container"
|
|
}
|
|
input.StatusPayload = defaultJSON(input.StatusPayload, `{}`)
|
|
if !json.Valid(input.StatusPayload) {
|
|
return NodeWorkloadStatus{}, errors.New("status_payload must be valid json")
|
|
}
|
|
return s.store.ReportWorkloadStatus(ctx, input)
|
|
}
|
|
|
|
func (s *Service) ListLatestWorkloadStatuses(ctx context.Context, actorUserID, clusterID, nodeID string) ([]NodeWorkloadStatus, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListLatestWorkloadStatuses(ctx, clusterID, nodeID)
|
|
}
|
|
|
|
func (s *Service) ReportMeshLink(ctx context.Context, input ReportMeshLinkInput) (MeshLinkObservation, error) {
|
|
if input.ClusterID == "" || input.SourceNodeID == "" || input.TargetNodeID == "" {
|
|
return MeshLinkObservation{}, ErrInvalidPayload
|
|
}
|
|
if input.LinkStatus == "" {
|
|
input.LinkStatus = "unknown"
|
|
}
|
|
input.Metadata = defaultJSON(input.Metadata, `{}`)
|
|
if !json.Valid(input.Metadata) {
|
|
return MeshLinkObservation{}, errors.New("metadata must be valid json")
|
|
}
|
|
return s.store.ReportMeshLink(ctx, input)
|
|
}
|
|
|
|
func (s *Service) ListMeshLinks(ctx context.Context, actorUserID, clusterID string) ([]MeshLinkObservation, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListMeshLinks(ctx, clusterID)
|
|
}
|
|
|
|
func (s *Service) CreateRouteIntent(ctx context.Context, input CreateRouteIntentInput) (MeshRouteIntent, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return MeshRouteIntent{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return MeshRouteIntent{}, err
|
|
}
|
|
if input.ClusterID == "" || input.ServiceClass == "" {
|
|
return MeshRouteIntent{}, ErrInvalidPayload
|
|
}
|
|
if input.Priority == 0 {
|
|
input.Priority = 100
|
|
}
|
|
input.SourceSelector = defaultJSON(input.SourceSelector, `{}`)
|
|
input.DestinationSelector = defaultJSON(input.DestinationSelector, `{}`)
|
|
input.Policy = defaultJSON(input.Policy, `{}`)
|
|
if !json.Valid(input.SourceSelector) || !json.Valid(input.DestinationSelector) || !json.Valid(input.Policy) {
|
|
return MeshRouteIntent{}, errors.New("source_selector, destination_selector, and policy must be valid json")
|
|
}
|
|
item, err := s.store.CreateRouteIntent(ctx, input)
|
|
if err != nil {
|
|
return MeshRouteIntent{}, err
|
|
}
|
|
item = routeIntentWithLifecycle(item, s.now())
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "mesh.route_intent.created",
|
|
TargetType: "mesh_route_intent",
|
|
TargetID: &item.ID,
|
|
Payload: json.RawMessage(`{"traffic_forwarding_enabled":false}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) ListRouteIntents(ctx context.Context, actorUserID, clusterID string) ([]MeshRouteIntent, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
items, err := s.store.ListRouteIntents(ctx, clusterID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return routeIntentsWithLifecycle(items, s.now()), nil
|
|
}
|
|
|
|
func (s *Service) ExpireRouteIntent(ctx context.Context, input RouteIntentLifecycleInput) (MeshRouteIntent, error) {
|
|
input.ActorUserID = strings.TrimSpace(input.ActorUserID)
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.RouteIntentID = strings.TrimSpace(input.RouteIntentID)
|
|
input.Reason = strings.TrimSpace(input.Reason)
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return MeshRouteIntent{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return MeshRouteIntent{}, err
|
|
}
|
|
if input.ClusterID == "" || input.RouteIntentID == "" {
|
|
return MeshRouteIntent{}, ErrInvalidPayload
|
|
}
|
|
if input.Reason == "" {
|
|
input.Reason = "operator expired route intent"
|
|
}
|
|
expiresAt := s.now().UTC()
|
|
item, err := s.store.ExpireRouteIntent(ctx, input, expiresAt)
|
|
if err != nil {
|
|
return MeshRouteIntent{}, err
|
|
}
|
|
item = routeIntentWithLifecycle(item, s.now())
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "mesh.route_intent.expired",
|
|
TargetType: "mesh_route_intent",
|
|
TargetID: &item.ID,
|
|
Payload: mustJSONRaw(map[string]any{"reason": input.Reason, "expires_at": expiresAt.Format(time.RFC3339Nano)}),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) DisableRouteIntent(ctx context.Context, input RouteIntentLifecycleInput) (MeshRouteIntent, error) {
|
|
input.ActorUserID = strings.TrimSpace(input.ActorUserID)
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.RouteIntentID = strings.TrimSpace(input.RouteIntentID)
|
|
input.Reason = strings.TrimSpace(input.Reason)
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return MeshRouteIntent{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return MeshRouteIntent{}, err
|
|
}
|
|
if input.ClusterID == "" || input.RouteIntentID == "" {
|
|
return MeshRouteIntent{}, ErrInvalidPayload
|
|
}
|
|
if input.Reason == "" {
|
|
input.Reason = "operator disabled route intent"
|
|
}
|
|
item, err := s.store.DisableRouteIntent(ctx, input)
|
|
if err != nil {
|
|
return MeshRouteIntent{}, err
|
|
}
|
|
item = routeIntentWithLifecycle(item, s.now())
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "mesh.route_intent.disabled",
|
|
TargetType: "mesh_route_intent",
|
|
TargetID: &item.ID,
|
|
Payload: mustJSONRaw(map[string]any{"reason": input.Reason}),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func routeIntentsWithLifecycle(items []MeshRouteIntent, now time.Time) []MeshRouteIntent {
|
|
out := make([]MeshRouteIntent, 0, len(items))
|
|
for _, item := range items {
|
|
out = append(out, routeIntentWithLifecycle(item, now))
|
|
}
|
|
return out
|
|
}
|
|
|
|
func routeIntentWithLifecycle(item MeshRouteIntent, now time.Time) MeshRouteIntent {
|
|
item.LifecycleStatus = strings.TrimSpace(item.Status)
|
|
var policy syntheticRoutePolicy
|
|
if err := json.Unmarshal(item.Policy, &policy); err == nil && policy.ExpiresAt != nil {
|
|
expiresAt := policy.ExpiresAt.UTC()
|
|
item.PolicyExpiresAt = &expiresAt
|
|
if !expiresAt.After(now.UTC()) {
|
|
item.IsExpired = true
|
|
}
|
|
}
|
|
switch {
|
|
case item.Status == "disabled":
|
|
item.LifecycleStatus = "disabled"
|
|
case item.IsExpired:
|
|
item.LifecycleStatus = "expired"
|
|
case item.LifecycleStatus == "":
|
|
item.LifecycleStatus = "active"
|
|
}
|
|
return item
|
|
}
|
|
|
|
func (s *Service) ListQoSPolicies(ctx context.Context, actorUserID, clusterID string) ([]MeshQoSPolicy, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListQoSPolicies(ctx, clusterID)
|
|
}
|
|
|
|
func (s *Service) ListFabricEntryPoints(ctx context.Context, actorUserID, clusterID string) ([]FabricEntryPoint, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListFabricEntryPoints(ctx, clusterID)
|
|
}
|
|
|
|
func (s *Service) CreateFabricEntryPoint(ctx context.Context, input CreateFabricEntryPointInput) (FabricEntryPoint, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return FabricEntryPoint{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return FabricEntryPoint{}, err
|
|
}
|
|
input.Name = strings.TrimSpace(input.Name)
|
|
input.Status = strings.TrimSpace(input.Status)
|
|
input.EndpointType = strings.TrimSpace(input.EndpointType)
|
|
if input.Status == "" {
|
|
input.Status = "active"
|
|
}
|
|
if input.EndpointType == "" {
|
|
input.EndpointType = "client_access"
|
|
}
|
|
if input.ClusterID == "" || input.Name == "" || !isFabricEndpointStatus(input.Status) || !isFabricEntryPointType(input.EndpointType) {
|
|
return FabricEntryPoint{}, ErrInvalidPayload
|
|
}
|
|
if input.PublicEndpoint != nil {
|
|
trimmed := strings.TrimSpace(*input.PublicEndpoint)
|
|
if trimmed == "" {
|
|
input.PublicEndpoint = nil
|
|
} else {
|
|
input.PublicEndpoint = &trimmed
|
|
}
|
|
}
|
|
input.Policy = defaultJSON(input.Policy, `{}`)
|
|
input.Metadata = defaultJSON(input.Metadata, `{}`)
|
|
if !json.Valid(input.Policy) || !json.Valid(input.Metadata) {
|
|
return FabricEntryPoint{}, errors.New("entry point policy and metadata must be valid json")
|
|
}
|
|
item, err := s.store.CreateFabricEntryPoint(ctx, input)
|
|
if err != nil {
|
|
return FabricEntryPoint{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "fabric.entry_point.created",
|
|
TargetType: "fabric_entry_point",
|
|
TargetID: &item.ID,
|
|
Payload: json.RawMessage(`{"runtime_routing_enabled":false}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) SetFabricEntryPointNode(ctx context.Context, input SetFabricEntryPointNodeInput) (FabricEntryPointNode, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return FabricEntryPointNode{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return FabricEntryPointNode{}, err
|
|
}
|
|
input.Status = strings.TrimSpace(input.Status)
|
|
if input.Status == "" {
|
|
input.Status = "active"
|
|
}
|
|
if input.Priority <= 0 {
|
|
input.Priority = 100
|
|
}
|
|
if input.ClusterID == "" || input.EntryPointID == "" || input.NodeID == "" || !isFabricEndpointStatus(input.Status) {
|
|
return FabricEntryPointNode{}, ErrInvalidPayload
|
|
}
|
|
input.Metadata = defaultJSON(input.Metadata, `{}`)
|
|
if !json.Valid(input.Metadata) {
|
|
return FabricEntryPointNode{}, errors.New("entry point node metadata must be valid json")
|
|
}
|
|
return s.store.SetFabricEntryPointNode(ctx, input)
|
|
}
|
|
|
|
func (s *Service) ListFabricEntryPointNodes(ctx context.Context, actorUserID, clusterID, entryPointID string) ([]FabricEntryPointNode, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
if clusterID == "" || entryPointID == "" {
|
|
return nil, ErrInvalidPayload
|
|
}
|
|
return s.store.ListFabricEntryPointNodes(ctx, clusterID, entryPointID)
|
|
}
|
|
|
|
func (s *Service) ListFabricEgressPools(ctx context.Context, actorUserID, clusterID string) ([]FabricEgressPool, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListFabricEgressPools(ctx, clusterID)
|
|
}
|
|
|
|
func (s *Service) CreateFabricEgressPool(ctx context.Context, input CreateFabricEgressPoolInput) (FabricEgressPool, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return FabricEgressPool{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return FabricEgressPool{}, err
|
|
}
|
|
input.Name = strings.TrimSpace(input.Name)
|
|
input.Status = strings.TrimSpace(input.Status)
|
|
if input.Status == "" {
|
|
input.Status = "active"
|
|
}
|
|
if input.ClusterID == "" || input.Name == "" || !isFabricEndpointStatus(input.Status) {
|
|
return FabricEgressPool{}, ErrInvalidPayload
|
|
}
|
|
if input.Description != nil {
|
|
trimmed := strings.TrimSpace(*input.Description)
|
|
if trimmed == "" {
|
|
input.Description = nil
|
|
} else {
|
|
input.Description = &trimmed
|
|
}
|
|
}
|
|
input.RouteScope = defaultJSON(input.RouteScope, `{}`)
|
|
input.Policy = defaultJSON(input.Policy, `{}`)
|
|
input.Metadata = defaultJSON(input.Metadata, `{}`)
|
|
if !json.Valid(input.RouteScope) || !json.Valid(input.Policy) || !json.Valid(input.Metadata) {
|
|
return FabricEgressPool{}, errors.New("egress pool route_scope, policy, and metadata must be valid json")
|
|
}
|
|
item, err := s.store.CreateFabricEgressPool(ctx, input)
|
|
if err != nil {
|
|
return FabricEgressPool{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "fabric.egress_pool.created",
|
|
TargetType: "fabric_egress_pool",
|
|
TargetID: &item.ID,
|
|
Payload: json.RawMessage(`{"runtime_routing_enabled":false}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) SetFabricEgressPoolNode(ctx context.Context, input SetFabricEgressPoolNodeInput) (FabricEgressPoolNode, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return FabricEgressPoolNode{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return FabricEgressPoolNode{}, err
|
|
}
|
|
input.Status = strings.TrimSpace(input.Status)
|
|
if input.Status == "" {
|
|
input.Status = "active"
|
|
}
|
|
if input.Priority <= 0 {
|
|
input.Priority = 100
|
|
}
|
|
if input.ClusterID == "" || input.EgressPoolID == "" || input.NodeID == "" || !isFabricEndpointStatus(input.Status) {
|
|
return FabricEgressPoolNode{}, ErrInvalidPayload
|
|
}
|
|
input.Metadata = defaultJSON(input.Metadata, `{}`)
|
|
if !json.Valid(input.Metadata) {
|
|
return FabricEgressPoolNode{}, errors.New("egress pool node metadata must be valid json")
|
|
}
|
|
return s.store.SetFabricEgressPoolNode(ctx, input)
|
|
}
|
|
|
|
func (s *Service) ListFabricEgressPoolNodes(ctx context.Context, actorUserID, clusterID, egressPoolID string) ([]FabricEgressPoolNode, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
if clusterID == "" || egressPoolID == "" {
|
|
return nil, ErrInvalidPayload
|
|
}
|
|
return s.store.ListFabricEgressPoolNodes(ctx, clusterID, egressPoolID)
|
|
}
|
|
|
|
func (s *Service) GetClusterAuthorityState(ctx context.Context, actorUserID, clusterID string) (ClusterAuthorityState, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return ClusterAuthorityState{}, err
|
|
}
|
|
return s.store.GetClusterAuthorityState(ctx, clusterID)
|
|
}
|
|
|
|
func (s *Service) UpdateClusterAuthorityState(ctx context.Context, input UpdateClusterAuthorityInput) (ClusterAuthorityState, error) {
|
|
role, err := s.store.GetPlatformRole(ctx, strings.TrimSpace(input.ActorUserID))
|
|
if err != nil {
|
|
return ClusterAuthorityState{}, err
|
|
}
|
|
if !isPlatformAdminRole(role) {
|
|
return ClusterAuthorityState{}, ErrAccessDenied
|
|
}
|
|
if input.MutationMode == "recovery_override" && role != PlatformRoleRecoveryAdmin {
|
|
return ClusterAuthorityState{}, ErrAccessDenied
|
|
}
|
|
if input.AuthorityState == "" {
|
|
input.AuthorityState = "authoritative"
|
|
}
|
|
if input.MutationMode == "" {
|
|
input.MutationMode = "normal"
|
|
}
|
|
item, err := s.store.UpdateClusterAuthorityState(ctx, input)
|
|
if err != nil {
|
|
return ClusterAuthorityState{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "cluster_authority.updated",
|
|
TargetType: "cluster",
|
|
TargetID: &input.ClusterID,
|
|
Payload: json.RawMessage(`{"split_brain_guard":true}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) ListClusterAdminSummaries(ctx context.Context, actorUserID string) ([]ClusterAdminSummary, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListClusterAdminSummaries(ctx)
|
|
}
|
|
|
|
func (s *Service) CreateVPNConnection(ctx context.Context, input CreateVPNConnectionInput) (VPNConnection, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return VPNConnection{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return VPNConnection{}, err
|
|
}
|
|
input.Name = strings.TrimSpace(input.Name)
|
|
input.ProtocolFamily = strings.TrimSpace(input.ProtocolFamily)
|
|
if input.ProtocolFamily == "" {
|
|
input.ProtocolFamily = "generic"
|
|
}
|
|
input.Mode = strings.TrimSpace(input.Mode)
|
|
if input.Mode == "" {
|
|
input.Mode = VPNConnectionModeSingleActive
|
|
}
|
|
input.DesiredState = strings.TrimSpace(input.DesiredState)
|
|
if input.DesiredState == "" {
|
|
input.DesiredState = VPNConnectionDesiredDisabled
|
|
}
|
|
if input.ClusterID == "" || input.OrganizationID == "" || input.Name == "" {
|
|
return VPNConnection{}, ErrInvalidPayload
|
|
}
|
|
if input.Mode != VPNConnectionModeSingleActive {
|
|
return VPNConnection{}, errors.New("vpn connection mode must be single_active")
|
|
}
|
|
if !isAllowedVPNDesiredState(input.DesiredState) {
|
|
return VPNConnection{}, errors.New("vpn connection desired_state must be enabled or disabled")
|
|
}
|
|
input.TargetEndpoint = defaultJSON(input.TargetEndpoint, `{}`)
|
|
input.AllowedNodePolicy = defaultJSON(input.AllowedNodePolicy, `{"mode":"explicit","node_ids":[]}`)
|
|
input.RoutingUsage = defaultJSON(input.RoutingUsage, `[]`)
|
|
input.RoutePolicy = defaultJSON(input.RoutePolicy, `{}`)
|
|
input.QoSPolicy = defaultJSON(input.QoSPolicy, `{}`)
|
|
input.PlacementPolicy = defaultJSON(input.PlacementPolicy, `{}`)
|
|
input.Metadata = defaultJSON(input.Metadata, `{}`)
|
|
if !json.Valid(input.TargetEndpoint) ||
|
|
!json.Valid(input.AllowedNodePolicy) ||
|
|
!json.Valid(input.RoutingUsage) ||
|
|
!json.Valid(input.RoutePolicy) ||
|
|
!json.Valid(input.QoSPolicy) ||
|
|
!json.Valid(input.PlacementPolicy) ||
|
|
!json.Valid(input.Metadata) {
|
|
return VPNConnection{}, errors.New("vpn connection json fields must be valid json")
|
|
}
|
|
item, err := s.store.CreateVPNConnection(ctx, input)
|
|
if err != nil {
|
|
return VPNConnection{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "vpn_connection.created",
|
|
TargetType: "vpn_connection",
|
|
TargetID: &item.ID,
|
|
Payload: json.RawMessage(`{"runtime_created":false}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) ListVPNConnections(ctx context.Context, actorUserID, clusterID string) ([]VPNConnection, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListVPNConnections(ctx, clusterID)
|
|
}
|
|
|
|
func (s *Service) GetVPNConnection(ctx context.Context, actorUserID, clusterID, vpnConnectionID string) (VPNConnection, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return VPNConnection{}, err
|
|
}
|
|
item, err := s.store.GetVPNConnection(ctx, clusterID, vpnConnectionID)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return VPNConnection{}, ErrInvalidVPNConnection
|
|
}
|
|
return item, err
|
|
}
|
|
|
|
func (s *Service) UpdateVPNConnectionDesiredState(ctx context.Context, input UpdateVPNConnectionDesiredStateInput) (VPNConnection, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return VPNConnection{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return VPNConnection{}, err
|
|
}
|
|
input.DesiredState = strings.TrimSpace(input.DesiredState)
|
|
if !isAllowedVPNDesiredState(input.DesiredState) {
|
|
return VPNConnection{}, errors.New("vpn connection desired_state must be enabled or disabled")
|
|
}
|
|
item, err := s.store.UpdateVPNConnectionDesiredState(ctx, input)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return VPNConnection{}, ErrInvalidVPNConnection
|
|
}
|
|
if err != nil {
|
|
return VPNConnection{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "vpn_connection.desired_state_changed",
|
|
TargetType: "vpn_connection",
|
|
TargetID: &input.VPNConnectionID,
|
|
Payload: json.RawMessage(`{"runtime_executed":false}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) UpsertVPNConnectionRoutePolicy(ctx context.Context, input UpsertVPNConnectionRoutePolicyInput) (VPNConnectionRoutePolicy, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return VPNConnectionRoutePolicy{}, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return VPNConnectionRoutePolicy{}, err
|
|
}
|
|
input.RouteType = strings.TrimSpace(input.RouteType)
|
|
input.Destination = strings.TrimSpace(input.Destination)
|
|
input.Action = strings.TrimSpace(input.Action)
|
|
input.Status = strings.TrimSpace(input.Status)
|
|
if input.Action == "" {
|
|
input.Action = "allow"
|
|
}
|
|
if input.Status == "" {
|
|
input.Status = "active"
|
|
}
|
|
if input.Priority == 0 {
|
|
input.Priority = 100
|
|
}
|
|
if input.ClusterID == "" || input.VPNConnectionID == "" || input.RouteType == "" || input.Destination == "" {
|
|
return VPNConnectionRoutePolicy{}, ErrInvalidPayload
|
|
}
|
|
if !isAllowedVPNRouteType(input.RouteType) || !isAllowedVPNRouteAction(input.Action) || !isAllowedVPNPolicyStatus(input.Status) {
|
|
return VPNConnectionRoutePolicy{}, ErrInvalidPayload
|
|
}
|
|
input.Policy = defaultJSON(input.Policy, `{}`)
|
|
if !json.Valid(input.Policy) {
|
|
return VPNConnectionRoutePolicy{}, errors.New("vpn route policy json must be valid json")
|
|
}
|
|
item, err := s.store.UpsertVPNConnectionRoutePolicy(ctx, input)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return VPNConnectionRoutePolicy{}, ErrInvalidVPNConnection
|
|
}
|
|
if err != nil {
|
|
return VPNConnectionRoutePolicy{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "vpn_connection.route_policy_changed",
|
|
TargetType: "vpn_connection",
|
|
TargetID: &input.VPNConnectionID,
|
|
Payload: json.RawMessage(`{"routing_runtime_changed":false}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) ListVPNConnectionRoutePolicies(ctx context.Context, actorUserID, clusterID, vpnConnectionID string) ([]VPNConnectionRoutePolicy, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListVPNConnectionRoutePolicies(ctx, clusterID, vpnConnectionID)
|
|
}
|
|
|
|
func (s *Service) SetVPNConnectionAllowedNodes(ctx context.Context, input SetVPNConnectionAllowedNodesInput) ([]VPNConnectionAllowedNode, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
|
|
return nil, err
|
|
}
|
|
input.RolePreference = strings.TrimSpace(input.RolePreference)
|
|
if input.RolePreference == "" {
|
|
input.RolePreference = "candidate"
|
|
}
|
|
if input.ClusterID == "" || input.VPNConnectionID == "" {
|
|
return nil, ErrInvalidPayload
|
|
}
|
|
if !isAllowedVPNNodePreference(input.RolePreference) {
|
|
return nil, ErrInvalidPayload
|
|
}
|
|
input.Metadata = defaultJSON(input.Metadata, `{}`)
|
|
if !json.Valid(input.Metadata) {
|
|
return nil, errors.New("allowed node metadata must be valid json")
|
|
}
|
|
nodes := make([]string, 0, len(input.NodeIDs))
|
|
seen := map[string]struct{}{}
|
|
for _, nodeID := range input.NodeIDs {
|
|
nodeID = strings.TrimSpace(nodeID)
|
|
if nodeID == "" {
|
|
continue
|
|
}
|
|
if _, ok := seen[nodeID]; ok {
|
|
continue
|
|
}
|
|
seen[nodeID] = struct{}{}
|
|
nodes = append(nodes, nodeID)
|
|
}
|
|
input.NodeIDs = nodes
|
|
items, err := s.store.SetVPNConnectionAllowedNodes(ctx, input)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return nil, ErrInvalidVPNConnection
|
|
}
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "vpn_connection.allowed_nodes_changed",
|
|
TargetType: "vpn_connection",
|
|
TargetID: &input.VPNConnectionID,
|
|
Payload: json.RawMessage(`{"node_runtime_changed":false}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return items, nil
|
|
}
|
|
|
|
func (s *Service) ListVPNConnectionAllowedNodes(ctx context.Context, actorUserID, clusterID, vpnConnectionID string) ([]VPNConnectionAllowedNode, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
return s.store.ListVPNConnectionAllowedNodes(ctx, clusterID, vpnConnectionID)
|
|
}
|
|
|
|
func (s *Service) AcquireVPNConnectionLease(ctx context.Context, input AcquireVPNConnectionLeaseInput) (VPNConnectionLease, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return VPNConnectionLease{}, err
|
|
}
|
|
if input.ClusterID == "" || input.VPNConnectionID == "" || input.OwnerNodeID == "" {
|
|
return VPNConnectionLease{}, ErrInvalidPayload
|
|
}
|
|
conn, err := s.store.GetVPNConnection(ctx, input.ClusterID, input.VPNConnectionID)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return VPNConnectionLease{}, ErrInvalidVPNConnection
|
|
}
|
|
if err != nil {
|
|
return VPNConnectionLease{}, err
|
|
}
|
|
if conn.Mode != VPNConnectionModeSingleActive || conn.DesiredState != VPNConnectionDesiredEnabled {
|
|
return VPNConnectionLease{}, errors.New("vpn connection must be enabled single_active before lease acquisition")
|
|
}
|
|
if err := s.ensureVPNLeaseOwnerEligible(ctx, input.ClusterID, input.VPNConnectionID, input.OwnerNodeID); err != nil {
|
|
return VPNConnectionLease{}, err
|
|
}
|
|
if input.TTL <= 0 {
|
|
input.TTL = 30 * time.Second
|
|
}
|
|
input.Metadata = defaultJSON(input.Metadata, `{}`)
|
|
if !json.Valid(input.Metadata) {
|
|
return VPNConnectionLease{}, errors.New("lease metadata must be valid json")
|
|
}
|
|
token, err := generateFencingToken()
|
|
if err != nil {
|
|
return VPNConnectionLease{}, err
|
|
}
|
|
item, err := s.store.AcquireVPNConnectionLease(ctx, input, s.now().Add(input.TTL), token)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return VPNConnectionLease{}, ErrInvalidVPNLease
|
|
}
|
|
if errors.Is(err, ErrVPNLeaseAlreadyActive) {
|
|
return VPNConnectionLease{}, ErrVPNLeaseAlreadyActive
|
|
}
|
|
if err != nil {
|
|
return VPNConnectionLease{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "vpn_connection.lease_acquired",
|
|
TargetType: "vpn_connection",
|
|
TargetID: &input.VPNConnectionID,
|
|
Payload: json.RawMessage(`{"vpn_runtime_started":false}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) RenewVPNConnectionLease(ctx context.Context, input RenewVPNConnectionLeaseInput) (VPNConnectionLease, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return VPNConnectionLease{}, err
|
|
}
|
|
if input.ClusterID == "" || input.VPNConnectionID == "" || input.LeaseID == "" || input.OwnerNodeID == "" || input.FencingToken == "" {
|
|
return VPNConnectionLease{}, ErrInvalidPayload
|
|
}
|
|
if input.TTL <= 0 {
|
|
input.TTL = 30 * time.Second
|
|
}
|
|
if err := s.ensureVPNLeaseOwnerEligible(ctx, input.ClusterID, input.VPNConnectionID, input.OwnerNodeID); err != nil {
|
|
return VPNConnectionLease{}, err
|
|
}
|
|
item, err := s.store.RenewVPNConnectionLease(ctx, input, s.now().Add(input.TTL))
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return VPNConnectionLease{}, ErrInvalidVPNLease
|
|
}
|
|
if err != nil {
|
|
return VPNConnectionLease{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "vpn_connection.lease_renewed",
|
|
TargetType: "vpn_connection",
|
|
TargetID: &input.VPNConnectionID,
|
|
Payload: json.RawMessage(`{"vpn_runtime_changed":false}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, err
|
|
}
|
|
|
|
func (s *Service) RenewNodeVPNAssignmentLease(ctx context.Context, input RenewNodeVPNAssignmentLeaseInput) (VPNConnectionLease, error) {
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.VPNConnectionID = strings.TrimSpace(input.VPNConnectionID)
|
|
input.LeaseID = strings.TrimSpace(input.LeaseID)
|
|
input.OwnerNodeID = strings.TrimSpace(input.OwnerNodeID)
|
|
if input.ClusterID == "" || input.VPNConnectionID == "" || input.LeaseID == "" || input.OwnerNodeID == "" {
|
|
return VPNConnectionLease{}, ErrInvalidPayload
|
|
}
|
|
if input.TTL <= 0 {
|
|
input.TTL = 2 * time.Minute
|
|
}
|
|
if err := s.ensureVPNLeaseOwnerEligible(ctx, input.ClusterID, input.VPNConnectionID, input.OwnerNodeID); err != nil {
|
|
return VPNConnectionLease{}, err
|
|
}
|
|
assignments, err := s.store.ListNodeVPNAssignments(ctx, input.ClusterID, input.OwnerNodeID)
|
|
if err != nil {
|
|
return VPNConnectionLease{}, err
|
|
}
|
|
ownsVisibleLease := false
|
|
for _, assignment := range assignments {
|
|
if assignment.VPNConnectionID == input.VPNConnectionID &&
|
|
assignment.AssignmentReason == "active_owner" &&
|
|
assignment.ActiveLease != nil &&
|
|
assignment.ActiveLease.LeaseID == input.LeaseID &&
|
|
assignment.ActiveLease.OwnerNodeID == input.OwnerNodeID {
|
|
ownsVisibleLease = true
|
|
break
|
|
}
|
|
}
|
|
if !ownsVisibleLease {
|
|
return VPNConnectionLease{}, ErrVPNLeaseOwnerNotAllowed
|
|
}
|
|
item, err := s.store.RenewNodeVPNAssignmentLease(ctx, input, s.now().Add(input.TTL))
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return VPNConnectionLease{}, ErrInvalidVPNLease
|
|
}
|
|
if err != nil {
|
|
return VPNConnectionLease{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
EventType: "vpn_connection.lease_renewed_by_node",
|
|
TargetType: "vpn_connection",
|
|
TargetID: &input.VPNConnectionID,
|
|
Payload: json.RawMessage(`{"node_agent_runtime_executed":true}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) ReleaseVPNConnectionLease(ctx context.Context, input ReleaseVPNConnectionLeaseInput) (VPNConnectionLease, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return VPNConnectionLease{}, err
|
|
}
|
|
if input.ClusterID == "" || input.VPNConnectionID == "" || input.LeaseID == "" || input.OwnerNodeID == "" || input.FencingToken == "" {
|
|
return VPNConnectionLease{}, ErrInvalidPayload
|
|
}
|
|
item, err := s.store.ReleaseVPNConnectionLease(ctx, input)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return VPNConnectionLease{}, ErrInvalidVPNLease
|
|
}
|
|
if err != nil {
|
|
return VPNConnectionLease{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "vpn_connection.lease_released",
|
|
TargetType: "vpn_connection",
|
|
TargetID: &input.VPNConnectionID,
|
|
Payload: json.RawMessage(`{"vpn_runtime_stopped":false}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) FenceVPNConnectionLease(ctx context.Context, input FenceVPNConnectionLeaseInput) (VPNConnectionLease, error) {
|
|
if err := s.ensurePlatformRecoveryAdmin(ctx, input.ActorUserID); err != nil {
|
|
return VPNConnectionLease{}, err
|
|
}
|
|
input.Reason = strings.TrimSpace(input.Reason)
|
|
if input.Reason == "" {
|
|
input.Reason = "fenced by platform recovery administrator"
|
|
}
|
|
if input.ClusterID == "" || input.VPNConnectionID == "" || input.LeaseID == "" {
|
|
return VPNConnectionLease{}, ErrInvalidPayload
|
|
}
|
|
item, err := s.store.FenceVPNConnectionLease(ctx, input)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return VPNConnectionLease{}, ErrInvalidVPNLease
|
|
}
|
|
if err != nil {
|
|
return VPNConnectionLease{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "vpn_connection.owner_fenced",
|
|
TargetType: "vpn_connection",
|
|
TargetID: &input.VPNConnectionID,
|
|
Payload: json.RawMessage(`{"split_brain_guard":true}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) GetActiveVPNConnectionLease(ctx context.Context, actorUserID, clusterID, vpnConnectionID string) (VPNConnectionLease, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return VPNConnectionLease{}, err
|
|
}
|
|
item, err := s.store.GetActiveVPNConnectionLease(ctx, clusterID, vpnConnectionID)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return VPNConnectionLease{}, ErrInvalidVPNLease
|
|
}
|
|
return item, err
|
|
}
|
|
|
|
func (s *Service) ExpireStaleVPNConnectionLeases(ctx context.Context, input ExpireStaleVPNConnectionLeasesInput) ([]VPNConnectionLease, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
if input.ClusterID == "" {
|
|
return nil, ErrInvalidPayload
|
|
}
|
|
items, err := s.store.ExpireStaleVPNConnectionLeases(ctx, input.ClusterID, s.now())
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
for _, item := range items {
|
|
vpnConnectionID := item.VPNConnectionID
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
ActorUserID: &input.ActorUserID,
|
|
EventType: "vpn_connection.lease_expired",
|
|
TargetType: "vpn_connection",
|
|
TargetID: &vpnConnectionID,
|
|
Payload: json.RawMessage(`{"stale_reclamation":true,"vpn_runtime_changed":false}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
}
|
|
return items, nil
|
|
}
|
|
|
|
func (s *Service) ListNodeVPNAssignments(ctx context.Context, clusterID, nodeID string) ([]NodeVPNAssignment, error) {
|
|
clusterID = strings.TrimSpace(clusterID)
|
|
nodeID = strings.TrimSpace(nodeID)
|
|
if clusterID == "" || nodeID == "" {
|
|
return nil, ErrInvalidPayload
|
|
}
|
|
return s.store.ListNodeVPNAssignments(ctx, clusterID, nodeID)
|
|
}
|
|
|
|
func (s *Service) ReportNodeVPNAssignmentStatus(ctx context.Context, input ReportNodeVPNAssignmentStatusInput) (NodeVPNAssignmentStatus, error) {
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.NodeID = strings.TrimSpace(input.NodeID)
|
|
input.VPNConnectionID = strings.TrimSpace(input.VPNConnectionID)
|
|
input.ObservedStatus = strings.TrimSpace(input.ObservedStatus)
|
|
if input.ClusterID == "" || input.NodeID == "" || input.VPNConnectionID == "" {
|
|
return NodeVPNAssignmentStatus{}, ErrInvalidPayload
|
|
}
|
|
if input.ObservedStatus == "" {
|
|
input.ObservedStatus = VPNAssignmentStatusUnknown
|
|
}
|
|
if !isAllowedVPNAssignmentStatus(input.ObservedStatus) {
|
|
return NodeVPNAssignmentStatus{}, ErrInvalidPayload
|
|
}
|
|
input.StatusPayload = defaultJSON(input.StatusPayload, `{}`)
|
|
if !json.Valid(input.StatusPayload) {
|
|
return NodeVPNAssignmentStatus{}, errors.New("status_payload must be valid json")
|
|
}
|
|
if input.ObservedAt.IsZero() {
|
|
input.ObservedAt = s.now()
|
|
}
|
|
|
|
assignments, err := s.store.ListNodeVPNAssignments(ctx, input.ClusterID, input.NodeID)
|
|
if err != nil {
|
|
return NodeVPNAssignmentStatus{}, err
|
|
}
|
|
visible := false
|
|
for _, assignment := range assignments {
|
|
if assignment.VPNConnectionID == input.VPNConnectionID {
|
|
visible = true
|
|
break
|
|
}
|
|
}
|
|
if !visible {
|
|
return NodeVPNAssignmentStatus{}, ErrVPNLeaseOwnerNotAllowed
|
|
}
|
|
|
|
item, err := s.store.ReportNodeVPNAssignmentStatus(ctx, input)
|
|
if err != nil {
|
|
return NodeVPNAssignmentStatus{}, err
|
|
}
|
|
_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
|
|
ClusterID: &input.ClusterID,
|
|
EventType: "vpn_connection.assignment_status_reported",
|
|
TargetType: "vpn_connection",
|
|
TargetID: &input.VPNConnectionID,
|
|
Payload: json.RawMessage(`{"node_agent_runtime_executed":false}`),
|
|
CreatedAt: s.now(),
|
|
})
|
|
return item, nil
|
|
}
|
|
|
|
func (s *Service) GetVPNClientProfile(
|
|
ctx context.Context,
|
|
clusterID, organizationID, userID string,
|
|
preferredEntryNodeID ...string,
|
|
) (VPNClientProfile, error) {
|
|
clusterID = strings.TrimSpace(clusterID)
|
|
organizationID = strings.TrimSpace(organizationID)
|
|
userID = strings.TrimSpace(userID)
|
|
if clusterID == "" || organizationID == "" || userID == "" {
|
|
return VPNClientProfile{}, ErrInvalidPayload
|
|
}
|
|
preferredEntry := ""
|
|
if len(preferredEntryNodeID) > 0 {
|
|
preferredEntry = strings.TrimSpace(preferredEntryNodeID[0])
|
|
}
|
|
|
|
preferredExit := ""
|
|
if len(preferredEntryNodeID) > 1 {
|
|
preferredExit = strings.TrimSpace(preferredEntryNodeID[1])
|
|
}
|
|
profile, err := s.store.GetVPNClientProfile(ctx, clusterID, organizationID, userID, preferredEntry, preferredExit, s.now().UTC())
|
|
if err != nil {
|
|
return VPNClientProfile{}, err
|
|
}
|
|
if profile.ClusterID == "" {
|
|
profile.ClusterID = clusterID
|
|
}
|
|
if profile.OrganizationID == "" {
|
|
profile.OrganizationID = organizationID
|
|
}
|
|
if profile.UserID == "" {
|
|
profile.UserID = userID
|
|
}
|
|
profile = attachVPNDataplaneSessions(profile, s.now().UTC())
|
|
if err := s.ensureVPNFabricRouteIntents(ctx, clusterID, profile); err != nil {
|
|
return VPNClientProfile{}, err
|
|
}
|
|
profile = s.attachVPNFabricServiceChannelLeases(ctx, profile)
|
|
return profile, nil
|
|
}
|
|
|
|
func (s *Service) attachVPNFabricServiceChannelLeases(ctx context.Context, profile VPNClientProfile) VPNClientProfile {
|
|
for i := range profile.Connections {
|
|
connection := profile.Connections[i]
|
|
route := vpnFabricRouteFromClientConfig(connection.ClientConfig)
|
|
if route.Status != "planned" || route.SelectedEntryNodeID == "" || route.SelectedExitNodeID == "" {
|
|
continue
|
|
}
|
|
entryPool := dedupeStrings(append([]string{}, route.EntryPoolNodeIDs...))
|
|
if len(entryPool) == 0 {
|
|
entryPool = dedupeStrings(append([]string{route.SelectedEntryNodeID}, connection.EntryNodeIDs...))
|
|
}
|
|
exitPool := dedupeStrings(append([]string{}, route.ExitPoolNodeIDs...))
|
|
if len(exitPool) == 0 {
|
|
exitPool = dedupeStrings(append([]string{route.SelectedExitNodeID, connection.ExitNodeID}, connection.AllowedNodeIDs...))
|
|
}
|
|
lease, err := s.IssueFabricServiceChannelLease(ctx, IssueFabricServiceChannelLeaseInput{
|
|
ClusterID: profile.ClusterID,
|
|
OrganizationID: profile.OrganizationID,
|
|
UserID: profile.UserID,
|
|
ResourceID: connection.ID,
|
|
ServiceClass: FabricServiceClassVPNPackets,
|
|
EntryNodeIDs: entryPool,
|
|
ExitNodeIDs: exitPool,
|
|
PreferredEntryNodeID: route.SelectedEntryNodeID,
|
|
PreferredExitNodeID: route.SelectedExitNodeID,
|
|
AllowedChannels: []string{"vpn_packet", "fabric_control", FabricChannelBulk, FabricChannelControl},
|
|
TTL: time.Minute,
|
|
})
|
|
if err != nil {
|
|
profile.Connections[i].ClientConfig = attachVPNFabricServiceChannelError(connection.ClientConfig, err)
|
|
continue
|
|
}
|
|
profile.Connections[i].ClientConfig = attachVPNFabricServiceChannelLease(connection.ClientConfig, lease)
|
|
}
|
|
return profile
|
|
}
|
|
|
|
func attachVPNFabricServiceChannelLease(raw json.RawMessage, lease FabricServiceChannelLease) json.RawMessage {
|
|
var cfg map[string]any
|
|
if err := json.Unmarshal(raw, &cfg); err != nil || cfg == nil {
|
|
cfg = map[string]any{}
|
|
}
|
|
cfg["fabric_service_channel_lease"] = lease
|
|
cfg["fabric_service_channel_status"] = lease.Status
|
|
out, err := json.Marshal(cfg)
|
|
if err != nil {
|
|
return raw
|
|
}
|
|
return out
|
|
}
|
|
|
|
// attachVPNFabricServiceChannelError returns raw with
// "fabric_service_channel_status" set to "error" and
// "fabric_service_channel_error" set to err.Error().
//
// When raw does not decode to a JSON object, the result starts from an empty
// object. When re-encoding fails, raw is returned unchanged. err must be
// non-nil because its Error method is called unconditionally.
func attachVPNFabricServiceChannelError(raw json.RawMessage, err error) json.RawMessage {
	var config map[string]any
	if decodeErr := json.Unmarshal(raw, &config); decodeErr != nil || config == nil {
		config = map[string]any{}
	}

	config["fabric_service_channel_status"] = "error"
	config["fabric_service_channel_error"] = err.Error()

	encoded, encodeErr := json.Marshal(config)
	if encodeErr != nil {
		return raw
	}
	return encoded
}
|
|
|
|
func attachVPNDataplaneSessions(profile VPNClientProfile, now time.Time) VPNClientProfile {
|
|
for i := range profile.Connections {
|
|
profile.Connections[i].ClientConfig = enrichVPNDataplaneSession(profile, profile.Connections[i], now)
|
|
}
|
|
return profile
|
|
}
|
|
|
|
func enrichVPNDataplaneSession(profile VPNClientProfile, connection VPNClientConnection, now time.Time) json.RawMessage {
|
|
var cfg map[string]any
|
|
if err := json.Unmarshal(connection.ClientConfig, &cfg); err != nil || cfg == nil {
|
|
cfg = map[string]any{}
|
|
}
|
|
route := vpnFabricRouteFromClientConfig(connection.ClientConfig)
|
|
expiresAt := now.Add(time.Minute)
|
|
sessionID := uuidLikeRandom()
|
|
if sessionID == "" {
|
|
sessionID = "vpn-session-" + now.UTC().Format("20060102T150405.000000000Z")
|
|
}
|
|
entryCandidates := vpnDataplaneEntryCandidates(route, connection, cfg)
|
|
transportCandidates := vpnDataplaneTransportCandidates(route, entryCandidates)
|
|
status := "waiting_for_entry_endpoint"
|
|
if route.Status == "planned" && route.SelectedEntryNodeID != "" && route.SelectedExitNodeID != "" {
|
|
status = "ready_for_entry_listener"
|
|
}
|
|
cfg["vpn_dataplane_session"] = map[string]any{
|
|
"schema_version": "rap.vpn_dataplane_session.v1",
|
|
"session_id": sessionID,
|
|
"status": status,
|
|
"issued_at": now,
|
|
"expires_at": expiresAt,
|
|
"cluster_id": profile.ClusterID,
|
|
"organization_id": profile.OrganizationID,
|
|
"user_id": profile.UserID,
|
|
"vpn_connection_id": connection.ID,
|
|
"entry_node_id": route.SelectedEntryNodeID,
|
|
"exit_node_id": route.SelectedExitNodeID,
|
|
"preferred_transport": "fabric_packet_quic_v1",
|
|
"fallback_transport": "backend_http_packet_relay",
|
|
"packet_contract": map[string]any{
|
|
"tunnel_type": "universal_ip_packet",
|
|
"application_protocol_agnostic": true,
|
|
"all_ip_traffic": true,
|
|
"protocol_specific_routing": false,
|
|
},
|
|
"auth": map[string]any{
|
|
"type": "control_plane_issued_bearer",
|
|
"token": "rap_vpn_dps_" + sessionID,
|
|
"token_ttl_seconds": int(expiresAt.Sub(now).Seconds()),
|
|
"node_validation": "entry_node_calls_control_plane_introspection",
|
|
"introspection_path": "/api/v1/clusters/{cluster_id}/vpn/dataplane-sessions/{session_id}/introspect",
|
|
},
|
|
"entry_candidates": entryCandidates,
|
|
"transport_candidates": transportCandidates,
|
|
}
|
|
out, err := json.Marshal(cfg)
|
|
if err != nil {
|
|
return connection.ClientConfig
|
|
}
|
|
return out
|
|
}
|
|
|
|
func vpnDataplaneEntryCandidates(route vpnClientFabricRoute, connection VPNClientConnection, cfg map[string]any) []map[string]any {
|
|
concrete := vpnConcreteEntryCandidatesFromClientConfig(cfg)
|
|
ids := dedupeStrings(append([]string{route.SelectedEntryNodeID}, connection.EntryNodeIDs...))
|
|
out := make([]map[string]any, 0, len(concrete)+len(ids))
|
|
nodesWithConcrete := map[string]struct{}{}
|
|
for _, candidate := range concrete {
|
|
nodeID, _ := candidate["node_id"].(string)
|
|
if nodeID == "" {
|
|
continue
|
|
}
|
|
nodesWithConcrete[nodeID] = struct{}{}
|
|
enriched := make(map[string]any, len(candidate)+4)
|
|
for k, v := range candidate {
|
|
enriched[k] = v
|
|
}
|
|
status := "endpoint_reported"
|
|
if nodeID == route.SelectedEntryNodeID {
|
|
status = "selected_endpoint_reported"
|
|
}
|
|
reachability, _ := enriched["reachability"].(string)
|
|
if nodeID == route.SelectedEntryNodeID && strings.EqualFold(reachability, "public") {
|
|
status = "selected_endpoint_public"
|
|
}
|
|
enriched["status"] = status
|
|
enriched["endpoint_source"] = "node_latest_heartbeat.mesh_endpoint_report"
|
|
enriched["transports"] = []string{"entry_direct_http_v1", "fabric_packet_quic_v1", "fabric_packet_tcp_v1"}
|
|
out = append(out, enriched)
|
|
}
|
|
for _, nodeID := range ids {
|
|
if nodeID == "" {
|
|
continue
|
|
}
|
|
if _, ok := nodesWithConcrete[nodeID]; ok {
|
|
continue
|
|
}
|
|
status := "endpoint_pending"
|
|
if nodeID == route.SelectedEntryNodeID {
|
|
status = "selected_endpoint_pending"
|
|
}
|
|
out = append(out, map[string]any{
|
|
"node_id": nodeID,
|
|
"status": status,
|
|
"transports": []string{"fabric_packet_quic_v1", "fabric_packet_tcp_v1"},
|
|
"endpoint_source": "node_mesh_advertisement_pending",
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
// vpnConcreteEntryCandidatesFromClientConfig extracts the
// "vpn_entry_endpoint_candidates" value from cfg as a list of JSON objects.
//
// The value is converted by a JSON encode/decode round trip, so the returned
// maps are independent of cfg. It returns nil when the key is absent, when
// encoding fails, or when the value does not decode as a list of objects.
func vpnConcreteEntryCandidatesFromClientConfig(cfg map[string]any) []map[string]any {
	value, present := cfg["vpn_entry_endpoint_candidates"]
	if !present {
		return nil
	}

	encoded, encodeErr := json.Marshal(value)
	if encodeErr != nil {
		return nil
	}

	var candidates []map[string]any
	if json.Unmarshal(encoded, &candidates) != nil {
		return nil
	}
	return candidates
}
|
|
|
|
func vpnDataplaneTransportCandidates(route vpnClientFabricRoute, entryCandidates []map[string]any) []map[string]any {
|
|
candidates := []map[string]any{
|
|
{
|
|
"type": "fabric_packet_quic_v1",
|
|
"status": "contract_ready_listener_pending",
|
|
"entry_node_id": route.SelectedEntryNodeID,
|
|
"exit_node_id": route.SelectedExitNodeID,
|
|
"entry_candidates": entryCandidates,
|
|
"application_protocols": []string{"ip"},
|
|
},
|
|
}
|
|
if direct := vpnDirectHTTPEntryTransportCandidate(route, entryCandidates); direct != nil {
|
|
candidates = append(candidates, direct)
|
|
}
|
|
candidates = append(candidates, map[string]any{
|
|
"type": "backend_http_packet_relay",
|
|
"status": "active_fallback",
|
|
"description": "current safe dataplane until entry listener is available",
|
|
})
|
|
return candidates
|
|
}
|
|
|
|
func vpnDirectHTTPEntryTransportCandidate(route vpnClientFabricRoute, entryCandidates []map[string]any) map[string]any {
|
|
var selected []map[string]any
|
|
hasPublic := false
|
|
hasHTTP := false
|
|
hasLocalGatewayShortcut := false
|
|
for _, candidate := range entryCandidates {
|
|
nodeID, _ := candidate["node_id"].(string)
|
|
if route.SelectedEntryNodeID != "" && nodeID != route.SelectedEntryNodeID {
|
|
continue
|
|
}
|
|
apiBaseURL, _ := candidate["api_base_url"].(string)
|
|
address, _ := candidate["address"].(string)
|
|
if apiBaseURL == "" && (strings.HasPrefix(address, "http://") || strings.HasPrefix(address, "https://")) {
|
|
apiBaseURL = strings.TrimRight(address, "/") + "/api/v1"
|
|
candidate["api_base_url"] = apiBaseURL
|
|
}
|
|
if apiBaseURL == "" {
|
|
continue
|
|
}
|
|
hasHTTP = true
|
|
reachability, _ := candidate["reachability"].(string)
|
|
if strings.EqualFold(reachability, "public") {
|
|
hasPublic = true
|
|
}
|
|
if value, ok := candidate["local_gateway_shortcut"].(bool); ok && value {
|
|
hasLocalGatewayShortcut = true
|
|
}
|
|
selected = append(selected, candidate)
|
|
}
|
|
if len(selected) == 0 {
|
|
return nil
|
|
}
|
|
status := "reported_private_or_unverified"
|
|
if hasPublic {
|
|
status = "available"
|
|
} else if hasHTTP {
|
|
status = "http_endpoint_reported_unverified"
|
|
}
|
|
safeClientSwitch := hasPublic
|
|
if route.SelectedEntryNodeID != "" && route.SelectedEntryNodeID == route.SelectedExitNodeID {
|
|
if hasPublic && hasLocalGatewayShortcut {
|
|
status = "available_local_gateway_shortcut"
|
|
safeClientSwitch = true
|
|
} else {
|
|
status = "available_local_gateway_shortcut_pending"
|
|
safeClientSwitch = false
|
|
}
|
|
}
|
|
return map[string]any{
|
|
"type": "entry_direct_http_v1",
|
|
"status": status,
|
|
"entry_node_id": route.SelectedEntryNodeID,
|
|
"exit_node_id": route.SelectedExitNodeID,
|
|
"entry_candidates": selected,
|
|
"application_protocols": []string{"ip"},
|
|
"safe_client_switch": safeClientSwitch,
|
|
}
|
|
}
|
|
|
|
// uuidLikeRandom returns 16 random bytes rendered in the canonical
// 8-4-4-4-12 lowercase hex layout, with the version nibble forced to 4 and the
// two top bits of byte 8 forced to "10".
//
// It returns the empty string when the random source reports an error.
func uuidLikeRandom() string {
	var b [16]byte
	if _, err := rand.Read(b[:]); err != nil {
		return ""
	}
	b[6] = b[6]&0x0f | 0x40
	b[8] = b[8]&0x3f | 0x80
	digits := hex.EncodeToString(b[:])
	groups := []string{digits[:8], digits[8:12], digits[12:16], digits[16:20], digits[20:]}
	return strings.Join(groups, "-")
}
|
|
|
|
func (s *Service) ensureVPNFabricRouteIntents(ctx context.Context, clusterID string, profile VPNClientProfile) error {
|
|
intents, err := s.store.ListRouteIntents(ctx, clusterID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
existing := map[string]bool{}
|
|
for _, intent := range intents {
|
|
source, destination, ok := activeVPNPacketRouteIntent(intent, s.now())
|
|
if !ok {
|
|
continue
|
|
}
|
|
existing[source+"->"+destination] = true
|
|
}
|
|
for _, connection := range profile.Connections {
|
|
route := vpnFabricRouteFromClientConfig(connection.ClientConfig)
|
|
if route.Status != "planned" || route.SelectedEntryNodeID == "" || route.SelectedExitNodeID == "" || route.SelectedEntryNodeID == route.SelectedExitNodeID {
|
|
continue
|
|
}
|
|
pairs := [][2]string{
|
|
{route.SelectedEntryNodeID, route.SelectedExitNodeID},
|
|
{route.SelectedExitNodeID, route.SelectedEntryNodeID},
|
|
}
|
|
for _, pair := range pairs {
|
|
key := pair[0] + "->" + pair[1]
|
|
if existing[key] {
|
|
continue
|
|
}
|
|
if _, err := s.store.CreateRouteIntent(ctx, CreateRouteIntentInput{
|
|
ClusterID: clusterID,
|
|
SourceSelector: mustJSONRaw(map[string]any{"node_id": pair[0]}),
|
|
DestinationSelector: mustJSONRaw(map[string]any{"node_id": pair[1]}),
|
|
ServiceClass: "vpn_packets",
|
|
Priority: 10,
|
|
Policy: mustJSONRaw(vpnFabricRouteIntentPolicy(pair[0], pair[1], s.now().UTC().Add(30*24*time.Hour))),
|
|
}); err != nil {
|
|
return err
|
|
}
|
|
existing[key] = true
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// vpnClientFabricRoute is the "vpn_fabric_route" object carried inside a VPN
// connection's client config.
type vpnClientFabricRoute struct {
	// Status is the planning state; route intents are only ensured for
	// routes whose status is "planned".
	Status string `json:"status"`
	// SelectedEntryNodeID and SelectedExitNodeID name the chosen entry and
	// exit nodes of the route.
	SelectedEntryNodeID string `json:"selected_entry_node_id"`
	SelectedExitNodeID  string `json:"selected_exit_node_id"`
	// EntryPoolNodeIDs and ExitPoolNodeIDs carry the node ID pools reported
	// alongside the selection.
	EntryPoolNodeIDs []string `json:"entry_pool_node_ids"`
	ExitPoolNodeIDs  []string `json:"exit_pool_node_ids"`
}
|
|
|
|
func vpnFabricRouteFromClientConfig(raw json.RawMessage) vpnClientFabricRoute {
|
|
var cfg struct {
|
|
Route vpnClientFabricRoute `json:"vpn_fabric_route"`
|
|
}
|
|
if len(raw) == 0 {
|
|
return vpnClientFabricRoute{}
|
|
}
|
|
_ = json.Unmarshal(raw, &cfg)
|
|
return cfg.Route
|
|
}
|
|
|
|
func activeVPNPacketRouteIntent(intent MeshRouteIntent, now time.Time) (string, string, bool) {
|
|
if intent.Status != "active" || intent.ServiceClass != "vpn_packets" {
|
|
return "", "", false
|
|
}
|
|
var policy syntheticRoutePolicy
|
|
if err := json.Unmarshal(intent.Policy, &policy); err != nil || !containsString(policy.AllowedChannels, "vpn_packet") {
|
|
return "", "", false
|
|
}
|
|
if policy.ExpiresAt != nil && !policy.ExpiresAt.After(now.UTC()) {
|
|
return "", "", false
|
|
}
|
|
var source nodeSelector
|
|
var destination nodeSelector
|
|
_ = json.Unmarshal(intent.SourceSelector, &source)
|
|
_ = json.Unmarshal(intent.DestinationSelector, &destination)
|
|
sourceNodeID := firstNodeID(source)
|
|
destinationNodeID := firstNodeID(destination)
|
|
if sourceNodeID == "" || destinationNodeID == "" {
|
|
return "", "", false
|
|
}
|
|
return sourceNodeID, destinationNodeID, true
|
|
}
|
|
|
|
// vpnFabricRouteIntentPolicy builds the policy document stored on a VPN fabric
// route intent from sourceNodeID to destinationNodeID.
//
// The route, policy and peer-directory versions all share one value derived
// from the UTC expiry ("vpn-fabric-<YYYYMMDD>T<hhmmss>Z"), and "expires_at" is
// the same instant formatted as RFC 3339.
func vpnFabricRouteIntentPolicy(sourceNodeID, destinationNodeID string, expiresAt time.Time) map[string]any {
	expiry := expiresAt.UTC()
	version := "vpn-fabric-" + expiry.Format("20060102T150405Z")

	policy := map[string]any{
		"synthetic_enabled":       true,
		"hops":                    []string{sourceNodeID, destinationNodeID},
		"allowed_channels":        []string{"vpn_packet", "fabric_control"},
		"max_ttl":                 8,
		"max_hops":                8,
		"expires_at":              expiry.Format(time.RFC3339),
		"backend_relay_fallback":  true,
		"data_plane_preference":   "fabric_mesh",
		"route_owner":             "vpn_client_profile",
		"route_refresh_required":  true,
		"route_refresh_threshold": "24h",
	}
	for _, key := range []string{"route_version", "policy_version", "peer_directory_version"} {
		policy[key] = version
	}
	return policy
}
|
|
|
|
// mustJSONRaw encodes value as JSON. Despite the name it never panics: a value
// that cannot be encoded is replaced by the empty object "{}".
func mustJSONRaw(value any) json.RawMessage {
	if encoded, err := json.Marshal(value); err == nil {
		return encoded
	}
	return json.RawMessage(`{}`)
}
|
|
|
|
func (s *Service) ListAuditEvents(ctx context.Context, actorUserID string, input ListAuditEventsInput) ([]ClusterAuditEvent, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return nil, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
input.EventTypes = trimStringSlice(input.EventTypes)
|
|
input.TargetTypes = trimStringSlice(input.TargetTypes)
|
|
input.Correlation = strings.TrimSpace(input.Correlation)
|
|
events, err := s.store.ListAuditEvents(ctx, input)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if input.Correlation == "fabric_diagnostics" {
|
|
events = s.withFabricDiagnosticsAuditCorrelation(ctx, actorUserID, input.ClusterID, events)
|
|
}
|
|
return events, nil
|
|
}
|
|
|
|
func (s *Service) ListFabricServiceChannelRebuildInvestigationBreadcrumbs(ctx context.Context, actorUserID string, input ListFabricServiceChannelRebuildInvestigationBreadcrumbsInput) (FabricServiceChannelRebuildInvestigationBreadcrumbs, error) {
|
|
if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
|
|
return FabricServiceChannelRebuildInvestigationBreadcrumbs{}, err
|
|
}
|
|
input.ClusterID = strings.TrimSpace(input.ClusterID)
|
|
if input.ClusterID == "" {
|
|
return FabricServiceChannelRebuildInvestigationBreadcrumbs{}, ErrInvalidPayload
|
|
}
|
|
if input.Limit <= 0 || input.Limit > 100 {
|
|
input.Limit = 20
|
|
}
|
|
cluster, err := s.store.GetCluster(ctx, input.ClusterID)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return FabricServiceChannelRebuildInvestigationBreadcrumbs{}, ErrInvalidCluster
|
|
}
|
|
if err != nil {
|
|
return FabricServiceChannelRebuildInvestigationBreadcrumbs{}, err
|
|
}
|
|
windowPolicy := fabricServiceChannelBreadcrumbWindowPolicyFromCluster(cluster)
|
|
if input.CurrentWindowSeconds <= 0 {
|
|
input.CurrentWindowSeconds = windowPolicy.CurrentWindowSeconds
|
|
}
|
|
if input.HistoryWindowSeconds <= 0 {
|
|
input.HistoryWindowSeconds = windowPolicy.HistoryWindowSeconds
|
|
}
|
|
if input.HistoryWindowSeconds < input.CurrentWindowSeconds {
|
|
input.HistoryWindowSeconds = input.CurrentWindowSeconds
|
|
}
|
|
events, err := s.ListAuditEvents(ctx, actorUserID, ListAuditEventsInput{
|
|
ClusterID: input.ClusterID,
|
|
EventTypes: []string{
|
|
"fabric.service_channel_rebuild_feedback_breakdown.investigation_opened",
|
|
"fabric.service_channel_rebuild_incident.investigation_opened",
|
|
},
|
|
Correlation: "fabric_diagnostics",
|
|
Limit: input.Limit,
|
|
})
|
|
if err != nil {
|
|
return FabricServiceChannelRebuildInvestigationBreadcrumbs{}, err
|
|
}
|
|
events = withFabricDiagnosticsBreadcrumbFreshness(events, s.now(), input.CurrentWindowSeconds, input.HistoryWindowSeconds)
|
|
summary := summarizeClusterAuditEvents(events)
|
|
return FabricServiceChannelRebuildInvestigationBreadcrumbs{
|
|
ClusterID: input.ClusterID,
|
|
Events: events,
|
|
Summary: summary,
|
|
CurrentWindowSeconds: input.CurrentWindowSeconds,
|
|
HistoryWindowSeconds: input.HistoryWindowSeconds,
|
|
CurrentCount: summary.CountsByBreadcrumbStatus["current"],
|
|
StaleCount: summary.CountsByBreadcrumbStatus["stale"],
|
|
ExpiredCount: summary.CountsByBreadcrumbStatus["expired"],
|
|
}, nil
|
|
}
|
|
|
|
func withFabricDiagnosticsBreadcrumbFreshness(events []ClusterAuditEvent, now time.Time, currentWindowSeconds, historyWindowSeconds int64) []ClusterAuditEvent {
|
|
if len(events) == 0 {
|
|
return events
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
for index := range events {
|
|
if events[index].CorrelationHints == nil {
|
|
events[index].CorrelationHints = &ClusterAuditCorrelationHints{Scope: "fabric_diagnostics"}
|
|
}
|
|
ageSeconds := int64(0)
|
|
if !events[index].CreatedAt.IsZero() {
|
|
ageSeconds = int64(now.Sub(events[index].CreatedAt).Seconds())
|
|
if ageSeconds < 0 {
|
|
ageSeconds = 0
|
|
}
|
|
}
|
|
status := "current"
|
|
if ageSeconds > historyWindowSeconds {
|
|
status = "expired"
|
|
} else if ageSeconds > currentWindowSeconds {
|
|
status = "stale"
|
|
}
|
|
events[index].CorrelationHints.BreadcrumbStatus = status
|
|
events[index].CorrelationHints.BreadcrumbAgeSeconds = ageSeconds
|
|
events[index].CorrelationHints.BreadcrumbCurrentWindow = currentWindowSeconds
|
|
events[index].CorrelationHints.BreadcrumbHistoryWindow = historyWindowSeconds
|
|
}
|
|
return events
|
|
}
|
|
|
|
func (s *Service) withFabricDiagnosticsAuditCorrelation(ctx context.Context, actorUserID, clusterID string, events []ClusterAuditEvent) []ClusterAuditEvent {
|
|
if len(events) == 0 {
|
|
return events
|
|
}
|
|
health, healthErr := s.GetFabricServiceChannelRouteRebuildHealthSummary(ctx, actorUserID, GetFabricServiceChannelRouteRebuildHealthSummaryInput{
|
|
ClusterID: clusterID,
|
|
Limit: 5,
|
|
})
|
|
incidents, incidentsErr := s.ListFabricServiceChannelRouteRebuildIncidents(ctx, actorUserID, ListFabricServiceChannelRouteRebuildIncidentsInput{
|
|
ClusterID: clusterID,
|
|
Limit: 20,
|
|
})
|
|
for index := range events {
|
|
hints := ClusterAuditCorrelationHints{
|
|
Scope: "fabric_diagnostics",
|
|
CurrentDiagnosticStatus: "not_visible",
|
|
}
|
|
if healthErr == nil {
|
|
if breakdown := fabricAuditMatchingFeedbackBreakdown(events[index], health.FeedbackBreakdowns); breakdown != nil {
|
|
hints.CurrentDiagnosticStatus = "breakdown_active"
|
|
hints.FeedbackBreakdown = breakdown
|
|
hints.RecommendedAction = "open_filtered_rebuild_ledger"
|
|
}
|
|
}
|
|
if hints.FeedbackBreakdown == nil && incidentsErr == nil {
|
|
if incident := fabricAuditMatchingRebuildIncident(events[index], incidents); incident != nil {
|
|
hints.CurrentDiagnosticStatus = "incident_visible"
|
|
hints.RebuildIncident = incident
|
|
hints.RecommendedAction = "open_deep_rebuild_ledger"
|
|
}
|
|
}
|
|
events[index].CorrelationHints = &hints
|
|
}
|
|
return events
|
|
}
|
|
|
|
func fabricAuditMatchingFeedbackBreakdown(event ClusterAuditEvent, breakdowns []FabricServiceChannelRouteRebuildFeedbackHealthBreakdown) *FabricServiceChannelRouteRebuildFeedbackHealthBreakdown {
|
|
payload := jsonObject(event.Payload)
|
|
feedbackSource := jsonString(payload, "feedback_source")
|
|
feedbackChannelID := jsonString(payload, "feedback_channel_id")
|
|
feedbackViolationStatus := jsonString(payload, "feedback_violation_status")
|
|
reporterNodeID := jsonString(payload, "reporter_node_id")
|
|
routeID := jsonString(payload, "route_id")
|
|
if feedbackSource == "" && feedbackChannelID == "" && feedbackViolationStatus == "" {
|
|
return nil
|
|
}
|
|
for index := range breakdowns {
|
|
item := breakdowns[index]
|
|
if feedbackSource != "" && item.FeedbackSource != feedbackSource {
|
|
continue
|
|
}
|
|
if feedbackChannelID != "" && item.FeedbackChannelID != feedbackChannelID {
|
|
continue
|
|
}
|
|
if feedbackViolationStatus != "" && item.FeedbackViolationStatus != feedbackViolationStatus {
|
|
continue
|
|
}
|
|
if reporterNodeID != "" && !containsString(item.AffectedReporterNodeIDs, reporterNodeID) {
|
|
continue
|
|
}
|
|
if routeID != "" && !containsString(item.AffectedRouteIDs, routeID) {
|
|
continue
|
|
}
|
|
return &item
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func fabricAuditMatchingRebuildIncident(event ClusterAuditEvent, incidents []FabricServiceChannelRouteRebuildIncident) *FabricServiceChannelRouteRebuildIncident {
|
|
payload := jsonObject(event.Payload)
|
|
reporterNodeID := jsonString(payload, "reporter_node_id")
|
|
routeID := jsonString(payload, "route_id")
|
|
if routeID == "" && event.TargetType == "fabric_service_channel_route_rebuild_incident" && event.TargetID != nil {
|
|
routeID = *event.TargetID
|
|
}
|
|
serviceClass := jsonString(payload, "service_class")
|
|
generation := jsonString(payload, "generation")
|
|
guardStatus := jsonString(payload, "guard_status")
|
|
for index := range incidents {
|
|
item := incidents[index]
|
|
if reporterNodeID != "" && item.ReporterNodeID != reporterNodeID {
|
|
continue
|
|
}
|
|
if routeID != "" && item.RouteID != routeID {
|
|
continue
|
|
}
|
|
if serviceClass != "" && item.ServiceClass != serviceClass {
|
|
continue
|
|
}
|
|
if generation != "" && item.Generation != generation {
|
|
continue
|
|
}
|
|
if guardStatus != "" && item.GuardStatus != guardStatus {
|
|
continue
|
|
}
|
|
if reporterNodeID == "" && routeID == "" && serviceClass == "" && generation == "" && guardStatus == "" {
|
|
continue
|
|
}
|
|
return &item
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func summarizeClusterAuditEvents(events []ClusterAuditEvent) ClusterAuditSummary {
|
|
summary := ClusterAuditSummary{
|
|
TotalCount: len(events),
|
|
CountsByEventType: map[string]int{},
|
|
CountsByTargetType: map[string]int{},
|
|
CountsByCurrentDiagnosticStatus: map[string]int{},
|
|
CountsByFeedbackSource: map[string]int{},
|
|
CountsByFeedbackViolationStatus: map[string]int{},
|
|
CountsByBreadcrumbStatus: map[string]int{},
|
|
}
|
|
for _, event := range events {
|
|
if event.EventType != "" {
|
|
summary.CountsByEventType[event.EventType]++
|
|
}
|
|
if event.TargetType != "" {
|
|
summary.CountsByTargetType[event.TargetType]++
|
|
}
|
|
if event.CreatedAt.After(summary.LatestAt) {
|
|
summary.LatestAt = event.CreatedAt.UTC()
|
|
}
|
|
payload := jsonObject(event.Payload)
|
|
if source := jsonString(payload, "feedback_source"); source != "" {
|
|
summary.CountsByFeedbackSource[source]++
|
|
}
|
|
if status := jsonString(payload, "feedback_violation_status"); status != "" {
|
|
summary.CountsByFeedbackViolationStatus[status]++
|
|
}
|
|
if event.CorrelationHints == nil {
|
|
continue
|
|
}
|
|
if breadcrumbStatus := strings.TrimSpace(event.CorrelationHints.BreadcrumbStatus); breadcrumbStatus != "" {
|
|
summary.CountsByBreadcrumbStatus[breadcrumbStatus]++
|
|
}
|
|
status := firstNonEmptyString(event.CorrelationHints.CurrentDiagnosticStatus, "unknown")
|
|
summary.CountsByCurrentDiagnosticStatus[status]++
|
|
if status == "not_visible" {
|
|
summary.NotVisibleCount++
|
|
} else {
|
|
summary.CorrelatedCount++
|
|
}
|
|
}
|
|
return summary
|
|
}
|
|
|
|
func (s *Service) ensurePlatformAdmin(ctx context.Context, userID string) error {
|
|
userID = strings.TrimSpace(userID)
|
|
if userID == "" {
|
|
return ErrAccessDenied
|
|
}
|
|
role, err := s.store.GetPlatformRole(ctx, userID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !isPlatformAdminRole(role) {
|
|
return ErrAccessDenied
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s *Service) ensurePlatformRecoveryAdmin(ctx context.Context, userID string) error {
|
|
userID = strings.TrimSpace(userID)
|
|
if userID == "" {
|
|
return ErrAccessDenied
|
|
}
|
|
role, err := s.store.GetPlatformRole(ctx, userID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if role != PlatformRoleRecoveryAdmin {
|
|
return ErrAccessDenied
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s *Service) ensureClusterMutable(ctx context.Context, actorUserID, clusterID string) error {
|
|
role, err := s.store.GetPlatformRole(ctx, strings.TrimSpace(actorUserID))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if role == PlatformRoleRecoveryAdmin {
|
|
return nil
|
|
}
|
|
state, err := s.store.GetClusterAuthorityState(ctx, clusterID)
|
|
if err != nil {
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return nil
|
|
}
|
|
return err
|
|
}
|
|
if state.AuthorityState != "authoritative" || state.MutationMode != "normal" {
|
|
return ErrClusterReadOnly
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s *Service) ensureVPNLeaseOwnerEligible(ctx context.Context, clusterID, vpnConnectionID, ownerNodeID string) error {
|
|
eligibility, err := s.store.CheckVPNLeaseOwnerEligibility(ctx, clusterID, vpnConnectionID, ownerNodeID)
|
|
if errors.Is(err, pgx.ErrNoRows) {
|
|
return ErrInvalidVPNConnection
|
|
}
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if eligibility.MembershipStatus != "active" || eligibility.NodeRegistrationStatus != NodeRegistrationActive {
|
|
return ErrVPNLeaseOwnerNotAllowed
|
|
}
|
|
if !eligibility.AllowedByPolicy {
|
|
return ErrVPNLeaseOwnerNotAllowed
|
|
}
|
|
if !eligibility.HasAuthorizedRole {
|
|
return ErrVPNLeaseOwnerRoleRequired
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// defaultJSON returns raw unchanged unless it is empty, in which case the
// fallback text is returned as a raw JSON message. The fallback is not
// validated.
func defaultJSON(raw json.RawMessage, fallback string) json.RawMessage {
	if len(raw) > 0 {
		return raw
	}
	return json.RawMessage(fallback)
}
|
|
|
|
func isAllowedVPNDesiredState(state string) bool {
|
|
return state == VPNConnectionDesiredEnabled || state == VPNConnectionDesiredDisabled
|
|
}
|
|
|
|
// isAllowedVPNRouteType reports whether routeType is a supported VPN route
// type: "cidr", "dns_suffix", "service" or "resource" (case-sensitive).
func isAllowedVPNRouteType(routeType string) bool {
	return routeType == "cidr" ||
		routeType == "dns_suffix" ||
		routeType == "service" ||
		routeType == "resource"
}
|
|
|
|
// isAllowedVPNRouteAction reports whether action is a supported VPN route
// action: "allow" or "deny" (case-sensitive).
func isAllowedVPNRouteAction(action string) bool {
	switch action {
	case "allow", "deny":
		return true
	default:
		return false
	}
}
|
|
|
|
// isAllowedVPNPolicyStatus reports whether status is a supported VPN policy
// status: "active" or "disabled" (case-sensitive).
func isAllowedVPNPolicyStatus(status string) bool {
	switch status {
	case "active", "disabled":
		return true
	default:
		return false
	}
}
|
|
|
|
// isFabricEndpointStatus reports whether status is a supported fabric endpoint
// status: "active", "disabled" or "maintenance" (case-sensitive).
func isFabricEndpointStatus(status string) bool {
	return status == "active" || status == "disabled" || status == "maintenance"
}
|
|
|
|
// isFabricEntryPointType reports whether endpointType is a supported fabric
// entry point type: "client_access", "admin", "api" or "other"
// (case-sensitive).
func isFabricEntryPointType(endpointType string) bool {
	return endpointType == "client_access" ||
		endpointType == "admin" ||
		endpointType == "api" ||
		endpointType == "other"
}
|
|
|
|
// isAllowedVPNNodePreference reports whether preference is a supported VPN
// node preference: "candidate", "standby" or "preferred" (case-sensitive).
func isAllowedVPNNodePreference(preference string) bool {
	return preference == "candidate" || preference == "standby" || preference == "preferred"
}
|
|
|
|
func isAllowedVPNAssignmentStatus(status string) bool {
|
|
switch status {
|
|
case VPNAssignmentStatusNotStarted,
|
|
VPNAssignmentStatusAssigned,
|
|
VPNAssignmentStatusLeaseRequired,
|
|
VPNAssignmentStatusBlocked,
|
|
VPNAssignmentStatusUnknown:
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
type syntheticRoutePolicy struct {
|
|
SyntheticEnabled bool `json:"synthetic_enabled"`
|
|
PeerEndpoints map[string]string `json:"peer_endpoints"`
|
|
PeerEndpointCandidates map[string][]PeerEndpointCandidate `json:"peer_endpoint_candidates"`
|
|
RecoverySeeds []PeerRecoverySeed `json:"recovery_seeds"`
|
|
RendezvousLeases []PeerRendezvousLease `json:"rendezvous_leases"`
|
|
Hops []string `json:"hops"`
|
|
AllowedChannels []string `json:"allowed_channels"`
|
|
MaxTTL int `json:"max_ttl"`
|
|
MaxHops int `json:"max_hops"`
|
|
ExpiresAt *time.Time `json:"expires_at"`
|
|
RouteVersion string `json:"route_version"`
|
|
PolicyVersion string `json:"policy_version"`
|
|
PeerDirectoryVersion string `json:"peer_directory_version"`
|
|
Metadata map[string]any `json:"metadata"`
|
|
}
|
|
|
|
// dockerInstallProfileScope is the JSON "scope" document from which install
// profiles are derived. Despite its name it is shared by the Docker, Windows
// and Linux profile builders; each builder reads the fields relevant to it and
// applies its own defaults.
//
// The *bool fields distinguish "not set" (nil, builder default applies) from
// an explicit true/false.
type dockerInstallProfileScope struct {
	// Control plane and artifact locations.
	BackendURL            string   `json:"backend_url"`
	ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
	ArtifactEndpoints     []string `json:"artifact_endpoints"`

	// Docker image artifact description.
	DockerImageArtifactURLs      []string `json:"docker_image_artifact_urls"`
	DockerImageArtifactSHA256    string   `json:"docker_image_artifact_sha256"`
	DockerImageArtifactSizeBytes int64    `json:"docker_image_artifact_size_bytes"`

	// Node agent artifact description.
	NodeAgentArtifactURLs      []string `json:"node_agent_artifact_urls"`
	NodeAgentArtifactSHA256    string   `json:"node_agent_artifact_sha256"`
	NodeAgentArtifactSizeBytes int64    `json:"node_agent_artifact_size_bytes"`

	// Node identity.
	Roles       []string `json:"roles"`
	NodeName    string   `json:"node_name"`
	NodeGroupID string   `json:"node_group_id"`

	// Installation layout and runtime settings.
	Image         string `json:"image"`
	ContainerName string `json:"container_name"`
	StateDir      string `json:"state_dir"`
	InstallDir    string `json:"install_dir"`
	StartupMode   string `json:"startup_mode"`
	Network       string `json:"network"`
	RestartPolicy string `json:"restart_policy"`
	PullImage     *bool  `json:"pull_image"`
	Replace       *bool  `json:"replace"`

	// Optional feature switches.
	DockerVPNGatewayEnabled         *bool `json:"docker_vpn_gateway_enabled"`
	WorkloadSupervisionEnabled      *bool `json:"workload_supervision_enabled"`
	MeshSyntheticRuntimeEnabled     *bool `json:"mesh_synthetic_runtime_enabled"`
	MeshProductionForwardingEnabled *bool `json:"mesh_production_forwarding_enabled"`

	// Mesh listener and advertisement settings.
	MeshListenAddr             string          `json:"mesh_listen_addr"`
	MeshListenPortMode         string          `json:"mesh_listen_port_mode"`
	MeshListenAutoPortStart    int             `json:"mesh_listen_auto_port_start"`
	MeshListenAutoPortEnd      int             `json:"mesh_listen_auto_port_end"`
	MeshAdvertiseEndpoint      string          `json:"mesh_advertise_endpoint"`
	MeshAdvertiseEndpointsJSON json.RawMessage `json:"mesh_advertise_endpoints_json"`
	MeshAdvertiseTransport     string          `json:"mesh_advertise_transport"`
	MeshConnectivityMode       string          `json:"mesh_connectivity_mode"`
	MeshNATType                string          `json:"mesh_nat_type"`
	MeshRegion                 string          `json:"mesh_region"`

	// Agent timing and capacity settings.
	HeartbeatIntervalSeconds          int `json:"heartbeat_interval_seconds"`
	EnrollmentPollIntervalSeconds     int `json:"enrollment_poll_interval_seconds"`
	EnrollmentPollTimeoutSeconds      int `json:"enrollment_poll_timeout_seconds"`
	ProductionObservationSinkCapacity int `json:"production_observation_sink_capacity"`
}
|
|
|
|
func dockerInstallProfileFromScope(input DockerInstallProfileRequest, scopeRaw json.RawMessage) (DockerInstallProfile, error) {
|
|
var scope dockerInstallProfileScope
|
|
if len(scopeRaw) > 0 {
|
|
if !json.Valid(scopeRaw) {
|
|
return DockerInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
if err := json.Unmarshal(scopeRaw, &scope); err != nil {
|
|
return DockerInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
}
|
|
nodeName := firstNonEmptyString(strings.TrimSpace(input.NodeName), scope.NodeName)
|
|
if nodeName == "" {
|
|
nodeName = "docker-node"
|
|
}
|
|
containerName := firstNonEmptyString(scope.ContainerName, "rap-node-agent-"+safeInstallProfileSlug(nodeName))
|
|
roles := trimStringSlice(scope.Roles)
|
|
profile := DockerInstallProfile{
|
|
SchemaVersion: "rap.docker_install_profile.v1",
|
|
BackendURL: strings.TrimRight(strings.TrimSpace(scope.BackendURL), "/"),
|
|
ControlPlaneEndpoints: trimStringSlice(scope.ControlPlaneEndpoints),
|
|
ArtifactEndpoints: trimEndpointSlice(scope.ArtifactEndpoints),
|
|
Roles: roles,
|
|
NodeName: nodeName,
|
|
Image: firstNonEmptyString(scope.Image, "rap-node-agent:latest"),
|
|
ContainerName: containerName,
|
|
StateDir: firstNonEmptyString(scope.StateDir, "/var/lib/rap/nodes/"+safeInstallProfileSlug(nodeName)),
|
|
Network: firstNonEmptyString(scope.Network, "host"),
|
|
RestartPolicy: firstNonEmptyString(scope.RestartPolicy, "unless-stopped"),
|
|
PullImage: boolPtrValue(scope.PullImage, false),
|
|
Replace: boolPtrValue(scope.Replace, true),
|
|
DockerVPNGatewayEnabled: boolPtrValue(scope.DockerVPNGatewayEnabled, containsString(roles, "vpn-exit")),
|
|
WorkloadSupervisionEnabled: boolPtrValue(scope.WorkloadSupervisionEnabled, false),
|
|
MeshSyntheticRuntimeEnabled: boolPtrValue(scope.MeshSyntheticRuntimeEnabled, true),
|
|
MeshProductionForwardingEnabled: boolPtrValue(scope.MeshProductionForwardingEnabled, false),
|
|
MeshListenAddr: firstNonEmptyString(scope.MeshListenAddr, ":19131"),
|
|
MeshListenPortMode: firstNonEmptyString(strings.ToLower(strings.TrimSpace(scope.MeshListenPortMode)), "auto"),
|
|
MeshListenAutoPortStart: positiveOrDefault(scope.MeshListenAutoPortStart, 19131),
|
|
MeshListenAutoPortEnd: positiveOrDefault(scope.MeshListenAutoPortEnd, 19231),
|
|
MeshAdvertiseEndpoint: strings.TrimRight(strings.TrimSpace(scope.MeshAdvertiseEndpoint), "/"),
|
|
MeshAdvertiseEndpointsJSON: scope.MeshAdvertiseEndpointsJSON,
|
|
MeshAdvertiseTransport: strings.TrimSpace(scope.MeshAdvertiseTransport),
|
|
MeshConnectivityMode: strings.TrimSpace(scope.MeshConnectivityMode),
|
|
MeshNATType: strings.TrimSpace(scope.MeshNATType),
|
|
MeshRegion: strings.TrimSpace(scope.MeshRegion),
|
|
HeartbeatIntervalSeconds: positiveOrDefault(scope.HeartbeatIntervalSeconds, 15),
|
|
EnrollmentPollIntervalSeconds: positiveOrDefault(scope.EnrollmentPollIntervalSeconds, 5),
|
|
EnrollmentPollTimeoutSeconds: nonNegativeOrDefault(scope.EnrollmentPollTimeoutSeconds, 0),
|
|
ProductionObservationSinkCapacity: scope.ProductionObservationSinkCapacity,
|
|
}
|
|
profile.DockerImageArtifact = dockerImageArtifactFromScope(profile.Image, profile.ArtifactEndpoints, scope)
|
|
if profile.BackendURL == "" && len(profile.ControlPlaneEndpoints) > 0 {
|
|
profile.BackendURL = profile.ControlPlaneEndpoints[0]
|
|
}
|
|
if profile.BackendURL == "" {
|
|
return DockerInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
if len(profile.ArtifactEndpoints) == 0 {
|
|
if endpoint := defaultArtifactEndpointFromBackendURL(profile.BackendURL); endpoint != "" {
|
|
profile.ArtifactEndpoints = []string{endpoint}
|
|
profile.DockerImageArtifact = dockerImageArtifactFromScope(profile.Image, profile.ArtifactEndpoints, scope)
|
|
}
|
|
}
|
|
if len(profile.MeshAdvertiseEndpointsJSON) > 0 && !json.Valid(profile.MeshAdvertiseEndpointsJSON) {
|
|
return DockerInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
switch profile.MeshListenPortMode {
|
|
case "manual", "auto", "disabled":
|
|
default:
|
|
return DockerInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
if profile.MeshListenAutoPortStart > profile.MeshListenAutoPortEnd {
|
|
return DockerInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
return profile, nil
|
|
}
|
|
|
|
func windowsInstallProfileFromScope(input DockerInstallProfileRequest, scopeRaw json.RawMessage) (WindowsInstallProfile, error) {
|
|
var scope dockerInstallProfileScope
|
|
if len(scopeRaw) > 0 {
|
|
if !json.Valid(scopeRaw) {
|
|
return WindowsInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
if err := json.Unmarshal(scopeRaw, &scope); err != nil {
|
|
return WindowsInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
}
|
|
nodeName := firstNonEmptyString(strings.TrimSpace(input.NodeName), scope.NodeName)
|
|
if nodeName == "" {
|
|
nodeName = "windows-node"
|
|
}
|
|
profile := WindowsInstallProfile{
|
|
SchemaVersion: "rap.windows_install_profile.v1",
|
|
BackendURL: strings.TrimRight(strings.TrimSpace(scope.BackendURL), "/"),
|
|
ControlPlaneEndpoints: trimStringSlice(scope.ControlPlaneEndpoints),
|
|
ArtifactEndpoints: trimEndpointSlice(scope.ArtifactEndpoints),
|
|
Roles: trimStringSlice(scope.Roles),
|
|
NodeName: nodeName,
|
|
StateDir: firstNonEmptyString(scope.StateDir, `C:\ProgramData\RAP\nodes\`+safeInstallProfileSlug(nodeName)),
|
|
InstallDir: firstNonEmptyString(scope.InstallDir, `C:\Program Files\RAP\`+safeInstallProfileSlug(nodeName)),
|
|
StartupMode: firstNonEmptyString(strings.ToLower(strings.TrimSpace(scope.StartupMode)), "auto"),
|
|
WorkloadSupervisionEnabled: boolPtrValue(scope.WorkloadSupervisionEnabled, false),
|
|
MeshSyntheticRuntimeEnabled: boolPtrValue(scope.MeshSyntheticRuntimeEnabled, true),
|
|
MeshProductionForwardingEnabled: boolPtrValue(scope.MeshProductionForwardingEnabled, false),
|
|
MeshListenAddr: firstNonEmptyString(scope.MeshListenAddr, ":19131"),
|
|
MeshListenPortMode: firstNonEmptyString(strings.ToLower(strings.TrimSpace(scope.MeshListenPortMode)), "auto"),
|
|
MeshListenAutoPortStart: positiveOrDefault(scope.MeshListenAutoPortStart, 19131),
|
|
MeshListenAutoPortEnd: positiveOrDefault(scope.MeshListenAutoPortEnd, 19231),
|
|
MeshAdvertiseEndpoint: strings.TrimRight(strings.TrimSpace(scope.MeshAdvertiseEndpoint), "/"),
|
|
MeshAdvertiseEndpointsJSON: scope.MeshAdvertiseEndpointsJSON,
|
|
MeshAdvertiseTransport: strings.TrimSpace(scope.MeshAdvertiseTransport),
|
|
MeshConnectivityMode: firstNonEmptyString(strings.TrimSpace(scope.MeshConnectivityMode), "outbound_only"),
|
|
MeshNATType: firstNonEmptyString(strings.TrimSpace(scope.MeshNATType), "unknown"),
|
|
MeshRegion: firstNonEmptyString(strings.TrimSpace(scope.MeshRegion), "windows"),
|
|
HeartbeatIntervalSeconds: positiveOrDefault(scope.HeartbeatIntervalSeconds, 15),
|
|
EnrollmentPollIntervalSeconds: positiveOrDefault(scope.EnrollmentPollIntervalSeconds, 5),
|
|
EnrollmentPollTimeoutSeconds: nonNegativeOrDefault(scope.EnrollmentPollTimeoutSeconds, 0),
|
|
ProductionObservationSinkCapacity: scope.ProductionObservationSinkCapacity,
|
|
}
|
|
profile.NodeAgentArtifact = windowsNodeAgentArtifactFromScope(profile.ArtifactEndpoints, scope)
|
|
if profile.BackendURL == "" && len(profile.ControlPlaneEndpoints) > 0 {
|
|
profile.BackendURL = profile.ControlPlaneEndpoints[0]
|
|
}
|
|
if profile.BackendURL == "" {
|
|
return WindowsInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
if len(profile.ArtifactEndpoints) == 0 {
|
|
if endpoint := defaultArtifactEndpointFromBackendURL(profile.BackendURL); endpoint != "" {
|
|
profile.ArtifactEndpoints = []string{endpoint}
|
|
profile.NodeAgentArtifact = windowsNodeAgentArtifactFromScope(profile.ArtifactEndpoints, scope)
|
|
}
|
|
}
|
|
if len(profile.MeshAdvertiseEndpointsJSON) > 0 && !json.Valid(profile.MeshAdvertiseEndpointsJSON) {
|
|
return WindowsInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
switch profile.MeshListenPortMode {
|
|
case "manual", "auto", "disabled":
|
|
default:
|
|
return WindowsInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
switch profile.StartupMode {
|
|
case "auto", "system-task", "user-task", "none":
|
|
default:
|
|
return WindowsInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
if profile.MeshListenAutoPortStart > profile.MeshListenAutoPortEnd {
|
|
return WindowsInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
return profile, nil
|
|
}
|
|
|
|
func linuxInstallProfileFromScope(input DockerInstallProfileRequest, scopeRaw json.RawMessage) (LinuxInstallProfile, error) {
|
|
var scope dockerInstallProfileScope
|
|
if len(scopeRaw) > 0 {
|
|
if !json.Valid(scopeRaw) {
|
|
return LinuxInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
if err := json.Unmarshal(scopeRaw, &scope); err != nil {
|
|
return LinuxInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
}
|
|
nodeName := firstNonEmptyString(strings.TrimSpace(input.NodeName), scope.NodeName)
|
|
if nodeName == "" {
|
|
nodeName = "linux-node"
|
|
}
|
|
slug := safeInstallProfileSlug(nodeName)
|
|
profile := LinuxInstallProfile{
|
|
SchemaVersion: "rap.linux_install_profile.v1",
|
|
BackendURL: strings.TrimRight(strings.TrimSpace(scope.BackendURL), "/"),
|
|
ControlPlaneEndpoints: trimStringSlice(scope.ControlPlaneEndpoints),
|
|
ArtifactEndpoints: trimEndpointSlice(scope.ArtifactEndpoints),
|
|
Roles: trimStringSlice(scope.Roles),
|
|
NodeName: nodeName,
|
|
StateDir: firstNonEmptyString(scope.StateDir, "/var/lib/rap/nodes/"+slug),
|
|
InstallDir: firstNonEmptyString(scope.InstallDir, "/opt/rap/"+slug),
|
|
StartupMode: firstNonEmptyString(strings.ToLower(strings.TrimSpace(scope.StartupMode)), "systemd"),
|
|
WorkloadSupervisionEnabled: boolPtrValue(scope.WorkloadSupervisionEnabled, false),
|
|
MeshSyntheticRuntimeEnabled: boolPtrValue(scope.MeshSyntheticRuntimeEnabled, true),
|
|
MeshProductionForwardingEnabled: boolPtrValue(scope.MeshProductionForwardingEnabled, false),
|
|
MeshListenAddr: firstNonEmptyString(scope.MeshListenAddr, ":19131"),
|
|
MeshListenPortMode: firstNonEmptyString(strings.ToLower(strings.TrimSpace(scope.MeshListenPortMode)), "auto"),
|
|
MeshListenAutoPortStart: positiveOrDefault(scope.MeshListenAutoPortStart, 19131),
|
|
MeshListenAutoPortEnd: positiveOrDefault(scope.MeshListenAutoPortEnd, 19231),
|
|
MeshAdvertiseEndpoint: strings.TrimRight(strings.TrimSpace(scope.MeshAdvertiseEndpoint), "/"),
|
|
MeshAdvertiseEndpointsJSON: scope.MeshAdvertiseEndpointsJSON,
|
|
MeshAdvertiseTransport: firstNonEmptyString(strings.TrimSpace(scope.MeshAdvertiseTransport), "direct_http"),
|
|
MeshConnectivityMode: firstNonEmptyString(strings.TrimSpace(scope.MeshConnectivityMode), "outbound_only"),
|
|
MeshNATType: firstNonEmptyString(strings.TrimSpace(scope.MeshNATType), "unknown"),
|
|
MeshRegion: firstNonEmptyString(strings.TrimSpace(scope.MeshRegion), "linux"),
|
|
HeartbeatIntervalSeconds: positiveOrDefault(scope.HeartbeatIntervalSeconds, 15),
|
|
EnrollmentPollIntervalSeconds: positiveOrDefault(scope.EnrollmentPollIntervalSeconds, 5),
|
|
EnrollmentPollTimeoutSeconds: nonNegativeOrDefault(scope.EnrollmentPollTimeoutSeconds, 0),
|
|
ProductionObservationSinkCapacity: scope.ProductionObservationSinkCapacity,
|
|
}
|
|
profile.NodeAgentArtifact = linuxNodeAgentArtifactFromScope(profile.ArtifactEndpoints, scope)
|
|
if profile.BackendURL == "" && len(profile.ControlPlaneEndpoints) > 0 {
|
|
profile.BackendURL = profile.ControlPlaneEndpoints[0]
|
|
}
|
|
if profile.BackendURL == "" {
|
|
return LinuxInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
if len(profile.ArtifactEndpoints) == 0 {
|
|
if endpoint := defaultArtifactEndpointFromBackendURL(profile.BackendURL); endpoint != "" {
|
|
profile.ArtifactEndpoints = []string{endpoint}
|
|
profile.NodeAgentArtifact = linuxNodeAgentArtifactFromScope(profile.ArtifactEndpoints, scope)
|
|
}
|
|
}
|
|
if len(profile.MeshAdvertiseEndpointsJSON) > 0 && !json.Valid(profile.MeshAdvertiseEndpointsJSON) {
|
|
return LinuxInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
switch profile.MeshListenPortMode {
|
|
case "manual", "auto", "disabled":
|
|
default:
|
|
return LinuxInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
switch profile.StartupMode {
|
|
case "auto", "systemd", "none":
|
|
default:
|
|
return LinuxInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
if profile.MeshListenAutoPortStart > profile.MeshListenAutoPortEnd {
|
|
return LinuxInstallProfile{}, ErrInvalidPayload
|
|
}
|
|
return profile, nil
|
|
}
|
|
|
|
func linuxNodeAgentArtifactFromScope(artifactEndpoints []string, scope dockerInstallProfileScope) *DockerArtifact {
|
|
urls := trimEndpointSlice(scope.NodeAgentArtifactURLs)
|
|
if len(urls) == 0 {
|
|
for _, endpoint := range artifactEndpoints {
|
|
urls = append(urls, strings.TrimRight(endpoint, "/")+"/rap-node-agent-linux-amd64")
|
|
}
|
|
}
|
|
sha256 := strings.TrimSpace(scope.NodeAgentArtifactSHA256)
|
|
sizeBytes := scope.NodeAgentArtifactSizeBytes
|
|
if len(urls) == 0 && sha256 == "" {
|
|
return nil
|
|
}
|
|
return &DockerArtifact{
|
|
Kind: "linux_binary",
|
|
MediaType: "application/octet-stream",
|
|
FileName: "rap-node-agent-linux-amd64",
|
|
URLs: urls,
|
|
SHA256: sha256,
|
|
SizeBytes: sizeBytes,
|
|
}
|
|
}
|
|
|
|
func windowsNodeAgentArtifactFromScope(artifactEndpoints []string, scope dockerInstallProfileScope) *DockerArtifact {
|
|
urls := trimEndpointSlice(scope.NodeAgentArtifactURLs)
|
|
if len(urls) == 0 {
|
|
for _, endpoint := range artifactEndpoints {
|
|
urls = append(urls, strings.TrimRight(endpoint, "/")+"/rap-node-agent-windows-amd64.exe")
|
|
}
|
|
}
|
|
sha256 := strings.TrimSpace(scope.NodeAgentArtifactSHA256)
|
|
sizeBytes := scope.NodeAgentArtifactSizeBytes
|
|
if len(urls) == 0 && sha256 == "" {
|
|
return nil
|
|
}
|
|
return &DockerArtifact{
|
|
Kind: "windows_exe",
|
|
MediaType: "application/vnd.microsoft.portable-executable",
|
|
FileName: "rap-node-agent-windows-amd64.exe",
|
|
URLs: urls,
|
|
SHA256: sha256,
|
|
SizeBytes: sizeBytes,
|
|
}
|
|
}
|
|
|
|
func dockerImageArtifactFromScope(image string, artifactEndpoints []string, scope dockerInstallProfileScope) *DockerArtifact {
|
|
image = strings.TrimSpace(image)
|
|
if image == "" {
|
|
return nil
|
|
}
|
|
fileName := safeArtifactFileName(image) + ".tar"
|
|
urls := trimEndpointSlice(scope.DockerImageArtifactURLs)
|
|
if len(urls) == 0 {
|
|
for _, endpoint := range artifactEndpoints {
|
|
urls = append(urls, strings.TrimRight(endpoint, "/")+"/"+fileName)
|
|
}
|
|
}
|
|
sha256 := strings.TrimSpace(scope.DockerImageArtifactSHA256)
|
|
sizeBytes := scope.DockerImageArtifactSizeBytes
|
|
if len(urls) == 0 && sha256 == "" {
|
|
return nil
|
|
}
|
|
return &DockerArtifact{
|
|
Kind: "docker_image_tar",
|
|
Image: image,
|
|
MediaType: "application/vnd.docker.image.rootfs.diff.tar",
|
|
FileName: fileName,
|
|
URLs: urls,
|
|
SHA256: sha256,
|
|
SizeBytes: sizeBytes,
|
|
}
|
|
}
|
|
|
|
// defaultArtifactEndpointFromBackendURL derives the default downloads endpoint
// from a backend URL: surrounding whitespace and trailing slashes are removed,
// one trailing "/api/v1" or "/api" segment is stripped, and "/downloads" is
// appended. A blank backend URL yields "".
func defaultArtifactEndpointFromBackendURL(backendURL string) string {
	base := strings.TrimRight(strings.TrimSpace(backendURL), "/")
	switch {
	case base == "":
		return ""
	case strings.HasSuffix(base, "/api/v1"):
		// Checked before "/api" so the versioned suffix is removed whole.
		base = strings.TrimSuffix(base, "/api/v1")
	case strings.HasSuffix(base, "/api"):
		base = strings.TrimSuffix(base, "/api")
	}
	return strings.TrimRight(base, "/") + "/downloads"
}
|
|
|
|
type heartbeatMeshEndpointReport struct {
|
|
SchemaVersion string `json:"schema_version"`
|
|
ClusterID string `json:"cluster_id"`
|
|
NodeID string `json:"node_id"`
|
|
PeerEndpoint string `json:"peer_endpoint"`
|
|
Transport string `json:"transport"`
|
|
ConnectivityMode string `json:"connectivity_mode"`
|
|
NATType string `json:"nat_type"`
|
|
Region string `json:"region"`
|
|
EndpointCandidates []PeerEndpointCandidate `json:"endpoint_candidates"`
|
|
ObservedAt *time.Time `json:"observed_at"`
|
|
}
|
|
|
|
type heartbeatRendezvousLeaseReport struct {
|
|
SchemaVersion string `json:"schema_version"`
|
|
ClusterID string `json:"cluster_id"`
|
|
NodeID string `json:"node_id"`
|
|
ObservedAt string `json:"observed_at"`
|
|
Leases []heartbeatRendezvousLeaseDetails `json:"leases"`
|
|
}
|
|
|
|
// heartbeatRendezvousLeaseDetails is one lease entry inside a
// heartbeatRendezvousLeaseReport: the lease, peer and relay it concerns, the
// routes it covers, and the reporter's flags about whether the relay is stale
// or needs to be withdrawn or reselected.
type heartbeatRendezvousLeaseDetails struct {
	LeaseID           string   `json:"lease_id"`
	PeerNodeID        string   `json:"peer_node_id"`
	RelayNodeID       string   `json:"relay_node_id"`
	RouteIDs          []string `json:"route_ids"`
	StaleRelay        bool     `json:"stale_relay"`
	WithdrawalNeeded  bool     `json:"withdrawal_needed"`
	ReselectionNeeded bool     `json:"reselection_needed"`
	ConnectionState   string   `json:"connection_state"`
	Reason            string   `json:"reason"`
}
|
|
|
|
// meshRouteHealthObservationMetadata is the JSON shape of the metadata attached
// to a mesh route-health observation: which route was probed, which route-path
// decision (relay, rendezvous lease, source) was in effect, the expected hops
// versus the observed acknowledgement path, and a set of flags describing
// whether any production or service payload forwarding was involved.
type meshRouteHealthObservationMetadata struct {
	ObservationType                        string   `json:"observation_type"`
	RouteID                                string   `json:"route_id"`
	RoutePathDecisionApplied               bool     `json:"route_path_decision_applied"`
	RoutePathDecisionSelectedRelayID       string   `json:"route_path_decision_selected_relay_id"`
	RoutePathDecisionStaleRelayNodeID      string   `json:"route_path_decision_stale_relay_node_id"`
	RoutePathDecisionRendezvousPeerNodeID  string   `json:"route_path_decision_rendezvous_peer_node_id"`
	RoutePathDecisionRendezvousLeaseID     string   `json:"route_path_decision_rendezvous_lease_id"`
	RoutePathDecisionRendezvousLeaseReason string   `json:"route_path_decision_rendezvous_lease_reason"`
	RoutePathDecisionSource                string   `json:"route_path_decision_source"`
	ExpectedEffectiveHops                  []string `json:"expected_effective_hops"`
	ObservedAckPath                        []string `json:"observed_ack_path"`
	RoutePathDriftDetected                 bool     `json:"route_path_drift_detected"`
	FailureReason                          string   `json:"failure_reason"`
	ControlPlaneOnly                       bool     `json:"control_plane_only"`
	ProductionForwarding                   bool     `json:"production_forwarding"`
	ProductionPayloadForwarding            bool     `json:"production_payload_forwarding"`
	RouteHealthProductionPayloadForwarding bool     `json:"route_health_production_payload_forwarding"`
	RouteHealthServicePayloadForwarding    bool     `json:"route_health_service_payload_forwarding"`
}
|
|
|
|
// rendezvousRelayFeedbackEntry is the in-memory form of one piece of relay
// feedback: which node reported it, the routes and lease it applies to, the
// peer/relay pair involved, the reported connection state and reason, and
// whether the reporter asks for the relay to be withdrawn or reselected.
type rendezvousRelayFeedbackEntry struct {
	ReporterNodeID string
	RouteIDs       []string

	LeaseID, PeerNodeID, RelayNodeID string
	ConnectionState, Reason          string

	WithdrawalNeeded, ReselectionNeeded bool

	ObservedAt time.Time
}
|
|
|
|
// rendezvousRelaySelection is a candidate relay together with its endpoint, a
// numeric score and the textual reasons recorded for that score.
type rendezvousRelaySelection struct {
	RelayNodeID, Endpoint string

	Score   int
	Reasons []string
}
|
|
|
|
type rendezvousRelayPolicy struct {
|
|
localNodeID string
|
|
now time.Time
|
|
links []MeshLinkObservation
|
|
feedback []rendezvousRelayFeedbackEntry
|
|
withdrawn map[string]RendezvousRelayPolicyDecision
|
|
replacements map[string]RendezvousRelayPolicyDecision
|
|
}
|
|
|
|
// Limits and defaults for scoped mesh configuration.
// NOTE(review): the first three names suggest caps on recovery seeds and
// rendezvous leases and a bootstrap peer target; their call sites are outside
// this part of the file — confirm there.
const (
	maxScopedRecoverySeeds             = 20
	maxScopedRendezvousLeases          = 20
	defaultCoreMeshBootstrapPeerTarget = 3

	// rendezvousRelayFeedbackMaxAge is the oldest a heartbeat may be for its
	// rendezvous lease report to still be used as relay feedback.
	rendezvousRelayFeedbackMaxAge = 2 * time.Minute
)
|
|
|
|
// nodeSelector is the JSON shape of a route intent's source or destination
// selector: either a single node ID, a list of node IDs, or both.
type nodeSelector struct {
	NodeID  string   `json:"node_id"`
	NodeIDs []string `json:"node_ids"`
}
|
|
|
|
func (s *Service) syntheticRouteFromIntent(input GetNodeSyntheticMeshConfigInput, intent MeshRouteIntent) (SyntheticMeshRouteConfig, map[string]string, map[string][]PeerEndpointCandidate, []PeerRecoverySeed, []PeerRendezvousLease, bool) {
|
|
if intent.Status != "active" {
|
|
return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
|
|
}
|
|
var policy syntheticRoutePolicy
|
|
if err := json.Unmarshal(intent.Policy, &policy); err != nil {
|
|
return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
|
|
}
|
|
if !policy.SyntheticEnabled {
|
|
return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
|
|
}
|
|
var source nodeSelector
|
|
var destination nodeSelector
|
|
_ = json.Unmarshal(intent.SourceSelector, &source)
|
|
_ = json.Unmarshal(intent.DestinationSelector, &destination)
|
|
sourceNodeID := firstNodeID(source)
|
|
destinationNodeID := firstNodeID(destination)
|
|
hops := append([]string{}, policy.Hops...)
|
|
if len(hops) == 0 && sourceNodeID != "" && destinationNodeID != "" {
|
|
hops = []string{sourceNodeID, destinationNodeID}
|
|
}
|
|
if len(hops) < 2 || !containsString(hops, input.NodeID) {
|
|
return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
|
|
}
|
|
if err := validatePeerEndpointCandidates(policy.PeerEndpointCandidates, hops); err != nil {
|
|
return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
|
|
}
|
|
if err := validatePeerRecoverySeeds(policy.RecoverySeeds); err != nil {
|
|
return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
|
|
}
|
|
if err := validatePeerRendezvousLeases(policy.RendezvousLeases, hops, s.now()); err != nil {
|
|
return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
|
|
}
|
|
if sourceNodeID == "" {
|
|
sourceNodeID = hops[0]
|
|
}
|
|
if destinationNodeID == "" {
|
|
destinationNodeID = hops[len(hops)-1]
|
|
}
|
|
expiresAt := s.now().UTC().Add(5 * time.Minute)
|
|
if policy.ExpiresAt != nil {
|
|
expiresAt = policy.ExpiresAt.UTC()
|
|
}
|
|
if !expiresAt.After(s.now().UTC()) {
|
|
return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
|
|
}
|
|
allowedChannels := policy.AllowedChannels
|
|
if len(allowedChannels) == 0 {
|
|
allowedChannels = []string{"fabric_control", "route_control"}
|
|
}
|
|
maxTTL := policy.MaxTTL
|
|
if maxTTL <= 0 {
|
|
maxTTL = 8
|
|
}
|
|
maxHops := policy.MaxHops
|
|
if maxHops <= 0 {
|
|
maxHops = 8
|
|
}
|
|
routeVersion := policy.RouteVersion
|
|
if routeVersion == "" {
|
|
routeVersion = intent.UpdatedAt.UTC().Format(time.RFC3339)
|
|
}
|
|
policyVersion := policy.PolicyVersion
|
|
if policyVersion == "" {
|
|
policyVersion = routeVersion
|
|
}
|
|
peerDirectoryVersion := policy.PeerDirectoryVersion
|
|
if peerDirectoryVersion == "" {
|
|
peerDirectoryVersion = routeVersion
|
|
}
|
|
route := SyntheticMeshRouteConfig{
|
|
RouteID: intent.ID,
|
|
ClusterID: input.ClusterID,
|
|
SourceNodeID: sourceNodeID,
|
|
DestinationNodeID: destinationNodeID,
|
|
Hops: hops,
|
|
AllowedChannels: allowedChannels,
|
|
ExpiresAt: expiresAt,
|
|
MaxTTL: maxTTL,
|
|
MaxHops: maxHops,
|
|
RouteVersion: routeVersion,
|
|
PolicyVersion: policyVersion,
|
|
PeerDirectoryVersion: peerDirectoryVersion,
|
|
}
|
|
return route,
|
|
scopedPeerEndpoints(policy.PeerEndpoints, hops),
|
|
scopedPeerEndpointCandidates(policy.PeerEndpointCandidates, hops),
|
|
policy.RecoverySeeds,
|
|
normalizeRendezvousLeases(policy.RendezvousLeases, route, s.now()),
|
|
true
|
|
}
|
|
|
|
func (s *Service) reportedEndpointConfig(ctx context.Context, clusterID string, localNodeID string, routePath []string, localPerspective endpointPerspective) (map[string]string, map[string][]PeerEndpointCandidate, error) {
|
|
peers := map[string]string{}
|
|
candidates := map[string][]PeerEndpointCandidate{}
|
|
for _, nodeID := range routePath {
|
|
nodeID = strings.TrimSpace(nodeID)
|
|
if nodeID == "" || nodeID == localNodeID {
|
|
continue
|
|
}
|
|
desiredEndpoint, desiredCandidates, err := s.desiredMeshListenerEndpointConfig(ctx, clusterID, nodeID, 0)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
heartbeats, err := s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, 1)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if len(heartbeats) == 0 && desiredEndpoint == "" && len(desiredCandidates) == 0 {
|
|
continue
|
|
}
|
|
peerEndpoint := desiredEndpoint
|
|
nodeCandidates := append([]PeerEndpointCandidate{}, desiredCandidates...)
|
|
if len(heartbeats) > 0 {
|
|
reportedEndpoint, reportedCandidates, ok := endpointReportFromHeartbeat(heartbeats[0])
|
|
if ok {
|
|
if peerEndpoint == "" {
|
|
peerEndpoint = reportedEndpoint
|
|
}
|
|
nodeCandidates = append(nodeCandidates, reportedCandidates...)
|
|
}
|
|
}
|
|
peerEndpoint, nodeCandidates = scopeEndpointReportForLocal(localPerspective, peerEndpoint, nodeCandidates)
|
|
if peerEndpoint != "" {
|
|
peers[nodeID] = peerEndpoint
|
|
}
|
|
if len(nodeCandidates) > 0 {
|
|
candidates[nodeID] = append(candidates[nodeID], nodeCandidates...)
|
|
}
|
|
}
|
|
return peers, candidates, nil
|
|
}
|
|
|
|
// endpointPerspective captures how the local node sees the network, as derived
// from its own heartbeat: whether it can only make outbound connections, its
// region, the control-plane URL it reported, and the relay endpoint derived
// from that URL.
type endpointPerspective struct {
	OutboundOnly bool

	Region, ControlPlaneURL, ControlPlaneRelayEndpoint string
}
|
|
|
|
func (s *Service) localEndpointPerspective(ctx context.Context, clusterID, localNodeID string) (endpointPerspective, error) {
|
|
heartbeats, err := s.store.ListNodeHeartbeats(ctx, clusterID, localNodeID, 1)
|
|
if err != nil {
|
|
return endpointPerspective{}, err
|
|
}
|
|
if len(heartbeats) == 0 {
|
|
return endpointPerspective{}, nil
|
|
}
|
|
return endpointPerspectiveFromHeartbeat(heartbeats[0]), nil
|
|
}
|
|
|
|
func endpointPerspectiveFromHeartbeat(heartbeat NodeHeartbeat) endpointPerspective {
|
|
var metadata struct {
|
|
MeshEndpointReport heartbeatMeshEndpointReport `json:"mesh_endpoint_report"`
|
|
MeshListenerReport struct {
|
|
InboundReachability string `json:"inbound_reachability"`
|
|
OneWayConnectivity bool `json:"one_way_connectivity"`
|
|
} `json:"mesh_listener_report"`
|
|
MeshOutboundSessionReport struct {
|
|
ControlPlaneURL string `json:"control_plane_url"`
|
|
Status string `json:"status"`
|
|
} `json:"mesh_outbound_session_report"`
|
|
}
|
|
if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) {
|
|
return endpointPerspective{}
|
|
}
|
|
if err := json.Unmarshal(heartbeat.Metadata, &metadata); err != nil {
|
|
return endpointPerspective{}
|
|
}
|
|
connectivity := strings.ToLower(strings.TrimSpace(metadata.MeshEndpointReport.ConnectivityMode))
|
|
reachability := strings.ToLower(strings.TrimSpace(metadata.MeshListenerReport.InboundReachability))
|
|
return endpointPerspective{
|
|
OutboundOnly: connectivity == "outbound_only" || reachability == "outbound_only" || metadata.MeshListenerReport.OneWayConnectivity,
|
|
Region: strings.TrimSpace(metadata.MeshEndpointReport.Region),
|
|
ControlPlaneURL: strings.TrimSpace(metadata.MeshOutboundSessionReport.ControlPlaneURL),
|
|
ControlPlaneRelayEndpoint: controlPlaneRelayEndpointFromURL(metadata.MeshOutboundSessionReport.ControlPlaneURL),
|
|
}
|
|
}
|
|
|
|
// controlPlaneRelayEndpointFromURL reduces a control-plane URL to the base
// endpoint used for relaying: one trailing "/api/v1" or "/api" path segment is
// removed along with any query, fragment and trailing slashes. It returns ""
// when the input is blank, unparsable, or lacks a scheme or host.
func controlPlaneRelayEndpointFromURL(raw string) string {
	trimmed := strings.TrimRight(strings.TrimSpace(raw), "/")
	if trimmed == "" {
		return ""
	}
	u, err := url.Parse(trimmed)
	if err != nil {
		return ""
	}
	if u.Scheme == "" || u.Host == "" {
		return ""
	}

	basePath := strings.TrimRight(u.Path, "/")
	switch {
	case strings.HasSuffix(basePath, "/api/v1"):
		// Checked before "/api" so the versioned suffix is removed whole.
		basePath = strings.TrimRight(strings.TrimSuffix(basePath, "/api/v1"), "/")
	case strings.HasSuffix(basePath, "/api"):
		basePath = strings.TrimRight(strings.TrimSuffix(basePath, "/api"), "/")
	}

	u.Path, u.RawPath = basePath, ""
	u.RawQuery, u.Fragment = "", ""
	return strings.TrimRight(u.String(), "/")
}
|
|
|
|
func controlPlaneBootstrapRendezvousLease(clusterID, peerNodeID string, candidates []PeerEndpointCandidate, local endpointPerspective, now time.Time) (PeerRendezvousLease, bool) {
|
|
if !local.OutboundOnly || local.ControlPlaneRelayEndpoint == "" {
|
|
return PeerRendezvousLease{}, false
|
|
}
|
|
requiresRendezvous := false
|
|
for _, candidate := range candidates {
|
|
if endpointCandidateRequiresRendezvous(candidate) {
|
|
requiresRendezvous = true
|
|
break
|
|
}
|
|
}
|
|
if !requiresRendezvous {
|
|
return PeerRendezvousLease{}, false
|
|
}
|
|
issuedAt := now.UTC()
|
|
return PeerRendezvousLease{
|
|
LeaseID: "core-mesh-bootstrap-rv-" + peerNodeID + "-via-control-plane",
|
|
PeerNodeID: peerNodeID,
|
|
RelayNodeID: "control-plane-relay",
|
|
RelayEndpoint: local.ControlPlaneRelayEndpoint,
|
|
Transport: "relay_control",
|
|
ConnectivityMode: "relay_required",
|
|
RouteIDs: []string{"core-mesh-bootstrap"},
|
|
AllowedChannels: []string{"fabric_control", "route_control"},
|
|
Priority: 90,
|
|
ControlPlaneOnly: true,
|
|
IssuedAt: issuedAt,
|
|
ExpiresAt: issuedAt.Add(5 * time.Minute),
|
|
Reason: "control_plane_bootstrap_relay",
|
|
Metadata: json.RawMessage(`{
|
|
"cluster_id": "` + clusterID + `",
|
|
"source": "control_plane_bootstrap",
|
|
"service_workload_traffic": false,
|
|
"production_forwarding": false
|
|
}`),
|
|
}, true
|
|
}
|
|
|
|
func scopeEndpointReportForLocal(local endpointPerspective, endpoint string, candidates []PeerEndpointCandidate) (string, []PeerEndpointCandidate) {
|
|
if !local.OutboundOnly {
|
|
return endpoint, candidates
|
|
}
|
|
out := make([]PeerEndpointCandidate, 0, len(candidates))
|
|
directUsable := false
|
|
for _, candidate := range candidates {
|
|
if endpointCandidatePrivateForOffsite(candidate) {
|
|
candidate = relayRequiredCandidateForOffsite(candidate)
|
|
} else if !endpointCandidateRequiresRendezvous(candidate) {
|
|
directUsable = true
|
|
}
|
|
if candidate.Metadata == nil {
|
|
candidate.Metadata = json.RawMessage(`{}`)
|
|
}
|
|
out = append(out, candidate)
|
|
}
|
|
if !directUsable && endpointPrivateForOffsite(endpoint) {
|
|
endpoint = ""
|
|
}
|
|
return endpoint, out
|
|
}
|
|
|
|
func endpointCandidatePrivateForOffsite(candidate PeerEndpointCandidate) bool {
|
|
connectivity := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
|
|
reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability))
|
|
return connectivity == "private_lan" || reachability == "private" || endpointPrivateForOffsite(candidate.Address)
|
|
}
|
|
|
|
func endpointPrivateForOffsite(endpoint string) bool {
|
|
host := peerEndpointHost(endpoint)
|
|
if host == "" {
|
|
return false
|
|
}
|
|
ip := net.ParseIP(host)
|
|
return ip != nil && (ip.IsPrivate() || ip.IsLoopback() || ip.IsLinkLocalUnicast() || ip.IsUnspecified())
|
|
}
|
|
|
|
func relayRequiredCandidateForOffsite(candidate PeerEndpointCandidate) PeerEndpointCandidate {
|
|
candidate.Transport = "relay"
|
|
candidate.Reachability = "relay"
|
|
candidate.ConnectivityMode = "relay_required"
|
|
candidate.NATType = firstNonEmptyString(candidate.NATType, "unknown")
|
|
candidate.Priority += 200
|
|
candidate.PolicyTags = appendMissingString(candidate.PolicyTags, "offsite-private-lan-blocked")
|
|
candidate.PolicyTags = appendMissingString(candidate.PolicyTags, "relay-required")
|
|
return candidate
|
|
}
|
|
|
|
func endpointReportFromHeartbeat(heartbeat NodeHeartbeat) (string, []PeerEndpointCandidate, bool) {
|
|
var metadata struct {
|
|
MeshEndpointReport heartbeatMeshEndpointReport `json:"mesh_endpoint_report"`
|
|
}
|
|
if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) {
|
|
return "", nil, false
|
|
}
|
|
if err := json.Unmarshal(heartbeat.Metadata, &metadata); err != nil {
|
|
return "", nil, false
|
|
}
|
|
report := metadata.MeshEndpointReport
|
|
if report.NodeID != "" && report.NodeID != heartbeat.NodeID {
|
|
return "", nil, false
|
|
}
|
|
if report.ClusterID != "" && report.ClusterID != heartbeat.ClusterID {
|
|
return "", nil, false
|
|
}
|
|
nodeID := heartbeat.NodeID
|
|
rawPeerEndpoint := strings.TrimSpace(report.PeerEndpoint)
|
|
peerEndpoint := rawPeerEndpoint
|
|
if isUnusableLocalPeerEndpoint(peerEndpoint) {
|
|
peerEndpoint = ""
|
|
}
|
|
out := make([]PeerEndpointCandidate, 0, len(report.EndpointCandidates))
|
|
for _, candidate := range report.EndpointCandidates {
|
|
if candidate.NodeID == "" {
|
|
candidate.NodeID = nodeID
|
|
}
|
|
if candidate.EndpointID == "" {
|
|
candidate.EndpointID = nodeID + "-reported"
|
|
}
|
|
if candidate.Address == "" {
|
|
candidate.Address = rawPeerEndpoint
|
|
}
|
|
if isUnusableLocalPeerEndpoint(candidate.Address) {
|
|
continue
|
|
}
|
|
if candidate.Transport == "" {
|
|
candidate.Transport = report.Transport
|
|
}
|
|
if candidate.ConnectivityMode == "" {
|
|
candidate.ConnectivityMode = report.ConnectivityMode
|
|
}
|
|
if candidate.NATType == "" {
|
|
candidate.NATType = report.NATType
|
|
}
|
|
if candidate.Region == "" {
|
|
candidate.Region = report.Region
|
|
}
|
|
if candidate.Reachability == "" {
|
|
candidate.Reachability = reachabilityFromConnectivityMode(candidate.ConnectivityMode)
|
|
}
|
|
if candidate.Metadata == nil {
|
|
candidate.Metadata = json.RawMessage(`{}`)
|
|
}
|
|
if candidate.NodeID != nodeID {
|
|
return "", nil, false
|
|
}
|
|
out = append(out, candidate)
|
|
}
|
|
if len(out) > 0 {
|
|
if err := validatePeerEndpointCandidates(map[string][]PeerEndpointCandidate{nodeID: out}, []string{nodeID}); err != nil {
|
|
return "", nil, false
|
|
}
|
|
}
|
|
return peerEndpoint, out, peerEndpoint != "" || len(out) > 0
|
|
}
|
|
|
|
func hasActiveNodeRole(roles []NodeRoleAssignment, role string) bool {
|
|
for _, item := range roles {
|
|
if item.Role == role && item.Status == "active" {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func nodeLastSeen(node ClusterNode) time.Time {
|
|
if node.LastSeenAt == nil {
|
|
return time.Time{}
|
|
}
|
|
return node.LastSeenAt.UTC()
|
|
}
|
|
|
|
func recoverySeedFromEndpointReport(nodeID, endpoint string, candidates []PeerEndpointCandidate, index int) PeerRecoverySeed {
|
|
nodeID = strings.TrimSpace(nodeID)
|
|
endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/")
|
|
seed := PeerRecoverySeed{
|
|
NodeID: nodeID,
|
|
Endpoint: endpoint,
|
|
Transport: "direct_http",
|
|
Priority: 10 + index,
|
|
Metadata: json.RawMessage(`{"source":"core_mesh_bootstrap"}`),
|
|
}
|
|
for _, candidate := range candidates {
|
|
if strings.TrimSpace(candidate.Address) == "" {
|
|
continue
|
|
}
|
|
seed.Endpoint = strings.TrimRight(strings.TrimSpace(candidate.Address), "/")
|
|
if strings.TrimSpace(candidate.Transport) != "" {
|
|
seed.Transport = candidate.Transport
|
|
}
|
|
seed.ConnectivityMode = candidate.ConnectivityMode
|
|
seed.Region = candidate.Region
|
|
if candidate.LastVerifiedAt != nil {
|
|
seed.LastVerifiedAt = candidate.LastVerifiedAt
|
|
}
|
|
break
|
|
}
|
|
if seed.NodeID == "" || seed.Endpoint == "" {
|
|
return PeerRecoverySeed{}
|
|
}
|
|
return seed
|
|
}
|
|
|
|
// firstNonEmptyString returns the first argument that is non-empty after
// trimming surrounding whitespace, in its trimmed form. It returns "" when
// every argument is blank or no arguments are given.
func firstNonEmptyString(values ...string) string {
	for i := range values {
		candidate := strings.TrimSpace(values[i])
		if candidate == "" {
			continue
		}
		return candidate
	}
	return ""
}
|
|
|
|
func trimStringSlice(values []string) []string {
|
|
out := []string{}
|
|
for _, value := range values {
|
|
if trimmed := strings.TrimSpace(value); trimmed != "" && !containsString(out, trimmed) {
|
|
out = append(out, trimmed)
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func trimEndpointSlice(values []string) []string {
|
|
out := []string{}
|
|
for _, value := range values {
|
|
trimmed := strings.TrimRight(strings.TrimSpace(value), "/")
|
|
if trimmed != "" && !containsString(out, trimmed) {
|
|
out = append(out, trimmed)
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// normalizeUpdateToken canonicalises an update-plan token (OS, architecture,
// install type) by trimming surrounding whitespace and lower-casing it.
func normalizeUpdateToken(value string) string {
	trimmed := strings.TrimSpace(value)
	return strings.ToLower(trimmed)
}
|
|
|
|
func selectReleaseArtifact(releases []ReleaseVersion, input GetNodeUpdatePlanInput, policy NodeUpdatePolicy) (ReleaseVersion, ReleaseArtifact, bool) {
|
|
targetVersion := ""
|
|
if policy.TargetVersion != nil {
|
|
targetVersion = strings.TrimSpace(*policy.TargetVersion)
|
|
}
|
|
for _, release := range releases {
|
|
if release.Status != "active" {
|
|
continue
|
|
}
|
|
if targetVersion != "" && release.Version != targetVersion {
|
|
continue
|
|
}
|
|
for _, artifact := range release.Artifacts {
|
|
if normalizeUpdateToken(artifact.OS) == input.OS &&
|
|
normalizeUpdateToken(artifact.Arch) == input.Arch &&
|
|
normalizeUpdateToken(artifact.InstallType) == input.InstallType {
|
|
artifact.URLs = releaseArtifactURLs(artifact)
|
|
return release, artifact, true
|
|
}
|
|
}
|
|
}
|
|
return ReleaseVersion{}, ReleaseArtifact{}, false
|
|
}
|
|
|
|
func releaseArtifactURLs(artifact ReleaseArtifact) []string {
|
|
out := trimEndpointSlice(append([]string{artifact.URL}, artifact.URLs...))
|
|
if len(artifact.Metadata) > 0 && json.Valid(artifact.Metadata) {
|
|
var metadata struct {
|
|
URL string `json:"url"`
|
|
URLs []string `json:"urls"`
|
|
MirrorURLs []string `json:"mirror_urls"`
|
|
Mirrors []string `json:"mirrors"`
|
|
}
|
|
if err := json.Unmarshal(artifact.Metadata, &metadata); err == nil {
|
|
out = trimEndpointSlice(append(out, metadata.URL))
|
|
out = trimEndpointSlice(append(out, metadata.URLs...))
|
|
out = trimEndpointSlice(append(out, metadata.MirrorURLs...))
|
|
out = trimEndpointSlice(append(out, metadata.Mirrors...))
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// normalizeArtifactOrigin reduces a URL to its origin ("scheme://host", where
// host includes any port). It returns "" when the value is blank, cannot be
// parsed, or lacks a scheme or host.
func normalizeArtifactOrigin(value string) string {
	candidate := strings.TrimRight(strings.TrimSpace(value), "/")
	if candidate == "" {
		return ""
	}
	u, err := url.Parse(candidate)
	switch {
	case err != nil:
		return ""
	case u.Scheme == "", u.Host == "":
		return ""
	}
	return u.Scheme + "://" + u.Host
}
|
|
|
|
func absolutizeReleaseArtifact(artifact ReleaseArtifact, origin string) ReleaseArtifact {
|
|
if origin == "" {
|
|
return artifact
|
|
}
|
|
artifact.URL = absolutizeArtifactURL(artifact.URL, origin)
|
|
for i, raw := range artifact.URLs {
|
|
artifact.URLs[i] = absolutizeArtifactURL(raw, origin)
|
|
}
|
|
return artifact
|
|
}
|
|
|
|
// absolutizeArtifactURL resolves an artifact URL against origin
// ("scheme://host"):
//
//   - blank input, or a blank origin, returns the trimmed input unchanged;
//   - an already absolute URL is returned unchanged;
//   - a scheme-relative reference ("//host/path") inherits the origin's
//     scheme instead of being mistaken for a path on the origin host;
//   - a root-relative path ("/path") is appended to the origin;
//   - anything else (for example "relative/path") is returned unchanged.
func absolutizeArtifactURL(raw, origin string) string {
	raw = strings.TrimSpace(raw)
	if raw == "" || origin == "" {
		return raw
	}
	if parsed, err := url.Parse(raw); err == nil && parsed.IsAbs() {
		return raw
	}

	// Must be checked before the single-slash case below, which would
	// otherwise produce "scheme://origin-host//other-host/path".
	if strings.HasPrefix(raw, "//") {
		if scheme, _, found := strings.Cut(origin, "://"); found && scheme != "" {
			return scheme + ":" + raw
		}
		return raw
	}

	if strings.HasPrefix(raw, "/") {
		// Tolerate an origin that was supplied with a trailing slash.
		return strings.TrimRight(origin, "/") + raw
	}
	return raw
}
|
|
|
|
func (s *Service) hostAgentPlatformMismatch(ctx context.Context, input GetNodeUpdatePlanInput) (bool, error) {
|
|
if input.Product != "rap-host-agent" {
|
|
return false, nil
|
|
}
|
|
if nodeUpdateRequestIsWindows(input) {
|
|
return false, nil
|
|
}
|
|
statuses, err := s.store.ListNodeUpdateStatuses(ctx, input.ClusterID, input.NodeID, 20)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
for _, status := range statuses {
|
|
if status.Product != "rap-node-agent" || !nodeUpdateStatusLooksWindows(status) {
|
|
continue
|
|
}
|
|
return true, nil
|
|
}
|
|
return false, nil
|
|
}
|
|
|
|
func nodeUpdateRequestIsWindows(input GetNodeUpdatePlanInput) bool {
|
|
return normalizeUpdateToken(input.OS) == "windows" || strings.Contains(normalizeUpdateToken(input.InstallType), "windows")
|
|
}
|
|
|
|
func nodeUpdateStatusLooksWindows(status NodeUpdateStatus) bool {
|
|
var payload map[string]any
|
|
if len(status.Payload) == 0 || json.Unmarshal(status.Payload, &payload) != nil {
|
|
return false
|
|
}
|
|
for _, key := range []string{"os", "runtime_os", "goos"} {
|
|
if normalizeUpdateToken(stringFromAny(payload[key])) == "windows" {
|
|
return true
|
|
}
|
|
}
|
|
for _, key := range []string{"binary_path", "task", "windows_task_name"} {
|
|
value := strings.ToLower(strings.TrimSpace(stringFromAny(payload[key])))
|
|
if strings.Contains(value, `:\`) || strings.Contains(value, `.exe`) || strings.Contains(value, "rap node agent ") {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// stringFromAny returns value when it holds a string, and "" for every other
// dynamic type (including nil).
func stringFromAny(value any) string {
	if text, ok := value.(string); ok {
		return text
	}
	return ""
}
|
|
|
|
// boolPtrValue dereferences value, or returns fallback when value is nil.
func boolPtrValue(value *bool, fallback bool) bool {
	if value != nil {
		return *value
	}
	return fallback
}
|
|
|
|
// positiveOrDefault returns value when it is strictly positive and fallback
// otherwise.
func positiveOrDefault(value, fallback int) int {
	if value <= 0 {
		return fallback
	}
	return value
}
|
|
|
|
// nonNegativeOrDefault returns value when it is zero or positive and fallback
// when it is negative.
func nonNegativeOrDefault(value, fallback int) int {
	if value < 0 {
		return fallback
	}
	return value
}
|
|
|
|
// safeInstallProfileSlug turns a free-form name into a slug made only of
// lower-case ASCII letters, digits and single dashes. The name is trimmed and
// lower-cased, every run of other characters collapses into one dash, and
// leading and trailing dashes are removed. The result is "" when the name has
// no ASCII letter or digit.
func safeInstallProfileSlug(value string) string {
	var slug strings.Builder
	afterDash := false // true while the last rune written is a separator dash
	for _, r := range strings.ToLower(strings.TrimSpace(value)) {
		switch {
		case 'a' <= r && r <= 'z', '0' <= r && r <= '9':
			slug.WriteRune(r)
			afterDash = false
		case !afterDash:
			slug.WriteByte('-')
			afterDash = true
		}
	}
	return strings.Trim(slug.String(), "-")
}
|
|
|
|
// safeArtifactFileName turns an arbitrary reference (such as an image name)
// into a file-name-safe token. The value is trimmed and lower-cased; ASCII
// letters, digits, '.', '_' and '-' are kept, every run of other characters
// becomes a single dash, and leading and trailing dashes are removed. When
// nothing is left the fallback "rap-node-agent" is returned.
//
// A dash that is part of the input does not suppress a following substituted
// dash, so "a-/b" becomes "a--b".
func safeArtifactFileName(value string) string {
	var name strings.Builder
	substituted := false // true while the last byte written is a substituted dash
	for _, r := range strings.ToLower(strings.TrimSpace(value)) {
		switch {
		case 'a' <= r && r <= 'z', '0' <= r && r <= '9', r == '.', r == '_', r == '-':
			name.WriteRune(r)
			substituted = false
		case !substituted:
			name.WriteByte('-')
			substituted = true
		}
	}
	if cleaned := strings.Trim(name.String(), "-"); cleaned != "" {
		return cleaned
	}
	return "rap-node-agent"
}
|
|
|
|
func (s *Service) rendezvousRelayFeedback(ctx context.Context, clusterID string, routePath []string, now time.Time) ([]rendezvousRelayFeedbackEntry, error) {
|
|
out := []rendezvousRelayFeedbackEntry{}
|
|
seenNodes := map[string]struct{}{}
|
|
for _, nodeID := range routePath {
|
|
nodeID = strings.TrimSpace(nodeID)
|
|
if nodeID == "" {
|
|
continue
|
|
}
|
|
if _, duplicate := seenNodes[nodeID]; duplicate {
|
|
continue
|
|
}
|
|
seenNodes[nodeID] = struct{}{}
|
|
heartbeats, err := s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, 1)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if len(heartbeats) == 0 {
|
|
continue
|
|
}
|
|
out = append(out, rendezvousRelayFeedbackFromHeartbeat(heartbeats[0], now)...)
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
func rendezvousRelayFeedbackFromHeartbeat(heartbeat NodeHeartbeat, now time.Time) []rendezvousRelayFeedbackEntry {
|
|
if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) {
|
|
return nil
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
} else {
|
|
now = now.UTC()
|
|
}
|
|
if heartbeat.ObservedAt.IsZero() ||
|
|
heartbeat.ObservedAt.After(now.Add(time.Minute)) ||
|
|
now.Sub(heartbeat.ObservedAt.UTC()) > rendezvousRelayFeedbackMaxAge {
|
|
return nil
|
|
}
|
|
var metadata struct {
|
|
MeshRendezvousLeaseReport heartbeatRendezvousLeaseReport `json:"mesh_rendezvous_lease_report"`
|
|
}
|
|
if err := json.Unmarshal(heartbeat.Metadata, &metadata); err != nil {
|
|
return nil
|
|
}
|
|
report := metadata.MeshRendezvousLeaseReport
|
|
if report.NodeID != "" && report.NodeID != heartbeat.NodeID {
|
|
return nil
|
|
}
|
|
if report.ClusterID != "" && report.ClusterID != heartbeat.ClusterID {
|
|
return nil
|
|
}
|
|
out := []rendezvousRelayFeedbackEntry{}
|
|
for _, lease := range report.Leases {
|
|
if !lease.StaleRelay && !lease.WithdrawalNeeded && !lease.ReselectionNeeded {
|
|
continue
|
|
}
|
|
if strings.TrimSpace(lease.PeerNodeID) == "" || strings.TrimSpace(lease.RelayNodeID) == "" {
|
|
continue
|
|
}
|
|
out = append(out, rendezvousRelayFeedbackEntry{
|
|
ReporterNodeID: heartbeat.NodeID,
|
|
RouteIDs: append([]string{}, lease.RouteIDs...),
|
|
LeaseID: strings.TrimSpace(lease.LeaseID),
|
|
PeerNodeID: strings.TrimSpace(lease.PeerNodeID),
|
|
RelayNodeID: strings.TrimSpace(lease.RelayNodeID),
|
|
ConnectionState: strings.TrimSpace(lease.ConnectionState),
|
|
Reason: strings.TrimSpace(lease.Reason),
|
|
WithdrawalNeeded: lease.WithdrawalNeeded,
|
|
ReselectionNeeded: lease.ReselectionNeeded,
|
|
ObservedAt: heartbeat.ObservedAt.UTC(),
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func (s *Service) rendezvousRelayReplacementHints(ctx context.Context, clusterID string, routePath []string, now time.Time) ([]RendezvousRelayPolicyDecision, error) {
|
|
out := []RendezvousRelayPolicyDecision{}
|
|
seenNodes := map[string]struct{}{}
|
|
for _, nodeID := range routePath {
|
|
nodeID = strings.TrimSpace(nodeID)
|
|
if nodeID == "" {
|
|
continue
|
|
}
|
|
if _, duplicate := seenNodes[nodeID]; duplicate {
|
|
continue
|
|
}
|
|
seenNodes[nodeID] = struct{}{}
|
|
heartbeats, err := s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, 1)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if len(heartbeats) == 0 {
|
|
continue
|
|
}
|
|
out = append(out, rendezvousRelayReplacementHintsFromHeartbeat(heartbeats[0], now)...)
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
func rendezvousRelayReplacementHintsFromHeartbeat(heartbeat NodeHeartbeat, now time.Time) []RendezvousRelayPolicyDecision {
|
|
if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) {
|
|
return nil
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
} else {
|
|
now = now.UTC()
|
|
}
|
|
if heartbeat.ObservedAt.IsZero() ||
|
|
heartbeat.ObservedAt.After(now.Add(time.Minute)) ||
|
|
now.Sub(heartbeat.ObservedAt.UTC()) > rendezvousRelayFeedbackMaxAge {
|
|
return nil
|
|
}
|
|
var metadata struct {
|
|
MeshRoutePathDecisionReport struct {
|
|
ClusterID string `json:"cluster_id"`
|
|
NodeID string `json:"node_id"`
|
|
Decisions []RoutePathDecision `json:"decisions"`
|
|
} `json:"mesh_route_path_decision_report"`
|
|
}
|
|
if err := json.Unmarshal(heartbeat.Metadata, &metadata); err != nil {
|
|
return nil
|
|
}
|
|
report := metadata.MeshRoutePathDecisionReport
|
|
if report.NodeID != "" && report.NodeID != heartbeat.NodeID {
|
|
return nil
|
|
}
|
|
if report.ClusterID != "" && report.ClusterID != heartbeat.ClusterID {
|
|
return nil
|
|
}
|
|
out := []RendezvousRelayPolicyDecision{}
|
|
for _, decision := range report.Decisions {
|
|
if strings.TrimSpace(decision.RouteID) == "" ||
|
|
decision.DecisionSource != "stale_relay_replacement" ||
|
|
strings.TrimSpace(decision.SelectedRelayID) == "" ||
|
|
strings.TrimSpace(decision.StaleRelayNodeID) == "" ||
|
|
decision.ProductionForwarding ||
|
|
!decision.ControlPlaneOnly ||
|
|
(!decision.ExpiresAt.IsZero() && !decision.ExpiresAt.After(now)) {
|
|
continue
|
|
}
|
|
peerNodeID := strings.TrimSpace(decision.RendezvousPeerNodeID)
|
|
if peerNodeID == "" {
|
|
peerNodeID = replacementPeerNodeIDFromDecision(decision)
|
|
}
|
|
if peerNodeID == "" {
|
|
continue
|
|
}
|
|
out = append(out, RendezvousRelayPolicyDecision{
|
|
RouteID: strings.TrimSpace(decision.RouteID),
|
|
PeerNodeID: peerNodeID,
|
|
StaleRelayNodeID: strings.TrimSpace(decision.StaleRelayNodeID),
|
|
SelectedRelayID: strings.TrimSpace(decision.SelectedRelayID),
|
|
SelectedEndpoint: strings.TrimRight(strings.TrimSpace(decision.SelectedRelayEndpoint), "/"),
|
|
Score: decision.PathScore,
|
|
Reason: "stale_relay_replacement",
|
|
ScoreReasons: append([]string{}, decision.ScoreReasons...),
|
|
ReporterNodeID: heartbeat.NodeID,
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func replacementPeerNodeIDFromDecision(decision RoutePathDecision) string {
|
|
effectiveHops := cleanRouteNodePath(decision.EffectiveHops)
|
|
selectedRelayID := strings.TrimSpace(decision.SelectedRelayID)
|
|
for index, nodeID := range effectiveHops {
|
|
if nodeID == selectedRelayID && index+1 < len(effectiveHops) {
|
|
return effectiveHops[index+1]
|
|
}
|
|
}
|
|
return strings.TrimSpace(decision.DestinationNodeID)
|
|
}
|
|
|
|
func replacementHintFeedback(hints []RendezvousRelayPolicyDecision, now time.Time) []rendezvousRelayFeedbackEntry {
|
|
if len(hints) == 0 {
|
|
return nil
|
|
}
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
} else {
|
|
now = now.UTC()
|
|
}
|
|
out := make([]rendezvousRelayFeedbackEntry, 0, len(hints))
|
|
for _, hint := range hints {
|
|
if strings.TrimSpace(hint.RouteID) == "" ||
|
|
strings.TrimSpace(hint.PeerNodeID) == "" ||
|
|
strings.TrimSpace(hint.StaleRelayNodeID) == "" ||
|
|
strings.TrimSpace(hint.SelectedRelayID) == "" {
|
|
continue
|
|
}
|
|
out = append(out, rendezvousRelayFeedbackEntry{
|
|
ReporterNodeID: strings.TrimSpace(hint.ReporterNodeID),
|
|
RouteIDs: []string{strings.TrimSpace(hint.RouteID)},
|
|
PeerNodeID: strings.TrimSpace(hint.PeerNodeID),
|
|
RelayNodeID: strings.TrimSpace(hint.StaleRelayNodeID),
|
|
ConnectionState: "replacement_hint",
|
|
Reason: "stale_relay_replacement_hint",
|
|
WithdrawalNeeded: true,
|
|
ReselectionNeeded: true,
|
|
ObservedAt: now,
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func rendezvousRelayRouteHealthFeedback(localNodeID string, route SyntheticMeshRouteConfig, links []MeshLinkObservation, now time.Time) []rendezvousRelayFeedbackEntry {
|
|
out := []rendezvousRelayFeedbackEntry{}
|
|
for _, link := range links {
|
|
item, ok := rendezvousRelayRouteHealthFeedbackFromLink(localNodeID, route, link, now)
|
|
if ok {
|
|
out = append(out, item)
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func rendezvousRelayRouteHealthFeedbackFromLink(localNodeID string, route SyntheticMeshRouteConfig, link MeshLinkObservation, now time.Time) (rendezvousRelayFeedbackEntry, bool) {
|
|
localNodeID = strings.TrimSpace(localNodeID)
|
|
if localNodeID == "" || link.SourceNodeID != localNodeID || strings.TrimSpace(route.RouteID) == "" {
|
|
return rendezvousRelayFeedbackEntry{}, false
|
|
}
|
|
if !meshLinkObservationFresh(link, now) {
|
|
return rendezvousRelayFeedbackEntry{}, false
|
|
}
|
|
metadata, ok := routeHealthMetadataFromLink(link)
|
|
if !ok ||
|
|
metadata.ObservationType != "synthetic_route_health" ||
|
|
strings.TrimSpace(metadata.RouteID) != route.RouteID ||
|
|
!metadata.RoutePathDecisionApplied ||
|
|
metadata.ProductionForwarding ||
|
|
metadata.ProductionPayloadForwarding ||
|
|
metadata.RouteHealthProductionPayloadForwarding ||
|
|
metadata.RouteHealthServicePayloadForwarding {
|
|
return rendezvousRelayFeedbackEntry{}, false
|
|
}
|
|
selectedRelayID := strings.TrimSpace(metadata.RoutePathDecisionSelectedRelayID)
|
|
if selectedRelayID == "" {
|
|
return rendezvousRelayFeedbackEntry{}, false
|
|
}
|
|
reason := ""
|
|
switch {
|
|
case metadata.RoutePathDriftDetected:
|
|
reason = "synthetic_route_health_drift"
|
|
case link.LinkStatus == "unreachable":
|
|
reason = "synthetic_route_health_unreachable"
|
|
case strings.TrimSpace(metadata.FailureReason) != "":
|
|
reason = "synthetic_route_health_failure"
|
|
default:
|
|
return rendezvousRelayFeedbackEntry{}, false
|
|
}
|
|
peerNodeID := routeHealthPeerNodeID(metadata, route, link.TargetNodeID)
|
|
if peerNodeID == "" {
|
|
return rendezvousRelayFeedbackEntry{}, false
|
|
}
|
|
return rendezvousRelayFeedbackEntry{
|
|
ReporterNodeID: link.SourceNodeID,
|
|
RouteIDs: []string{route.RouteID},
|
|
LeaseID: strings.TrimSpace(metadata.RoutePathDecisionRendezvousLeaseID),
|
|
PeerNodeID: peerNodeID,
|
|
RelayNodeID: selectedRelayID,
|
|
ConnectionState: reason,
|
|
Reason: reason,
|
|
WithdrawalNeeded: true,
|
|
ReselectionNeeded: true,
|
|
ObservedAt: link.ObservedAt.UTC(),
|
|
}, true
|
|
}
|
|
|
|
func routeHealthMetadataFromLink(link MeshLinkObservation) (meshRouteHealthObservationMetadata, bool) {
|
|
if len(link.Metadata) == 0 || !json.Valid(link.Metadata) {
|
|
return meshRouteHealthObservationMetadata{}, false
|
|
}
|
|
var metadata meshRouteHealthObservationMetadata
|
|
if err := json.Unmarshal(link.Metadata, &metadata); err != nil {
|
|
return meshRouteHealthObservationMetadata{}, false
|
|
}
|
|
return metadata, true
|
|
}
|
|
|
|
func meshLinkObservationFresh(link MeshLinkObservation, now time.Time) bool {
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
} else {
|
|
now = now.UTC()
|
|
}
|
|
return !link.ObservedAt.IsZero() &&
|
|
!link.ObservedAt.After(now.Add(time.Minute)) &&
|
|
now.Sub(link.ObservedAt.UTC()) <= rendezvousRelayFeedbackMaxAge
|
|
}
|
|
|
|
func routeHealthPeerNodeID(metadata meshRouteHealthObservationMetadata, route SyntheticMeshRouteConfig, targetNodeID string) string {
|
|
if peerNodeID := strings.TrimSpace(metadata.RoutePathDecisionRendezvousPeerNodeID); peerNodeID != "" {
|
|
return peerNodeID
|
|
}
|
|
selectedRelayID := strings.TrimSpace(metadata.RoutePathDecisionSelectedRelayID)
|
|
if peerNodeID := nodeAfterInPath(cleanRouteNodePath(metadata.ExpectedEffectiveHops), selectedRelayID); peerNodeID != "" {
|
|
return peerNodeID
|
|
}
|
|
if peerNodeID := nodeAfterInPath(cleanRouteNodePath(route.Hops), selectedRelayID); peerNodeID != "" {
|
|
return peerNodeID
|
|
}
|
|
if targetNodeID = strings.TrimSpace(targetNodeID); targetNodeID != "" {
|
|
return targetNodeID
|
|
}
|
|
return strings.TrimSpace(route.DestinationNodeID)
|
|
}
|
|
|
|
// nodeAfterInPath returns the element that follows the first occurrence of
// nodeID (trimmed) that has a successor in path. It returns "" when nodeID is
// blank, is not in path, or only appears as the last element. Elements of path
// are compared as-is, without trimming.
func nodeAfterInPath(path []string, nodeID string) string {
	nodeID = strings.TrimSpace(nodeID)
	if nodeID == "" {
		return ""
	}
	// Stop one short of the end: the last element has no successor.
	for index := 0; index+1 < len(path); index++ {
		if path[index] == nodeID {
			return path[index+1]
		}
	}
	return ""
}
|
|
|
|
func newRendezvousRelayPolicy(localNodeID string, links []MeshLinkObservation, now time.Time) *rendezvousRelayPolicy {
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
} else {
|
|
now = now.UTC()
|
|
}
|
|
return &rendezvousRelayPolicy{
|
|
localNodeID: strings.TrimSpace(localNodeID),
|
|
now: now,
|
|
links: append([]MeshLinkObservation{}, links...),
|
|
withdrawn: map[string]RendezvousRelayPolicyDecision{},
|
|
replacements: map[string]RendezvousRelayPolicyDecision{},
|
|
}
|
|
}
|
|
|
|
func (p *rendezvousRelayPolicy) addFeedback(items []rendezvousRelayFeedbackEntry) {
|
|
if p == nil {
|
|
return
|
|
}
|
|
p.feedback = append(p.feedback, items...)
|
|
}
|
|
|
|
func (p *rendezvousRelayPolicy) staleForLease(routeID string, lease PeerRendezvousLease) (rendezvousRelayFeedbackEntry, bool) {
|
|
if p == nil {
|
|
return rendezvousRelayFeedbackEntry{}, false
|
|
}
|
|
for _, item := range p.feedback {
|
|
if !rendezvousFeedbackAppliesToRoute(item, routeID) {
|
|
continue
|
|
}
|
|
if item.LeaseID != "" && lease.LeaseID != "" && item.LeaseID == lease.LeaseID {
|
|
return item, true
|
|
}
|
|
if item.PeerNodeID == lease.PeerNodeID && item.RelayNodeID == lease.RelayNodeID {
|
|
return item, true
|
|
}
|
|
}
|
|
return rendezvousRelayFeedbackEntry{}, false
|
|
}
|
|
|
|
func (p *rendezvousRelayPolicy) relayStale(routeID string, peerNodeID string, relayNodeID string) (rendezvousRelayFeedbackEntry, bool) {
|
|
if p == nil {
|
|
return rendezvousRelayFeedbackEntry{}, false
|
|
}
|
|
for _, item := range p.feedback {
|
|
if item.PeerNodeID == peerNodeID &&
|
|
item.RelayNodeID == relayNodeID &&
|
|
rendezvousFeedbackAppliesToRoute(item, routeID) {
|
|
return item, true
|
|
}
|
|
}
|
|
return rendezvousRelayFeedbackEntry{}, false
|
|
}
|
|
|
|
func (p *rendezvousRelayPolicy) hasStalePeer(routeID string, peerNodeID string) (rendezvousRelayFeedbackEntry, bool) {
|
|
if p == nil {
|
|
return rendezvousRelayFeedbackEntry{}, false
|
|
}
|
|
for _, item := range p.feedback {
|
|
if item.PeerNodeID == peerNodeID && rendezvousFeedbackAppliesToRoute(item, routeID) {
|
|
return item, true
|
|
}
|
|
}
|
|
return rendezvousRelayFeedbackEntry{}, false
|
|
}
|
|
|
|
func (p *rendezvousRelayPolicy) recordWithdrawal(route SyntheticMeshRouteConfig, lease PeerRendezvousLease, feedback rendezvousRelayFeedbackEntry) {
|
|
if p == nil {
|
|
return
|
|
}
|
|
key := route.RouteID + "\x00" + lease.LeaseID + "\x00" + lease.RelayNodeID
|
|
p.withdrawn[key] = RendezvousRelayPolicyDecision{
|
|
RouteID: route.RouteID,
|
|
PeerNodeID: lease.PeerNodeID,
|
|
WithdrawnLeaseID: lease.LeaseID,
|
|
StaleRelayNodeID: lease.RelayNodeID,
|
|
Reason: "stale_relay_withdrawn",
|
|
ReporterNodeID: feedback.ReporterNodeID,
|
|
}
|
|
}
|
|
|
|
func (p *rendezvousRelayPolicy) recordReplacement(route SyntheticMeshRouteConfig, peerNodeID string, feedback rendezvousRelayFeedbackEntry, selection rendezvousRelaySelection) {
|
|
if p == nil || selection.RelayNodeID == "" {
|
|
return
|
|
}
|
|
key := rendezvousRelayReplacementKey(route.RouteID, peerNodeID, feedback.RelayNodeID, selection.RelayNodeID)
|
|
p.replacements[key] = RendezvousRelayPolicyDecision{
|
|
RouteID: route.RouteID,
|
|
PeerNodeID: peerNodeID,
|
|
StaleRelayNodeID: feedback.RelayNodeID,
|
|
SelectedRelayID: selection.RelayNodeID,
|
|
SelectedEndpoint: selection.Endpoint,
|
|
Score: selection.Score,
|
|
Reason: "stale_relay_replacement",
|
|
ScoreReasons: append([]string{}, selection.Reasons...),
|
|
ReporterNodeID: feedback.ReporterNodeID,
|
|
}
|
|
}
|
|
|
|
func (p *rendezvousRelayPolicy) addReplacementHints(hints []RendezvousRelayPolicyDecision) {
|
|
if p == nil {
|
|
return
|
|
}
|
|
for _, hint := range hints {
|
|
hint.RouteID = strings.TrimSpace(hint.RouteID)
|
|
hint.PeerNodeID = strings.TrimSpace(hint.PeerNodeID)
|
|
hint.StaleRelayNodeID = strings.TrimSpace(hint.StaleRelayNodeID)
|
|
hint.SelectedRelayID = strings.TrimSpace(hint.SelectedRelayID)
|
|
hint.SelectedEndpoint = strings.TrimRight(strings.TrimSpace(hint.SelectedEndpoint), "/")
|
|
if hint.RouteID == "" || hint.PeerNodeID == "" || hint.StaleRelayNodeID == "" || hint.SelectedRelayID == "" {
|
|
continue
|
|
}
|
|
if hint.Reason == "" {
|
|
hint.Reason = "stale_relay_replacement"
|
|
}
|
|
if len(hint.ScoreReasons) == 0 {
|
|
hint.ScoreReasons = []string{"route_path_decision_hint"}
|
|
}
|
|
key := rendezvousRelayReplacementKey(hint.RouteID, hint.PeerNodeID, hint.StaleRelayNodeID, hint.SelectedRelayID)
|
|
existing, exists := p.replacements[key]
|
|
if !exists || hint.Score > existing.Score {
|
|
p.replacements[key] = hint
|
|
}
|
|
}
|
|
}
|
|
|
|
func (p *rendezvousRelayPolicy) report() *RendezvousRelayPolicyReport {
|
|
if p == nil || (len(p.feedback) == 0 && len(p.withdrawn) == 0 && len(p.replacements) == 0) {
|
|
return nil
|
|
}
|
|
decisions := make([]RendezvousRelayPolicyDecision, 0, len(p.withdrawn)+len(p.replacements))
|
|
for _, decision := range p.withdrawn {
|
|
decisions = append(decisions, decision)
|
|
}
|
|
for _, decision := range p.replacements {
|
|
decisions = append(decisions, decision)
|
|
}
|
|
sort.SliceStable(decisions, func(i, j int) bool {
|
|
if decisions[i].RouteID != decisions[j].RouteID {
|
|
return decisions[i].RouteID < decisions[j].RouteID
|
|
}
|
|
if decisions[i].PeerNodeID != decisions[j].PeerNodeID {
|
|
return decisions[i].PeerNodeID < decisions[j].PeerNodeID
|
|
}
|
|
if decisions[i].Reason != decisions[j].Reason {
|
|
return decisions[i].Reason < decisions[j].Reason
|
|
}
|
|
return decisions[i].SelectedRelayID < decisions[j].SelectedRelayID
|
|
})
|
|
return &RendezvousRelayPolicyReport{
|
|
SchemaVersion: "c17z15.rendezvous_relay_policy.v1",
|
|
ScoringMode: "route_adjacency_endpoint_priority_mesh_link_health_synthetic_route_health_feedback",
|
|
FeedbackMaxAgeSeconds: int(rendezvousRelayFeedbackMaxAge / time.Second),
|
|
StaleRelayCount: len(p.feedback),
|
|
WithdrawnLeaseCount: len(p.withdrawn),
|
|
ReplacementLeaseCount: len(p.replacements),
|
|
Decisions: decisions,
|
|
}
|
|
}
|
|
|
|
func (p *rendezvousRelayPolicy) replacementDecision(routeID string, peerNodeID string, selectedRelayID string) (RendezvousRelayPolicyDecision, bool) {
|
|
if p == nil {
|
|
return RendezvousRelayPolicyDecision{}, false
|
|
}
|
|
for _, decision := range p.replacements {
|
|
if decision.RouteID == routeID &&
|
|
decision.PeerNodeID == peerNodeID &&
|
|
decision.SelectedRelayID == selectedRelayID {
|
|
return decision, true
|
|
}
|
|
}
|
|
return RendezvousRelayPolicyDecision{}, false
|
|
}
|
|
|
|
// rendezvousRelayReplacementKey builds the map key for a replacement decision:
// the four IDs, each trimmed of surrounding whitespace, joined with NUL bytes
// in the order route, peer, stale relay, selected relay.
func rendezvousRelayReplacementKey(routeID string, peerNodeID string, staleRelayNodeID string, selectedRelayID string) string {
	return strings.Join([]string{
		strings.TrimSpace(routeID),
		strings.TrimSpace(peerNodeID),
		strings.TrimSpace(staleRelayNodeID),
		strings.TrimSpace(selectedRelayID),
	}, "\x00")
}
|
|
|
|
func routePathDecisionReport(generation string, decisions []RoutePathDecision) *RoutePathDecisionReport {
|
|
return routePathDecisionReportWithRecoveryPolicy(generation, decisions, defaultFabricServiceChannelRecoveryPolicy())
|
|
}
|
|
|
|
func routePathDecisionReportWithRecoveryPolicy(generation string, decisions []RoutePathDecision, policy FabricServiceChannelRecoveryPolicy) *RoutePathDecisionReport {
|
|
if len(decisions) == 0 {
|
|
return nil
|
|
}
|
|
policy = normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy())
|
|
out := append([]RoutePathDecision{}, decisions...)
|
|
sort.SliceStable(out, func(i, j int) bool {
|
|
if out[i].RouteID != out[j].RouteID {
|
|
return out[i].RouteID < out[j].RouteID
|
|
}
|
|
return out[i].DecisionID < out[j].DecisionID
|
|
})
|
|
replacements := 0
|
|
degraded := 0
|
|
rebuildRequests := 0
|
|
rebuildApplied := 0
|
|
recoveryHysteresis := 0
|
|
recoveryPromoted := 0
|
|
recoveryDemoted := 0
|
|
for _, decision := range out {
|
|
if decision.DecisionSource == "stale_relay_replacement" ||
|
|
decision.DecisionSource == "service_channel_feedback_replacement" ||
|
|
decision.DecisionSource == "service_channel_feedback_exit_pool_replacement" ||
|
|
decision.DecisionSource == "service_channel_feedback_entry_pool_replacement" ||
|
|
decision.DecisionSource == "service_channel_feedback_entry_exit_pool_replacement" ||
|
|
(decision.DecisionSource == "service_channel_remediation_command" && strings.TrimSpace(decision.ReplacementRouteID) != "") {
|
|
replacements++
|
|
}
|
|
if containsString(decision.ScoreReasons, "service_channel_recovery_hysteresis") {
|
|
recoveryHysteresis++
|
|
}
|
|
if containsString(decision.ScoreReasons, "service_channel_recovery_promoted") {
|
|
recoveryPromoted++
|
|
}
|
|
if containsString(decision.ScoreReasons, "service_channel_recovery_demoted") {
|
|
recoveryDemoted++
|
|
}
|
|
if decision.DecisionSource == "service_channel_feedback_no_alternate" || decision.RebuildStatus == "no_alternate" {
|
|
degraded++
|
|
}
|
|
switch decision.RebuildStatus {
|
|
case "requested", "pending_degraded_fallback", "no_alternate", "deferred_by_policy", "expired":
|
|
rebuildRequests++
|
|
case "applied":
|
|
rebuildRequests++
|
|
rebuildApplied++
|
|
}
|
|
}
|
|
return &RoutePathDecisionReport{
|
|
SchemaVersion: "c17z18.route_path_decisions.v1",
|
|
DecisionMode: "control_plane_effective_path_from_relay_policy_and_service_channel_feedback",
|
|
Generation: generation,
|
|
RecoveryPolicy: fabricServiceChannelRecoveryPolicyRef(policy),
|
|
DecisionCount: len(out),
|
|
ReplacementDecisionCount: replacements,
|
|
DegradedDecisionCount: degraded,
|
|
RebuildRequestCount: rebuildRequests,
|
|
RebuildAppliedCount: rebuildApplied,
|
|
RecoveryHysteresisCount: recoveryHysteresis,
|
|
RecoveryPromotedCount: recoveryPromoted,
|
|
RecoveryDemotedCount: recoveryDemoted,
|
|
ControlPlaneOnly: true,
|
|
ProductionForwarding: false,
|
|
Decisions: out,
|
|
}
|
|
}
|
|
|
|
func serviceChannelFeedbackRequestsRebuild(item fabricServiceChannelRouteFeedback) bool {
|
|
if item.RouteID == "" || !item.Fenced || item.ManualRetry {
|
|
return false
|
|
}
|
|
return item.RouteRebuildRecommended ||
|
|
item.DegradedFallbackRecommended ||
|
|
item.ConsecutiveFailures >= 2 ||
|
|
containsString(item.Reasons, "service_channel_route_rebuild_recommended")
|
|
}
|
|
|
|
// serviceChannelRebuildRequestID builds a rebuild request ID of the form
// "<route>[-<reporter>][-<generation>]-rebuild". Each part is trimmed; a blank
// route becomes "route", and blank reporter or generation parts are omitted.
func serviceChannelRebuildRequestID(routeID, reporterNodeID, generation string) string {
	parts := make([]string, 0, 4)
	if route := strings.TrimSpace(routeID); route != "" {
		parts = append(parts, route)
	} else {
		parts = append(parts, "route")
	}
	if reporter := strings.TrimSpace(reporterNodeID); reporter != "" {
		parts = append(parts, reporter)
	}
	if gen := strings.TrimSpace(generation); gen != "" {
		parts = append(parts, gen)
	}
	parts = append(parts, "rebuild")
	return strings.Join(parts, "-")
}
|
|
|
|
func (s *Service) serviceChannelRouteReplacementDecision(input GetNodeSyntheticMeshConfigInput, fencedRoute SyntheticMeshRouteConfig, intents []MeshRouteIntent, feedback map[string]fabricServiceChannelRouteFeedback, generation string) RoutePathDecision {
|
|
routeFeedback := feedback[fencedRoute.RouteID]
|
|
decision := RoutePathDecision{
|
|
DecisionID: fencedRoute.RouteID + "-path-" + input.NodeID + "-service-channel-feedback",
|
|
RouteID: fencedRoute.RouteID,
|
|
ClusterID: fencedRoute.ClusterID,
|
|
LocalNodeID: input.NodeID,
|
|
SourceNodeID: fencedRoute.SourceNodeID,
|
|
DestinationNodeID: fencedRoute.DestinationNodeID,
|
|
OriginalHops: append([]string{}, fencedRoute.Hops...),
|
|
EffectiveHops: []string{},
|
|
DecisionSource: "service_channel_feedback_no_alternate",
|
|
Generation: generation,
|
|
PathScore: 0,
|
|
ScoreReasons: []string{"service_channel_fenced_route", "no_unfenced_alternate_route"},
|
|
ControlPlaneOnly: true,
|
|
ProductionForwarding: false,
|
|
ExpiresAt: fencedRoute.ExpiresAt.UTC(),
|
|
}
|
|
applyServiceChannelFeedbackCorrelationToDecision(&decision, routeFeedback)
|
|
if serviceChannelFeedbackRequestsRebuild(routeFeedback) {
|
|
decision.RebuildRequestID = serviceChannelRebuildRequestID(fencedRoute.RouteID, input.NodeID, generation)
|
|
decision.RebuildStatus = "pending_degraded_fallback"
|
|
decision.RebuildReason = "service_channel_feedback_rebuild_requested"
|
|
decision.RebuildAttempt = routeFeedback.ConsecutiveFailures
|
|
decision.ScoreReasons = append(decision.ScoreReasons, "service_channel_rebuild_requested", "backend_relay_degraded_fallback_until_rebuild")
|
|
if routeFeedback.DegradedFallbackRecommended {
|
|
decision.ScoreReasons = append(decision.ScoreReasons, "service_channel_degraded_fallback_recommended")
|
|
}
|
|
}
|
|
replacement, replacementFeedback, ok := s.selectServiceChannelRouteReplacement(input, fencedRoute, intents, feedback)
|
|
if ok {
|
|
decision.ReplacementRouteID = replacement.RouteID
|
|
decision.EffectiveHops = append([]string{}, replacement.Hops...)
|
|
decision.DecisionSource = "service_channel_feedback_replacement"
|
|
decision.PathScore = serviceChannelReplacementRouteScore(replacement)
|
|
decision.ScoreReasons = []string{"service_channel_fenced_route", "selected_unfenced_alternate_route"}
|
|
if replacement.SourceNodeID != fencedRoute.SourceNodeID {
|
|
decision.DecisionSource = "service_channel_feedback_entry_pool_replacement"
|
|
decision.ScoreReasons = append(decision.ScoreReasons, "selected_unfenced_entry_pool_route")
|
|
}
|
|
if replacement.DestinationNodeID != fencedRoute.DestinationNodeID {
|
|
decision.DecisionSource = "service_channel_feedback_exit_pool_replacement"
|
|
decision.ScoreReasons = append(decision.ScoreReasons, "selected_unfenced_exit_pool_route")
|
|
}
|
|
if replacement.SourceNodeID != fencedRoute.SourceNodeID && replacement.DestinationNodeID != fencedRoute.DestinationNodeID {
|
|
decision.DecisionSource = "service_channel_feedback_entry_exit_pool_replacement"
|
|
decision.ScoreReasons = append(decision.ScoreReasons, "selected_unfenced_entry_exit_pool_route")
|
|
}
|
|
if decision.RebuildRequestID != "" {
|
|
decision.RebuildStatus = "applied"
|
|
decision.RebuildReason = "service_channel_feedback_rebuild_applied_to_alternate"
|
|
decision.ScoreReasons = append(decision.ScoreReasons, "service_channel_rebuild_applied")
|
|
}
|
|
if replacementFeedback.RouteID != "" && !replacementFeedback.Fenced {
|
|
decision.PathScore += 10000
|
|
decision.ScoreReasons = append(decision.ScoreReasons, "active_healthy_feedback_dampening_window")
|
|
decision.ScoreReasons = append(decision.ScoreReasons, replacementFeedback.Reasons...)
|
|
}
|
|
decision.ScoreReasons = dedupeStrings(decision.ScoreReasons)
|
|
if replacement.ExpiresAt.Before(decision.ExpiresAt) {
|
|
decision.ExpiresAt = replacement.ExpiresAt.UTC()
|
|
}
|
|
}
|
|
decision.PreviousHopID, decision.NextHopID, decision.LocalRole = routePathLocalPosition(decision.EffectiveHops, input.NodeID, "", "")
|
|
return decision
|
|
}
|
|
|
|
func applyServiceChannelFeedbackCorrelationToDecision(decision *RoutePathDecision, feedback fabricServiceChannelRouteFeedback) {
|
|
if decision == nil || feedback.RouteID == "" {
|
|
return
|
|
}
|
|
decision.FeedbackObservationID = feedback.ObservationID
|
|
decision.FeedbackSource = feedback.Source
|
|
if !feedback.ObservedAt.IsZero() {
|
|
observedAt := feedback.ObservedAt.UTC()
|
|
decision.FeedbackObservedAt = &observedAt
|
|
}
|
|
if !feedback.ExpiresAt.IsZero() {
|
|
expiresAt := feedback.ExpiresAt.UTC()
|
|
decision.FeedbackExpiresAt = &expiresAt
|
|
}
|
|
decision.FeedbackChannelID = feedback.ChannelID
|
|
decision.FeedbackResourceID = feedback.ResourceID
|
|
decision.FeedbackViolationStatus = feedback.ViolationStatus
|
|
decision.FeedbackViolationReason = feedback.ViolationReason
|
|
}
|
|
|
|
func (s *Service) selectServiceChannelRouteReplacement(input GetNodeSyntheticMeshConfigInput, fencedRoute SyntheticMeshRouteConfig, intents []MeshRouteIntent, feedback map[string]fabricServiceChannelRouteFeedback) (SyntheticMeshRouteConfig, fabricServiceChannelRouteFeedback, bool) {
|
|
var selected SyntheticMeshRouteConfig
|
|
var selectedFeedback fabricServiceChannelRouteFeedback
|
|
selectedScore := -1
|
|
scopes := fabricServiceChannelRouteIntentReplacementScopes(intents)
|
|
for _, intent := range intents {
|
|
route, _, _, _, _, ok := s.syntheticRouteFromIntent(input, intent)
|
|
if !ok || route.RouteID == fencedRoute.RouteID {
|
|
continue
|
|
}
|
|
if !fabricServiceChannelRoutesShareReplacementScope(fencedRoute, route, scopes) {
|
|
continue
|
|
}
|
|
if !fabricChannelsIntersect(route.AllowedChannels, fencedRoute.AllowedChannels) {
|
|
continue
|
|
}
|
|
if item, ok := feedback[route.RouteID]; ok && item.Fenced {
|
|
continue
|
|
}
|
|
routeFeedback := feedback[route.RouteID]
|
|
score := serviceChannelReplacementRouteScore(route) + intent.Priority
|
|
if routeFeedback.RouteID != "" {
|
|
score += 10000
|
|
}
|
|
if route.DestinationNodeID != fencedRoute.DestinationNodeID {
|
|
score -= 5
|
|
}
|
|
if route.SourceNodeID != fencedRoute.SourceNodeID {
|
|
score -= 10
|
|
}
|
|
if score > selectedScore || (score == selectedScore && route.RouteID < selected.RouteID) {
|
|
selected = route
|
|
selectedFeedback = routeFeedback
|
|
selectedScore = score
|
|
}
|
|
}
|
|
return selected, selectedFeedback, selected.RouteID != ""
|
|
}
|
|
|
|
func serviceChannelReplacementRouteScore(route SyntheticMeshRouteConfig) int {
|
|
score := 1000 - len(route.Hops)*10
|
|
if score < 1 {
|
|
return 1
|
|
}
|
|
return score
|
|
}
|
|
|
|
func routePathDecisionForRoute(route SyntheticMeshRouteConfig, localNodeID string, leases []PeerRendezvousLease, relayPolicy *rendezvousRelayPolicy, generation string, serviceFeedback fabricServiceChannelRouteFeedback) RoutePathDecision {
|
|
decision := RoutePathDecision{
|
|
DecisionID: route.RouteID + "-path-" + localNodeID,
|
|
RouteID: route.RouteID,
|
|
ClusterID: route.ClusterID,
|
|
LocalNodeID: localNodeID,
|
|
SourceNodeID: route.SourceNodeID,
|
|
DestinationNodeID: route.DestinationNodeID,
|
|
OriginalHops: append([]string{}, route.Hops...),
|
|
EffectiveHops: append([]string{}, route.Hops...),
|
|
DecisionSource: "route_intent",
|
|
Generation: generation,
|
|
PathScore: 1000,
|
|
ScoreReasons: []string{"route_intent_hops"},
|
|
ControlPlaneOnly: true,
|
|
ProductionForwarding: false,
|
|
ExpiresAt: route.ExpiresAt.UTC(),
|
|
}
|
|
if serviceFeedback.ManualRetry {
|
|
decision.ScoreReasons = append(decision.ScoreReasons, "service_channel_route_retry_after_operator_expire")
|
|
decision.ScoreReasons = append(decision.ScoreReasons, serviceFeedback.Reasons...)
|
|
decision.ScoreReasons = dedupeStrings(decision.ScoreReasons)
|
|
if serviceFeedback.RetryCooldownUntil != nil && serviceFeedback.RetryCooldownUntil.Before(decision.ExpiresAt) {
|
|
decision.ExpiresAt = serviceFeedback.RetryCooldownUntil.UTC()
|
|
}
|
|
}
|
|
var replacementLease PeerRendezvousLease
|
|
var replacementDecision RendezvousRelayPolicyDecision
|
|
replacementFound := false
|
|
for _, lease := range leases {
|
|
if !containsString(lease.RouteIDs, route.RouteID) {
|
|
continue
|
|
}
|
|
relayDecision, ok := relayPolicy.replacementDecision(route.RouteID, lease.PeerNodeID, lease.RelayNodeID)
|
|
if !ok && lease.Reason != "stale_relay_replacement" {
|
|
continue
|
|
}
|
|
if !ok {
|
|
relayDecision = RendezvousRelayPolicyDecision{
|
|
RouteID: route.RouteID,
|
|
PeerNodeID: lease.PeerNodeID,
|
|
SelectedRelayID: lease.RelayNodeID,
|
|
SelectedEndpoint: lease.RelayEndpoint,
|
|
Reason: "stale_relay_replacement",
|
|
}
|
|
}
|
|
if !replacementFound || relayDecision.Score > replacementDecision.Score {
|
|
replacementFound = true
|
|
replacementLease = lease
|
|
replacementDecision = relayDecision
|
|
}
|
|
}
|
|
if replacementFound {
|
|
decision.DecisionID = route.RouteID + "-path-" + localNodeID + "-via-" + replacementLease.RelayNodeID
|
|
decision.EffectiveHops = effectiveRoutePathWithReplacement(route.Hops, replacementLease.PeerNodeID, replacementDecision.StaleRelayNodeID, replacementLease.RelayNodeID)
|
|
decision.SelectedRelayID = replacementLease.RelayNodeID
|
|
decision.SelectedRelayEndpoint = replacementLease.RelayEndpoint
|
|
decision.StaleRelayNodeID = replacementDecision.StaleRelayNodeID
|
|
decision.RendezvousPeerNodeID = replacementLease.PeerNodeID
|
|
decision.RendezvousLeaseID = replacementLease.LeaseID
|
|
decision.RendezvousLeaseReason = replacementLease.Reason
|
|
decision.DecisionSource = "stale_relay_replacement"
|
|
decision.PathScore = replacementDecision.Score
|
|
if decision.PathScore == 0 {
|
|
decision.PathScore = 1000
|
|
}
|
|
decision.ScoreReasons = append([]string{}, replacementDecision.ScoreReasons...)
|
|
if len(decision.ScoreReasons) == 0 {
|
|
decision.ScoreReasons = []string{"relay_replacement_policy"}
|
|
}
|
|
}
|
|
decision.PreviousHopID, decision.NextHopID, decision.LocalRole = routePathLocalPosition(decision.EffectiveHops, localNodeID, decision.SelectedRelayID, decision.StaleRelayNodeID)
|
|
return decision
|
|
}
|
|
|
|
func effectiveRoutePathWithReplacement(original []string, peerNodeID string, staleRelayNodeID string, selectedRelayID string) []string {
|
|
out := make([]string, 0, len(original)+1)
|
|
for _, nodeID := range original {
|
|
nodeID = strings.TrimSpace(nodeID)
|
|
if nodeID == "" || (staleRelayNodeID != "" && nodeID == staleRelayNodeID) {
|
|
continue
|
|
}
|
|
out = append(out, nodeID)
|
|
}
|
|
if selectedRelayID == "" || containsString(out, selectedRelayID) {
|
|
return out
|
|
}
|
|
peerIndex := -1
|
|
for index, nodeID := range out {
|
|
if nodeID == peerNodeID {
|
|
peerIndex = index
|
|
break
|
|
}
|
|
}
|
|
if peerIndex < 0 {
|
|
return append(out, selectedRelayID)
|
|
}
|
|
out = append(out, "")
|
|
copy(out[peerIndex+1:], out[peerIndex:])
|
|
out[peerIndex] = selectedRelayID
|
|
return out
|
|
}
|
|
|
|
// routePathLocalPosition locates localNodeID on path and returns its previous
// hop, next hop and role, in that order.
//
// Roles: "entry" for the first hop, "exit" for the last, "selected_relay" for an
// interior hop equal to selectedRelayID, otherwise "transit". When the node is
// not on the path both hops are empty and the role is "withdrawn_relay" if the
// node is the stale relay, or "not_on_effective_path" otherwise.
func routePathLocalPosition(path []string, localNodeID string, selectedRelayID string, staleRelayNodeID string) (string, string, string) {
	position := -1
	for i := range path {
		if path[i] == localNodeID {
			position = i
			break
		}
	}
	if position < 0 {
		if staleRelayNodeID != "" && localNodeID == staleRelayNodeID {
			return "", "", "withdrawn_relay"
		}
		return "", "", "not_on_effective_path"
	}

	last := len(path) - 1
	var previousHop, nextHop string
	if position > 0 {
		previousHop = path[position-1]
	}
	if position < last {
		nextHop = path[position+1]
	}

	// Entry takes precedence over exit, so a single-hop path reports "entry".
	var role string
	if position == 0 {
		role = "entry"
	} else if position == last {
		role = "exit"
	} else if selectedRelayID != "" && localNodeID == selectedRelayID {
		role = "selected_relay"
	} else {
		role = "transit"
	}
	return previousHop, nextHop, role
}
|
|
|
|
func rendezvousFeedbackAppliesToRoute(item rendezvousRelayFeedbackEntry, routeID string) bool {
|
|
if strings.TrimSpace(routeID) == "" || len(item.RouteIDs) == 0 {
|
|
return true
|
|
}
|
|
return containsString(item.RouteIDs, routeID)
|
|
}
|
|
|
|
// reachabilityFromConnectivityMode maps a connectivity mode onto the matching
// reachability label. Matching is exact (case-sensitive, no trimming); any
// unrecognised mode yields "unknown".
func reachabilityFromConnectivityMode(connectivityMode string) string {
	reachability := "unknown"
	switch connectivityMode {
	case "direct":
		reachability = "public"
	case "private_lan":
		reachability = "private"
	case "relay_required":
		reachability = "relay"
	case "outbound_only":
		reachability = "outbound_only"
	}
	return reachability
}
|
|
|
|
func validatePeerRecoverySeeds(seeds []PeerRecoverySeed) error {
|
|
if len(seeds) > maxScopedRecoverySeeds {
|
|
return ErrInvalidPayload
|
|
}
|
|
seen := map[string]struct{}{}
|
|
for _, seed := range seeds {
|
|
key := strings.TrimSpace(seed.NodeID) + "\x00" + strings.TrimSpace(seed.Endpoint)
|
|
if strings.TrimSpace(seed.NodeID) == "" ||
|
|
strings.TrimSpace(seed.Endpoint) == "" ||
|
|
!isPeerEndpointTransport(seed.Transport) ||
|
|
(seed.ConnectivityMode != "" && !isPeerEndpointConnectivityMode(seed.ConnectivityMode)) ||
|
|
(len(seed.Metadata) > 0 && !json.Valid(seed.Metadata)) {
|
|
return ErrInvalidPayload
|
|
}
|
|
if _, duplicate := seen[key]; duplicate {
|
|
return ErrInvalidPayload
|
|
}
|
|
seen[key] = struct{}{}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func validatePeerRendezvousLeases(leases []PeerRendezvousLease, routePath []string, now time.Time) error {
|
|
if len(leases) > maxScopedRendezvousLeases {
|
|
return ErrInvalidPayload
|
|
}
|
|
now = now.UTC()
|
|
seen := map[string]struct{}{}
|
|
for _, lease := range leases {
|
|
peerNodeID := strings.TrimSpace(lease.PeerNodeID)
|
|
relayNodeID := strings.TrimSpace(lease.RelayNodeID)
|
|
relayEndpoint := strings.TrimSpace(lease.RelayEndpoint)
|
|
transport := strings.TrimSpace(lease.Transport)
|
|
if peerNodeID == "" ||
|
|
relayNodeID == "" ||
|
|
relayEndpoint == "" ||
|
|
peerNodeID == relayNodeID ||
|
|
!containsString(routePath, peerNodeID) ||
|
|
!containsString(routePath, relayNodeID) ||
|
|
(transport != "" && !isPeerRendezvousTransport(transport)) ||
|
|
(!lease.ExpiresAt.IsZero() && !lease.ExpiresAt.After(now)) ||
|
|
(len(lease.Metadata) > 0 && !json.Valid(lease.Metadata)) {
|
|
return ErrInvalidPayload
|
|
}
|
|
if strings.TrimSpace(lease.LeaseID) == "" {
|
|
continue
|
|
}
|
|
if _, duplicate := seen[lease.LeaseID]; duplicate {
|
|
return ErrInvalidPayload
|
|
}
|
|
seen[lease.LeaseID] = struct{}{}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func normalizeRendezvousLeases(leases []PeerRendezvousLease, route SyntheticMeshRouteConfig, now time.Time) []PeerRendezvousLease {
|
|
out := make([]PeerRendezvousLease, 0, len(leases))
|
|
now = now.UTC()
|
|
for _, lease := range leases {
|
|
lease.PeerNodeID = strings.TrimSpace(lease.PeerNodeID)
|
|
lease.RelayNodeID = strings.TrimSpace(lease.RelayNodeID)
|
|
lease.RelayEndpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
|
|
if lease.LeaseID == "" {
|
|
lease.LeaseID = route.RouteID + "-rv-" + lease.PeerNodeID + "-via-" + lease.RelayNodeID
|
|
}
|
|
if lease.Transport == "" {
|
|
lease.Transport = "relay_control"
|
|
}
|
|
if lease.ConnectivityMode == "" {
|
|
lease.ConnectivityMode = "relay_required"
|
|
}
|
|
if lease.Priority <= 0 {
|
|
lease.Priority = 100
|
|
}
|
|
if len(lease.RouteIDs) == 0 {
|
|
lease.RouteIDs = []string{route.RouteID}
|
|
} else if !containsString(lease.RouteIDs, route.RouteID) {
|
|
lease.RouteIDs = append(append([]string{}, lease.RouteIDs...), route.RouteID)
|
|
}
|
|
lease.AllowedChannels = controlPlaneAllowedChannels(firstNonEmptyStringSlice(lease.AllowedChannels, route.AllowedChannels))
|
|
if len(lease.AllowedChannels) == 0 {
|
|
lease.AllowedChannels = []string{"fabric_control", "route_control"}
|
|
}
|
|
lease.ControlPlaneOnly = true
|
|
if lease.IssuedAt.IsZero() {
|
|
lease.IssuedAt = now
|
|
} else {
|
|
lease.IssuedAt = lease.IssuedAt.UTC()
|
|
}
|
|
if lease.ExpiresAt.IsZero() || (!route.ExpiresAt.IsZero() && lease.ExpiresAt.After(route.ExpiresAt)) {
|
|
lease.ExpiresAt = route.ExpiresAt.UTC()
|
|
} else {
|
|
lease.ExpiresAt = lease.ExpiresAt.UTC()
|
|
}
|
|
if lease.Reason == "" {
|
|
lease.Reason = "policy_rendezvous_lease"
|
|
}
|
|
if lease.Metadata == nil {
|
|
lease.Metadata = json.RawMessage(`{}`)
|
|
}
|
|
if !lease.ExpiresAt.IsZero() && lease.ExpiresAt.After(now) {
|
|
out = append(out, lease)
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func scopedRendezvousLeases(leases []PeerRendezvousLease, route SyntheticMeshRouteConfig, localNodeID string, relayPolicy *rendezvousRelayPolicy, now time.Time) []PeerRendezvousLease {
|
|
if !containsString(route.Hops, localNodeID) {
|
|
return nil
|
|
}
|
|
normalized := normalizeRendezvousLeases(leases, route, now)
|
|
out := make([]PeerRendezvousLease, 0, len(normalized))
|
|
for _, lease := range normalized {
|
|
if feedback, stale := relayPolicy.staleForLease(route.RouteID, lease); stale {
|
|
relayPolicy.recordWithdrawal(route, lease, feedback)
|
|
continue
|
|
}
|
|
if containsString(route.Hops, lease.PeerNodeID) && containsString(route.Hops, lease.RelayNodeID) {
|
|
out = append(out, lease)
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func derivedRendezvousLeases(route SyntheticMeshRouteConfig, peers map[string]string, candidates map[string][]PeerEndpointCandidate, localNodeID string, relayPolicy *rendezvousRelayPolicy, now time.Time) []PeerRendezvousLease {
|
|
if !containsString(route.Hops, localNodeID) {
|
|
return nil
|
|
}
|
|
out := []PeerRendezvousLease{}
|
|
for peerNodeID, items := range candidates {
|
|
peerNodeID = strings.TrimSpace(peerNodeID)
|
|
if peerNodeID == "" || !containsString(route.Hops, peerNodeID) || !peerEndpointCandidatesRequireRendezvous(items) {
|
|
continue
|
|
}
|
|
selection := selectRendezvousRelay(route, peerNodeID, localNodeID, peers, candidates, relayPolicy)
|
|
if selection.RelayNodeID == "" || selection.Endpoint == "" {
|
|
continue
|
|
}
|
|
_, replacement := relayPolicy.hasStalePeer(route.RouteID, peerNodeID)
|
|
reason := rendezvousLeaseReason(items)
|
|
if replacement {
|
|
reason = "stale_relay_replacement"
|
|
}
|
|
lease := PeerRendezvousLease{
|
|
LeaseID: route.RouteID + "-rv-" + peerNodeID + "-via-" + selection.RelayNodeID,
|
|
PeerNodeID: peerNodeID,
|
|
RelayNodeID: selection.RelayNodeID,
|
|
RelayEndpoint: selection.Endpoint,
|
|
Transport: "relay_control",
|
|
ConnectivityMode: "relay_required",
|
|
RouteIDs: []string{route.RouteID},
|
|
AllowedChannels: controlPlaneAllowedChannels(route.AllowedChannels),
|
|
Priority: rendezvousLeasePriority(items),
|
|
ControlPlaneOnly: true,
|
|
IssuedAt: now.UTC(),
|
|
ExpiresAt: route.ExpiresAt.UTC(),
|
|
Reason: reason,
|
|
Metadata: rendezvousRelayLeaseMetadata(selection, replacement),
|
|
}
|
|
if len(lease.AllowedChannels) == 0 {
|
|
lease.AllowedChannels = []string{"fabric_control", "route_control"}
|
|
}
|
|
if lease.Priority <= 0 {
|
|
lease.Priority = 100
|
|
}
|
|
if lease.ExpiresAt.After(now.UTC()) {
|
|
out = append(out, lease)
|
|
if feedback, ok := relayPolicy.hasStalePeer(route.RouteID, peerNodeID); ok && feedback.RelayNodeID != selection.RelayNodeID {
|
|
relayPolicy.recordReplacement(route, peerNodeID, feedback, selection)
|
|
}
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func selectRendezvousRelay(route SyntheticMeshRouteConfig, peerNodeID string, localNodeID string, peers map[string]string, candidates map[string][]PeerEndpointCandidate, relayPolicy *rendezvousRelayPolicy) rendezvousRelaySelection {
|
|
routePath := route.Hops
|
|
peerIndex := -1
|
|
for index, nodeID := range routePath {
|
|
if nodeID == peerNodeID {
|
|
peerIndex = index
|
|
break
|
|
}
|
|
}
|
|
preferred := []string{}
|
|
if peerIndex > 0 {
|
|
preferred = append(preferred, routePath[peerIndex-1])
|
|
}
|
|
if peerIndex >= 0 && peerIndex < len(routePath)-1 {
|
|
preferred = append(preferred, routePath[peerIndex+1])
|
|
}
|
|
preferred = append(preferred, routePath...)
|
|
seen := map[string]struct{}{}
|
|
relayCandidates := []rendezvousRelaySelection{}
|
|
for _, relayNodeID := range preferred {
|
|
relayNodeID = strings.TrimSpace(relayNodeID)
|
|
if relayNodeID == "" || relayNodeID == peerNodeID {
|
|
continue
|
|
}
|
|
if _, duplicate := seen[relayNodeID]; duplicate {
|
|
continue
|
|
}
|
|
seen[relayNodeID] = struct{}{}
|
|
if _, stale := relayPolicy.relayStale(route.RouteID, peerNodeID, relayNodeID); stale {
|
|
continue
|
|
}
|
|
endpoint, endpointScore, endpointReasons := relayControlEndpointForNode(relayNodeID, peers, candidates)
|
|
if endpoint == "" {
|
|
continue
|
|
}
|
|
score, scoreReasons := rendezvousRelayCandidateScore(route.RouteID, routePath, peerIndex, relayNodeID, localNodeID, endpointScore, endpointReasons, relayPolicy)
|
|
relayCandidates = append(relayCandidates, rendezvousRelaySelection{
|
|
RelayNodeID: relayNodeID,
|
|
Endpoint: endpoint,
|
|
Score: score,
|
|
Reasons: scoreReasons,
|
|
})
|
|
}
|
|
if len(relayCandidates) == 0 {
|
|
return rendezvousRelaySelection{}
|
|
}
|
|
sort.SliceStable(relayCandidates, func(i, j int) bool {
|
|
if relayCandidates[i].Score != relayCandidates[j].Score {
|
|
return relayCandidates[i].Score > relayCandidates[j].Score
|
|
}
|
|
return relayCandidates[i].RelayNodeID < relayCandidates[j].RelayNodeID
|
|
})
|
|
return relayCandidates[0]
|
|
}
|
|
|
|
func relayControlEndpointForNode(nodeID string, peers map[string]string, candidates map[string][]PeerEndpointCandidate) (string, int, []string) {
|
|
if endpoint := strings.TrimRight(strings.TrimSpace(peers[nodeID]), "/"); isUsableHTTPControlEndpoint(endpoint) {
|
|
return endpoint, 80, []string{"reported_peer_endpoint"}
|
|
}
|
|
items := append([]PeerEndpointCandidate{}, candidates[nodeID]...)
|
|
sort.SliceStable(items, func(i, j int) bool {
|
|
if items[i].Priority != items[j].Priority {
|
|
return items[i].Priority < items[j].Priority
|
|
}
|
|
return items[i].EndpointID < items[j].EndpointID
|
|
})
|
|
for _, candidate := range items {
|
|
if endpointCandidateRequiresRendezvous(candidate) {
|
|
continue
|
|
}
|
|
endpoint := strings.TrimRight(strings.TrimSpace(candidate.Address), "/")
|
|
if isUsableHTTPControlEndpoint(endpoint) {
|
|
score := 70
|
|
reasons := []string{"endpoint_candidate"}
|
|
if candidate.Priority > 0 {
|
|
score += maxInt(0, 50-candidate.Priority)
|
|
}
|
|
if hasPolicyTag(candidate.PolicyTags, "fast-path") {
|
|
score += 25
|
|
reasons = append(reasons, "fast_path")
|
|
}
|
|
if hasPolicyTag(candidate.PolicyTags, "same-site") || hasPolicyTag(candidate.PolicyTags, "corp-lan") || hasPolicyTag(candidate.PolicyTags, "private-lan") {
|
|
score += 20
|
|
reasons = append(reasons, "same_site")
|
|
}
|
|
if strings.EqualFold(candidate.ConnectivityMode, "direct") {
|
|
score += 10
|
|
reasons = append(reasons, "direct")
|
|
}
|
|
return endpoint, score, reasons
|
|
}
|
|
}
|
|
return "", 0, nil
|
|
}
|
|
|
|
func rendezvousRelayCandidateScore(routeID string, routePath []string, peerIndex int, relayNodeID string, localNodeID string, endpointScore int, endpointReasons []string, relayPolicy *rendezvousRelayPolicy) (int, []string) {
|
|
score := 500 + endpointScore
|
|
reasons := append([]string{}, endpointReasons...)
|
|
relayIndex := -1
|
|
for index, nodeID := range routePath {
|
|
if nodeID == relayNodeID {
|
|
relayIndex = index
|
|
break
|
|
}
|
|
}
|
|
if peerIndex >= 0 && relayIndex >= 0 {
|
|
distance := absInt(peerIndex - relayIndex)
|
|
switch {
|
|
case distance == 1:
|
|
score += 180
|
|
reasons = append(reasons, "adjacent_to_peer")
|
|
case distance == 2:
|
|
score += 120
|
|
reasons = append(reasons, "near_peer")
|
|
default:
|
|
score += maxInt(0, 80-distance*10)
|
|
reasons = append(reasons, "route_path_candidate")
|
|
}
|
|
}
|
|
if relayIndex == 0 && len(routePath) > 2 {
|
|
score -= 120
|
|
reasons = append(reasons, "entry_relay_fallback")
|
|
}
|
|
if relayNodeID == localNodeID {
|
|
score += 40
|
|
reasons = append(reasons, "local_entry_relay")
|
|
}
|
|
linkScore, linkReasons := rendezvousRelayLinkScore(relayNodeID, relayPolicy)
|
|
score += linkScore
|
|
reasons = append(reasons, linkReasons...)
|
|
routeHealthScore, routeHealthReasons := rendezvousRelayRouteHealthScore(routeID, relayNodeID, relayPolicy)
|
|
score += routeHealthScore
|
|
reasons = append(reasons, routeHealthReasons...)
|
|
return score, reasons
|
|
}
|
|
|
|
func rendezvousRelayLinkScore(relayNodeID string, relayPolicy *rendezvousRelayPolicy) (int, []string) {
|
|
if relayPolicy == nil || relayPolicy.localNodeID == "" {
|
|
return 0, nil
|
|
}
|
|
var latest *MeshLinkObservation
|
|
for i := range relayPolicy.links {
|
|
link := &relayPolicy.links[i]
|
|
if link.SourceNodeID != relayPolicy.localNodeID || link.TargetNodeID != relayNodeID {
|
|
continue
|
|
}
|
|
if !link.ObservedAt.IsZero() && relayPolicy.now.Sub(link.ObservedAt.UTC()) > rendezvousRelayFeedbackMaxAge {
|
|
continue
|
|
}
|
|
if latest == nil || link.ObservedAt.After(latest.ObservedAt) {
|
|
latest = link
|
|
}
|
|
}
|
|
if latest == nil {
|
|
return 0, nil
|
|
}
|
|
switch latest.LinkStatus {
|
|
case "reachable":
|
|
score := 60
|
|
reasons := []string{"mesh_link_reachable"}
|
|
if latest.QualityScore != nil {
|
|
score += *latest.QualityScore
|
|
reasons = append(reasons, "mesh_link_quality")
|
|
}
|
|
if latest.LatencyMs != nil {
|
|
score += maxInt(0, 80-*latest.LatencyMs)
|
|
reasons = append(reasons, "mesh_link_latency")
|
|
}
|
|
return score, reasons
|
|
case "unreachable":
|
|
return -250, []string{"mesh_link_unreachable"}
|
|
default:
|
|
return 0, nil
|
|
}
|
|
}
|
|
|
|
func rendezvousRelayRouteHealthScore(routeID string, relayNodeID string, relayPolicy *rendezvousRelayPolicy) (int, []string) {
|
|
if relayPolicy == nil || relayPolicy.localNodeID == "" {
|
|
return 0, nil
|
|
}
|
|
routeID = strings.TrimSpace(routeID)
|
|
relayNodeID = strings.TrimSpace(relayNodeID)
|
|
if routeID == "" || relayNodeID == "" {
|
|
return 0, nil
|
|
}
|
|
var latest *MeshLinkObservation
|
|
var latestMetadata meshRouteHealthObservationMetadata
|
|
for i := range relayPolicy.links {
|
|
link := &relayPolicy.links[i]
|
|
if link.SourceNodeID != relayPolicy.localNodeID || !meshLinkObservationFresh(*link, relayPolicy.now) {
|
|
continue
|
|
}
|
|
metadata, ok := routeHealthMetadataFromLink(*link)
|
|
if !ok ||
|
|
metadata.ObservationType != "synthetic_route_health" ||
|
|
strings.TrimSpace(metadata.RouteID) != routeID ||
|
|
strings.TrimSpace(metadata.RoutePathDecisionSelectedRelayID) != relayNodeID ||
|
|
metadata.ProductionForwarding ||
|
|
metadata.ProductionPayloadForwarding ||
|
|
metadata.RouteHealthProductionPayloadForwarding ||
|
|
metadata.RouteHealthServicePayloadForwarding {
|
|
continue
|
|
}
|
|
if latest == nil || link.ObservedAt.After(latest.ObservedAt) {
|
|
latest = link
|
|
latestMetadata = metadata
|
|
}
|
|
}
|
|
if latest == nil {
|
|
return 0, nil
|
|
}
|
|
if latestMetadata.RoutePathDriftDetected {
|
|
return -360, []string{"route_health_drift"}
|
|
}
|
|
if latest.LinkStatus == "unreachable" || strings.TrimSpace(latestMetadata.FailureReason) != "" {
|
|
return -320, []string{"route_health_unreachable"}
|
|
}
|
|
if latest.LinkStatus != "reachable" {
|
|
return 0, nil
|
|
}
|
|
score := 90
|
|
reasons := []string{"route_health_reachable", "route_health_no_drift"}
|
|
if latest.QualityScore != nil {
|
|
score += *latest.QualityScore
|
|
reasons = append(reasons, "route_health_quality")
|
|
}
|
|
if latest.LatencyMs != nil {
|
|
score += maxInt(0, 100-*latest.LatencyMs)
|
|
reasons = append(reasons, "route_health_latency")
|
|
}
|
|
return score, reasons
|
|
}
|
|
|
|
func rendezvousRelayLeaseMetadata(selection rendezvousRelaySelection, replacement bool) json.RawMessage {
|
|
payload := map[string]any{
|
|
"source": "control-plane",
|
|
"derived_from": "endpoint_candidate",
|
|
"lease_refresh_contract": "node_scoped_synthetic_config_get",
|
|
"relay_replacement_contract": "stale_relay_feedback_policy",
|
|
"relay_selection_score": selection.Score,
|
|
"relay_selection_score_reasons": selection.Reasons,
|
|
"production_payload_forwarding": false,
|
|
}
|
|
if replacement {
|
|
payload["replacement_for_stale_relay"] = true
|
|
}
|
|
raw, err := json.Marshal(payload)
|
|
if err != nil {
|
|
return json.RawMessage(`{"source":"control-plane","derived_from":"endpoint_candidate","lease_refresh_contract":"node_scoped_synthetic_config_get","relay_replacement_contract":"stale_relay_feedback_policy","production_payload_forwarding":false}`)
|
|
}
|
|
return raw
|
|
}
|
|
|
|
// hasPolicyTag reports whether tags contains want, comparing after trimming
// whitespace and lower-casing both sides.
func hasPolicyTag(tags []string, want string) bool {
	target := strings.ToLower(strings.TrimSpace(want))
	for i := range tags {
		if normalized := strings.ToLower(strings.TrimSpace(tags[i])); normalized == target {
			return true
		}
	}
	return false
}
|
|
|
|
// maxInt returns the larger of a and b.
func maxInt(a int, b int) int {
	if b >= a {
		return b
	}
	return a
}
|
|
|
|
// absInt returns the absolute value of value.
func absInt(value int) int {
	if value >= 0 {
		return value
	}
	return -value
}
|
|
|
|
func peerEndpointCandidatesRequireRendezvous(candidates []PeerEndpointCandidate) bool {
|
|
for _, candidate := range candidates {
|
|
if endpointCandidateRequiresRendezvous(candidate) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func endpointCandidateRequiresRendezvous(candidate PeerEndpointCandidate) bool {
|
|
transport := strings.ToLower(strings.TrimSpace(candidate.Transport))
|
|
reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability))
|
|
connectivityMode := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
|
|
return strings.Contains(transport, "relay") ||
|
|
strings.Contains(transport, "outbound") ||
|
|
reachability == "relay" ||
|
|
reachability == "outbound_only" ||
|
|
connectivityMode == "relay_required" ||
|
|
connectivityMode == "outbound_only"
|
|
}
|
|
|
|
func rendezvousLeasePriority(candidates []PeerEndpointCandidate) int {
|
|
priority := 0
|
|
for _, candidate := range candidates {
|
|
if !endpointCandidateRequiresRendezvous(candidate) {
|
|
continue
|
|
}
|
|
if priority == 0 || (candidate.Priority > 0 && candidate.Priority < priority) {
|
|
priority = candidate.Priority
|
|
}
|
|
}
|
|
return priority
|
|
}
|
|
|
|
func rendezvousLeaseReason(candidates []PeerEndpointCandidate) string {
|
|
for _, candidate := range candidates {
|
|
connectivityMode := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
|
|
reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability))
|
|
if connectivityMode == "outbound_only" || reachability == "outbound_only" {
|
|
return "auto_outbound_only"
|
|
}
|
|
if connectivityMode == "relay_required" || reachability == "relay" {
|
|
return "auto_relay_required"
|
|
}
|
|
}
|
|
return "auto_rendezvous_required"
|
|
}
|
|
|
|
func mergeRendezvousLeases(out map[string]PeerRendezvousLease, leases []PeerRendezvousLease) {
|
|
for _, lease := range leases {
|
|
if lease.Metadata == nil {
|
|
lease.Metadata = json.RawMessage(`{}`)
|
|
}
|
|
key := strings.TrimSpace(lease.LeaseID)
|
|
if key == "" {
|
|
key = lease.PeerNodeID + "\x00" + lease.RelayNodeID + "\x00" + lease.RelayEndpoint
|
|
}
|
|
existing, ok := out[key]
|
|
if !ok || lease.Priority < existing.Priority || existing.ExpiresAt.Before(lease.ExpiresAt) {
|
|
out[key] = lease
|
|
}
|
|
}
|
|
}
|
|
|
|
func sortedRendezvousLeases(items map[string]PeerRendezvousLease, limit int) []PeerRendezvousLease {
|
|
out := make([]PeerRendezvousLease, 0, len(items))
|
|
for _, item := range items {
|
|
out = append(out, item)
|
|
}
|
|
sort.SliceStable(out, func(i, j int) bool {
|
|
if out[i].Priority != out[j].Priority {
|
|
return out[i].Priority < out[j].Priority
|
|
}
|
|
if out[i].PeerNodeID != out[j].PeerNodeID {
|
|
return out[i].PeerNodeID < out[j].PeerNodeID
|
|
}
|
|
if out[i].RelayNodeID != out[j].RelayNodeID {
|
|
return out[i].RelayNodeID < out[j].RelayNodeID
|
|
}
|
|
return out[i].LeaseID < out[j].LeaseID
|
|
})
|
|
if len(out) > limit {
|
|
out = out[:limit]
|
|
}
|
|
return out
|
|
}
|
|
|
|
func markPeerDirectoryRendezvousLeases(directory map[string]*PeerDirectoryEntry, leases []PeerRendezvousLease, localNodeID string) {
|
|
for _, lease := range leases {
|
|
if lease.PeerNodeID != "" && lease.PeerNodeID != localNodeID {
|
|
entry := peerDirectoryEntry(directory, lease.PeerNodeID)
|
|
entry.CandidateCount++
|
|
if !containsString(entry.ConnectivityModes, "relay_required") {
|
|
entry.ConnectivityModes = append(entry.ConnectivityModes, "relay_required")
|
|
}
|
|
}
|
|
if lease.RelayNodeID != "" && lease.RelayNodeID != localNodeID {
|
|
entry := peerDirectoryEntry(directory, lease.RelayNodeID)
|
|
entry.EndpointCount++
|
|
if !containsString(entry.ConnectivityModes, "relay_control") {
|
|
entry.ConnectivityModes = append(entry.ConnectivityModes, "relay_control")
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func mergePeerDirectoryRoute(directory map[string]*PeerDirectoryEntry, route SyntheticMeshRouteConfig, localNodeID string) {
|
|
for _, nodeID := range route.Hops {
|
|
nodeID = strings.TrimSpace(nodeID)
|
|
if nodeID == "" || nodeID == localNodeID {
|
|
continue
|
|
}
|
|
entry := peerDirectoryEntry(directory, nodeID)
|
|
if !containsString(entry.RouteIDs, route.RouteID) {
|
|
entry.RouteIDs = append(entry.RouteIDs, route.RouteID)
|
|
}
|
|
}
|
|
}
|
|
|
|
func mergePeerDirectoryCandidates(directory map[string]*PeerDirectoryEntry, nodeID string, candidates []PeerEndpointCandidate) {
|
|
entry := peerDirectoryEntry(directory, nodeID)
|
|
entry.CandidateCount += len(candidates)
|
|
for _, candidate := range candidates {
|
|
if strings.TrimSpace(candidate.ConnectivityMode) != "" && !containsString(entry.ConnectivityModes, candidate.ConnectivityMode) {
|
|
entry.ConnectivityModes = append(entry.ConnectivityModes, candidate.ConnectivityMode)
|
|
}
|
|
}
|
|
}
|
|
|
|
func peerDirectoryEntry(directory map[string]*PeerDirectoryEntry, nodeID string) *PeerDirectoryEntry {
|
|
if entry, ok := directory[nodeID]; ok {
|
|
return entry
|
|
}
|
|
entry := &PeerDirectoryEntry{NodeID: nodeID}
|
|
directory[nodeID] = entry
|
|
return entry
|
|
}
|
|
|
|
func mergeRecoverySeeds(out map[string]PeerRecoverySeed, seeds []PeerRecoverySeed) {
|
|
for _, seed := range seeds {
|
|
if seed.Metadata == nil {
|
|
seed.Metadata = json.RawMessage(`{}`)
|
|
}
|
|
key := seed.NodeID + "\x00" + seed.Endpoint
|
|
existing, ok := out[key]
|
|
if !ok || seed.Priority < existing.Priority {
|
|
out[key] = seed
|
|
}
|
|
}
|
|
}
|
|
|
|
func sortedRecoverySeeds(items map[string]PeerRecoverySeed, limit int) []PeerRecoverySeed {
|
|
out := make([]PeerRecoverySeed, 0, len(items))
|
|
for _, item := range items {
|
|
out = append(out, item)
|
|
}
|
|
sort.SliceStable(out, func(i, j int) bool {
|
|
if out[i].Priority != out[j].Priority {
|
|
return out[i].Priority < out[j].Priority
|
|
}
|
|
if out[i].NodeID != out[j].NodeID {
|
|
return out[i].NodeID < out[j].NodeID
|
|
}
|
|
return out[i].Endpoint < out[j].Endpoint
|
|
})
|
|
if len(out) > limit {
|
|
out = out[:limit]
|
|
}
|
|
return out
|
|
}
|
|
|
|
func markPeerDirectoryRecoverySeeds(directory map[string]*PeerDirectoryEntry, seeds []PeerRecoverySeed) {
|
|
for _, seed := range seeds {
|
|
entry := peerDirectoryEntry(directory, seed.NodeID)
|
|
entry.RecoverySeed = true
|
|
if strings.TrimSpace(seed.ConnectivityMode) != "" && !containsString(entry.ConnectivityModes, seed.ConnectivityMode) {
|
|
entry.ConnectivityModes = append(entry.ConnectivityModes, seed.ConnectivityMode)
|
|
}
|
|
}
|
|
}
|
|
|
|
func sortedPeerDirectory(items map[string]*PeerDirectoryEntry) []PeerDirectoryEntry {
|
|
out := make([]PeerDirectoryEntry, 0, len(items))
|
|
for _, entry := range items {
|
|
sort.Strings(entry.RouteIDs)
|
|
sort.Strings(entry.ConnectivityModes)
|
|
if entry.NodeID != "" {
|
|
out = append(out, *entry)
|
|
}
|
|
}
|
|
sort.SliceStable(out, func(i, j int) bool {
|
|
return out[i].NodeID < out[j].NodeID
|
|
})
|
|
return out
|
|
}
|
|
|
|
func validatePeerEndpointCandidates(candidates map[string][]PeerEndpointCandidate, routePath []string) error {
|
|
if len(candidates) == 0 {
|
|
return nil
|
|
}
|
|
for nodeID, items := range candidates {
|
|
if strings.TrimSpace(nodeID) == "" || !containsString(routePath, nodeID) {
|
|
return ErrInvalidPayload
|
|
}
|
|
for _, candidate := range items {
|
|
if strings.TrimSpace(candidate.EndpointID) == "" ||
|
|
strings.TrimSpace(candidate.NodeID) == "" ||
|
|
candidate.NodeID != nodeID ||
|
|
strings.TrimSpace(candidate.Address) == "" ||
|
|
!isPeerEndpointTransport(candidate.Transport) ||
|
|
!isPeerEndpointReachability(candidate.Reachability) ||
|
|
!isPeerEndpointConnectivityMode(candidate.ConnectivityMode) ||
|
|
(candidate.NATType != "" && !isPeerEndpointNATType(candidate.NATType)) {
|
|
return ErrInvalidPayload
|
|
}
|
|
if len(candidate.Metadata) > 0 && !json.Valid(candidate.Metadata) {
|
|
return ErrInvalidPayload
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func scopedPeerEndpoints(peers map[string]string, routePath []string) map[string]string {
|
|
out := map[string]string{}
|
|
for nodeID, endpoint := range peers {
|
|
endpoint = strings.TrimSpace(endpoint)
|
|
if containsString(routePath, nodeID) && endpoint != "" && !isUnusableLocalPeerEndpoint(endpoint) {
|
|
out[nodeID] = endpoint
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func scopedPeerEndpointCandidates(candidates map[string][]PeerEndpointCandidate, routePath []string) map[string][]PeerEndpointCandidate {
|
|
out := map[string][]PeerEndpointCandidate{}
|
|
for nodeID, items := range candidates {
|
|
if !containsString(routePath, nodeID) {
|
|
continue
|
|
}
|
|
for _, candidate := range items {
|
|
if isUnusableLocalPeerEndpoint(candidate.Address) {
|
|
continue
|
|
}
|
|
if candidate.Metadata == nil {
|
|
candidate.Metadata = json.RawMessage(`{}`)
|
|
}
|
|
out[nodeID] = append(out[nodeID], candidate)
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// isPeerEndpointTransport reports whether value is one of the accepted peer
// endpoint transports. Matching is exact.
func isPeerEndpointTransport(value string) bool {
	for _, allowed := range [...]string{"direct_http", "direct_tcp_tls", "wss", "relay", "outbound_reverse"} {
		if value == allowed {
			return true
		}
	}
	return false
}
|
|
|
|
// isPeerRendezvousTransport reports whether value is one of the accepted
// rendezvous lease transports. Matching is exact.
func isPeerRendezvousTransport(value string) bool {
	for _, allowed := range [...]string{"relay_control", "relay", "wss", "direct_tcp_tls"} {
		if value == allowed {
			return true
		}
	}
	return false
}
|
|
|
|
// isPeerEndpointReachability reports whether value is one of the accepted
// reachability labels. Matching is exact.
func isPeerEndpointReachability(value string) bool {
	for _, allowed := range [...]string{"public", "private", "relay", "outbound_only", "unknown"} {
		if value == allowed {
			return true
		}
	}
	return false
}
|
|
|
|
// isPeerEndpointConnectivityMode reports whether value is one of the accepted
// connectivity modes. Matching is exact.
func isPeerEndpointConnectivityMode(value string) bool {
	for _, allowed := range [...]string{"direct", "private_lan", "relay_required", "outbound_only", "unknown"} {
		if value == allowed {
			return true
		}
	}
	return false
}
|
|
|
|
// isPeerEndpointNATType reports whether value is one of the accepted NAT type
// labels. Matching is exact.
func isPeerEndpointNATType(value string) bool {
	for _, allowed := range [...]string{"unknown", "none", "full_cone", "restricted", "port_restricted", "symmetric", "blocked"} {
		if value == allowed {
			return true
		}
	}
	return false
}
|
|
|
|
func controlPlaneAllowedChannels(channels []string) []string {
|
|
out := []string{}
|
|
for _, channel := range channels {
|
|
channel = strings.TrimSpace(channel)
|
|
switch channel {
|
|
case "fabric_control", "route_control":
|
|
if !containsString(out, channel) {
|
|
out = append(out, channel)
|
|
}
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// isHTTPControlEndpoint reports whether endpoint, after trimming and
// lower-casing, starts with "http://" or "https://".
func isHTTPControlEndpoint(endpoint string) bool {
	normalized := strings.ToLower(strings.TrimSpace(endpoint))
	for _, scheme := range [...]string{"http://", "https://"} {
		if strings.HasPrefix(normalized, scheme) {
			return true
		}
	}
	return false
}
|
|
|
|
func isUsableHTTPControlEndpoint(endpoint string) bool {
|
|
return isHTTPControlEndpoint(endpoint) && !isUnusableLocalPeerEndpoint(endpoint)
|
|
}
|
|
|
|
func isUnusableLocalPeerEndpoint(endpoint string) bool {
|
|
host := peerEndpointHost(endpoint)
|
|
if host == "" {
|
|
return false
|
|
}
|
|
if strings.EqualFold(host, "localhost") {
|
|
return true
|
|
}
|
|
ip := net.ParseIP(host)
|
|
return ip != nil && (ip.IsLoopback() || ip.IsUnspecified())
|
|
}
|
|
|
|
// peerEndpointHost extracts the host part of a peer endpoint, without port or
// IPv6 brackets. It accepts URLs ("https://relay.example:8443/control"),
// host:port pairs ("10.0.0.1:443", "[::1]:8080") and bare hosts. Surrounding
// whitespace and trailing slashes are ignored; an empty endpoint yields "".
//
// Scheme-qualified endpoints are parsed as URLs first. net.SplitHostPort does
// not validate the port, so given "http://localhost" it would report host
// "http" and port "//localhost", hiding the real host from the loopback checks
// built on this function.
func peerEndpointHost(endpoint string) string {
	endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/")
	if endpoint == "" {
		return ""
	}

	// hostFromURL returns the host of endpoint parsed as a URL, if it has one.
	hostFromURL := func() (string, bool) {
		parsed, err := url.Parse(endpoint)
		if err != nil || parsed.Host == "" {
			return "", false
		}
		if host, _, err := net.SplitHostPort(parsed.Host); err == nil {
			return strings.Trim(host, "[]"), true
		}
		return strings.Trim(parsed.Host, "[]"), true
	}

	hasScheme := strings.Contains(endpoint, "://")
	if hasScheme {
		if host, ok := hostFromURL(); ok {
			return host
		}
	}
	if host, _, err := net.SplitHostPort(endpoint); err == nil {
		return strings.Trim(host, "[]")
	}
	if !hasScheme {
		if host, ok := hostFromURL(); ok {
			return host
		}
	}
	// Neither form matched: treat the whole value as a bare host.
	return strings.Trim(endpoint, "[]")
}
|
|
|
|
func firstNodeID(selector nodeSelector) string {
|
|
if strings.TrimSpace(selector.NodeID) != "" {
|
|
return strings.TrimSpace(selector.NodeID)
|
|
}
|
|
for _, nodeID := range selector.NodeIDs {
|
|
if strings.TrimSpace(nodeID) != "" {
|
|
return strings.TrimSpace(nodeID)
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// cleanRouteNodePath returns a new slice with every value trimmed and blank
// values removed, preserving order. The result is never nil.
func cleanRouteNodePath(values []string) []string {
	cleaned := make([]string, 0, len(values))
	for i := range values {
		if trimmed := strings.TrimSpace(values[i]); trimmed != "" {
			cleaned = append(cleaned, trimmed)
		}
	}
	return cleaned
}
|
|
|
|
// containsString reports whether values contains needle, comparing both sides
// after trimming whitespace. A blank needle never matches.
func containsString(values []string, needle string) bool {
	target := strings.TrimSpace(needle)
	if target == "" {
		return false
	}
	for i := range values {
		if strings.TrimSpace(values[i]) == target {
			return true
		}
	}
	return false
}
|
|
|
|
func appendMissingString(values []string, value string) []string {
|
|
if containsString(values, value) {
|
|
return values
|
|
}
|
|
return append(values, value)
|
|
}
|
|
|
|
// generateFencingToken returns a new token made of the prefix "rap_vpn_fence_"
// followed by 32 bytes from crypto/rand encoded as lower-case hex. It returns
// an empty string and the error when the random source fails.
func generateFencingToken() (string, error) {
	var secret [32]byte
	if _, err := rand.Read(secret[:]); err != nil {
		return "", err
	}
	return "rap_vpn_fence_" + hex.EncodeToString(secret[:]), nil
}
|