5274 lines
207 KiB
Go
5274 lines
207 KiB
Go
package main
|
|
|
|
import (
|
|
"context"
|
|
"crypto/rand"
|
|
"crypto/rsa"
|
|
"crypto/sha256"
|
|
"crypto/tls"
|
|
"crypto/x509"
|
|
"crypto/x509/pkix"
|
|
"encoding/hex"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"log"
|
|
"math/big"
|
|
"net"
|
|
"net/http"
|
|
"os"
|
|
"os/exec"
|
|
"os/signal"
|
|
"path/filepath"
|
|
"runtime"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"sync/atomic"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/agent"
|
|
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority"
|
|
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
|
|
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/config"
|
|
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
|
|
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/hostagent"
|
|
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh"
|
|
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
|
|
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/supervisor"
|
|
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/vpnruntime"
|
|
)
|
|
|
|
// Tunables and wire identifiers for the mesh rendezvous, route-health and VPN
// fabric reporting code in this file. Grouped by kind; values are unchanged.
const (
	// Timing knobs (refresh windows, back-offs and maximum observation age).
	meshRendezvousLeaseRenewalWindow      = time.Minute
	meshRendezvousLeaseRefreshBackoff     = 30 * time.Second
	meshSyntheticConfigRefreshInterval    = 20 * time.Second
	meshRouteHealthFeedbackRefreshBackoff = 5 * time.Second
	vpnFabricEndpointObservationMaxAge    = 30 * time.Minute

	// Entry-count caps for reports and for the endpoint observation store.
	maxMeshRendezvousLeaseReportEntries     = 20
	maxVPNFabricEndpointHealthReportEntries = 32
	maxVPNFabricEndpointObservationEntries  = 256

	// Report schema identifiers.
	meshRendezvousLeaseReportSchema      = "c17z18.mesh_rendezvous_lease_report.v1"
	meshRoutePathDecisionReportSchema    = "c17z18.mesh_route_path_decision_report.v1"
	meshRouteGenerationReportSchema      = "c17z18.mesh_route_generation_report.v1"
	meshRouteHealthConfigReportSchema    = "c17z20.mesh_route_health_config_report.v1"
	meshRouteHealthFeedbackRefreshSchema = "c17z20.mesh_route_health_feedback_refresh_report.v1"

	// Capability names.
	meshRendezvousLeaseTelemetryCapability   = "mesh_rendezvous_lease_telemetry"
	meshRendezvousLeaseRefreshCapability     = "mesh_rendezvous_lease_refresh_contract"
	meshRendezvousRelayReplacementCapability = "mesh_rendezvous_relay_replacement_contract"
	meshRoutePathDecisionCapability          = "mesh_route_path_decision_contract"
	meshRouteGenerationTrackerCapability     = "mesh_route_generation_tracker"
	meshRouteHealthConfigCapability          = "mesh_route_health_config_from_path_decisions"
	meshRouteHealthFeedbackRefreshCapability = "mesh_route_health_feedback_refresh_contract"
)
|
|
|
|
func main() {
|
|
cfg, err := config.Load(os.Args[1:], nil)
|
|
if err != nil {
|
|
log.Fatalf("load config: %v", err)
|
|
}
|
|
|
|
signalCtx, stopSignals := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
|
defer stopSignals()
|
|
ctx, cancel := context.WithCancel(signalCtx)
|
|
defer cancel()
|
|
|
|
identity, err := state.LoadOrCreate(cfg.StateDir, cfg.ClusterID, cfg.NodeName)
|
|
if err != nil {
|
|
log.Fatalf("load identity state: %v", err)
|
|
}
|
|
|
|
api := client.New(cfg.BackendURL)
|
|
if identity.NodeID == "" {
|
|
identity, err = ensureApprovedIdentity(ctx, cfg, identity, api)
|
|
if err != nil {
|
|
log.Fatalf("bootstrap node identity: %v", err)
|
|
}
|
|
if identity.NodeID == "" {
|
|
log.Printf("enrollment still pending: join_request_id=%s identity_file=%s", identity.PendingJoinRequestID, filepath.Join(cfg.StateDir, state.FileName))
|
|
return
|
|
}
|
|
}
|
|
|
|
log.Printf("node-agent started: node_id=%s cluster_id=%s backend=%s", identity.NodeID, identity.ClusterID, cfg.BackendURL)
|
|
vpnGateway := &vpnruntime.Gateway{API: api}
|
|
meshState, stopMeshEndpoint, err := startSyntheticMeshEndpoint(ctx, cancel, cfg, identity, api, vpnGateway)
|
|
if err != nil {
|
|
log.Fatalf("start synthetic mesh endpoint: %v", err)
|
|
}
|
|
defer stopMeshEndpoint()
|
|
|
|
supervisor := supervisor.StubSupervisor{
|
|
Version: agent.Version,
|
|
RemoteWorkspaceRealAdapter: supervisor.RemoteWorkspaceRealAdapterConfig{
|
|
EnabledRequested: cfg.RemoteWorkspaceRealAdapterEnabled,
|
|
Command: cfg.RemoteWorkspaceRealAdapterCommand,
|
|
ArgsJSON: cfg.RemoteWorkspaceRealAdapterArgsJSON,
|
|
WorkDir: cfg.RemoteWorkspaceRealAdapterWorkDir,
|
|
},
|
|
}
|
|
startedAt := time.Now().UTC()
|
|
ticker := time.NewTicker(cfg.HeartbeatInterval)
|
|
defer ticker.Stop()
|
|
for {
|
|
flags, err := sendHeartbeat(ctx, api, cfg, identity, meshState)
|
|
if err != nil {
|
|
log.Printf("heartbeat failed: %v", err)
|
|
}
|
|
if flags.Enabled && flags.TelemetryEnabled {
|
|
telemetry := agent.TelemetryPayload(identity, startedAt)
|
|
if telemetry.Payload == nil {
|
|
telemetry.Payload = map[string]any{}
|
|
}
|
|
if meshState != nil && meshState.ServiceChannelAccessStats != nil {
|
|
telemetry.Payload["fabric_service_channel_access_report"] = meshState.ServiceChannelAccessStats.Report(time.Now().UTC())
|
|
}
|
|
if meshState != nil && meshState.RemoteWorkspaceFrameSink != nil {
|
|
telemetry.Payload["remote_workspace_adapter_sink_report"] = meshState.RemoteWorkspaceFrameSink.Report(time.Now().UTC())
|
|
}
|
|
if err := api.ReportTelemetry(ctx, identity.ClusterID, identity.NodeID, telemetry); err != nil {
|
|
log.Printf("telemetry failed: %v", err)
|
|
} else {
|
|
log.Printf("telemetry sent: node_id=%s cluster_id=%s scopes=%v", identity.NodeID, identity.ClusterID, flags.AppliedScopes)
|
|
}
|
|
}
|
|
if cfg.WorkloadSupervisionEnabled {
|
|
if err := reportWorkloadStatus(ctx, api, supervisor, identity, meshState); err != nil {
|
|
log.Printf("workload status failed: %v", err)
|
|
}
|
|
}
|
|
if err := ensureVPNGatewayRuntime(ctx, api, cfg, identity, vpnGateway, meshState); err != nil {
|
|
log.Printf("vpn gateway runtime failed: %v", err)
|
|
}
|
|
if err := reportVPNAssignmentStatus(ctx, api, identity, vpnGateway); err != nil {
|
|
log.Printf("vpn assignment status failed: %v", err)
|
|
}
|
|
logProductionObservationSinkMetrics(meshState)
|
|
if flags.Enabled && flags.SyntheticLinksEnabled {
|
|
if err := api.ReportMeshLink(ctx, identity.ClusterID, agent.MeshSelfObservationPayload(identity)); err != nil {
|
|
log.Printf("mesh self-observation failed: %v", err)
|
|
} else {
|
|
log.Printf("mesh self-observation sent: node_id=%s cluster_id=%s scopes=%v", identity.NodeID, identity.ClusterID, flags.AppliedScopes)
|
|
}
|
|
if err := refreshRendezvousLeasesIfNeeded(ctx, cfg, identity, api, meshState, time.Now().UTC()); err != nil {
|
|
log.Printf("mesh rendezvous lease refresh failed: %v", err)
|
|
}
|
|
if err := refreshSyntheticMeshConfigIfDue(ctx, cfg, identity, api, meshState, time.Now().UTC()); err != nil {
|
|
log.Printf("mesh synthetic config refresh failed: %v", err)
|
|
}
|
|
if err := reportSyntheticRouteHealth(ctx, cfg, api, identity, meshState); err != nil {
|
|
log.Printf("mesh synthetic route health failed: %v", err)
|
|
}
|
|
if err := probeWarmPeerHealth(ctx, api, identity, meshState); err != nil {
|
|
log.Printf("mesh warm peer health failed: %v", err)
|
|
}
|
|
}
|
|
select {
|
|
case <-ctx.Done():
|
|
log.Print("node-agent stopped")
|
|
return
|
|
case <-ticker.C:
|
|
}
|
|
}
|
|
}
|
|
|
|
// joinRequestEnvelope is the minimal view of the join request object returned
// by enrollment; parseJoinRequestID decodes into it to extract the request ID.
type joinRequestEnvelope struct {
	// ID is read from the "id" key of the JSON object.
	ID string `json:"id"`
}
|
|
|
|
// nodeApprovalAuthorityPayload is the decoded form of the authority payload
// carried by a node bootstrap. verifyEnrollmentBootstrap unmarshals the signed
// payload into this struct and compares it against the bootstrap contract and
// the local identity.
type nodeApprovalAuthorityPayload struct {
	// SchemaVersion must equal "rap.cluster.node_approval.v1" to be accepted.
	SchemaVersion string `json:"schema_version"`

	// Identity of the approval: cluster, join request and resulting node.
	ClusterID       string `json:"cluster_id"`
	JoinRequestID   string `json:"join_request_id"`
	NodeID          string `json:"node_id"`
	NodeFingerprint string `json:"node_fingerprint"`
	IdentityStatus  string `json:"identity_status"`

	HeartbeatEndpoint string    `json:"heartbeat_endpoint"`
	ApprovedByUserID  string    `json:"approved_by_user_id"`
	IssuedAt          time.Time `json:"issued_at"`

	// The verifier requires ControlPlaneOnly to be true and
	// ProductionForwarding to be false.
	ControlPlaneOnly     bool `json:"control_plane_only"`
	ProductionForwarding bool `json:"production_forwarding"`
}
|
|
|
|
func ensureApprovedIdentity(ctx context.Context, cfg config.Config, identity state.Identity, api *client.Client) (state.Identity, error) {
|
|
clusterID := firstNonEmpty(identity.ClusterID, cfg.ClusterID)
|
|
if clusterID == "" {
|
|
return state.Identity{}, fmt.Errorf("cluster ID is required for enrollment")
|
|
}
|
|
identity.ClusterID = clusterID
|
|
if identity.PendingJoinRequestID == "" {
|
|
if cfg.JoinToken == "" {
|
|
return state.Identity{}, fmt.Errorf("join token is required for first enrollment")
|
|
}
|
|
response, err := api.Enroll(ctx, agent.EnrollmentPayload(clusterID, cfg.JoinToken, identity))
|
|
if err != nil {
|
|
return state.Identity{}, fmt.Errorf("enroll node: %w", err)
|
|
}
|
|
joinRequestID, err := parseJoinRequestID(response.JoinRequest)
|
|
if err != nil {
|
|
return state.Identity{}, err
|
|
}
|
|
identity, err = state.MarkEnrollmentSubmitted(cfg.StateDir, clusterID, joinRequestID)
|
|
if err != nil {
|
|
return state.Identity{}, fmt.Errorf("persist pending enrollment: %w", err)
|
|
}
|
|
log.Printf("enrollment submitted: status=%s join_request_id=%s identity_file=%s", response.Status, joinRequestID, filepath.Join(cfg.StateDir, state.FileName))
|
|
}
|
|
|
|
deadline := time.Time{}
|
|
if cfg.EnrollmentPollTimeout > 0 {
|
|
deadline = time.Now().UTC().Add(cfg.EnrollmentPollTimeout)
|
|
}
|
|
for {
|
|
response, err := api.BootstrapEnrollment(ctx, identity.PendingJoinRequestID, client.EnrollmentBootstrapRequest{
|
|
ClusterID: clusterID,
|
|
NodeFingerprint: identity.NodeFingerprint,
|
|
PublicKey: identity.PublicKey,
|
|
})
|
|
if err == nil {
|
|
switch response.Status {
|
|
case "approved":
|
|
if response.Bootstrap == nil {
|
|
return state.Identity{}, fmt.Errorf("approved enrollment missing bootstrap contract")
|
|
}
|
|
if err := verifyEnrollmentBootstrap(*response.Bootstrap, identity, cfg); err != nil {
|
|
return state.Identity{}, err
|
|
}
|
|
approved, err := state.MarkApprovedWithAuthority(
|
|
cfg.StateDir,
|
|
response.Bootstrap.NodeID,
|
|
response.Bootstrap.ClusterID,
|
|
response.Bootstrap.IdentityStatus,
|
|
response.Bootstrap.ClusterAuthority.PublicKey,
|
|
response.Bootstrap.ClusterAuthority.PublicKeyFingerprint,
|
|
)
|
|
if err != nil {
|
|
return state.Identity{}, fmt.Errorf("persist approved identity: %w", err)
|
|
}
|
|
log.Printf("enrollment approved: node_id=%s cluster_id=%s authority=%s", approved.NodeID, approved.ClusterID, approved.ClusterAuthorityFingerprint)
|
|
return approved, nil
|
|
case "rejected", "cancelled":
|
|
return state.Identity{}, fmt.Errorf("enrollment %s", response.Status)
|
|
default:
|
|
log.Printf("enrollment waiting for approval: status=%s join_request_id=%s", response.Status, identity.PendingJoinRequestID)
|
|
}
|
|
} else {
|
|
log.Printf("enrollment bootstrap poll failed: %v", err)
|
|
}
|
|
if cfg.EnrollmentPollTimeout > 0 && !deadline.IsZero() && !time.Now().UTC().Before(deadline) {
|
|
return identity, nil
|
|
}
|
|
select {
|
|
case <-ctx.Done():
|
|
return state.Identity{}, ctx.Err()
|
|
case <-time.After(cfg.EnrollmentPollInterval):
|
|
}
|
|
}
|
|
}
|
|
|
|
func parseJoinRequestID(raw json.RawMessage) (string, error) {
|
|
var envelope joinRequestEnvelope
|
|
if err := json.Unmarshal(raw, &envelope); err != nil {
|
|
return "", fmt.Errorf("decode join request: %w", err)
|
|
}
|
|
if strings.TrimSpace(envelope.ID) == "" {
|
|
return "", fmt.Errorf("join request id missing from enrollment response")
|
|
}
|
|
return strings.TrimSpace(envelope.ID), nil
|
|
}
|
|
|
|
func verifyEnrollmentBootstrap(bootstrap client.NodeBootstrap, identity state.Identity, cfg config.Config) error {
|
|
if bootstrap.ClusterAuthority == nil {
|
|
return fmt.Errorf("node bootstrap missing cluster authority")
|
|
}
|
|
if bootstrap.AuthoritySignature == nil || rawMessageEmpty(bootstrap.AuthorityPayload) {
|
|
return fmt.Errorf("node bootstrap missing authority payload or signature")
|
|
}
|
|
if bootstrap.ClusterID != identity.ClusterID || bootstrap.NodeID == "" || bootstrap.IdentityStatus == "" {
|
|
return fmt.Errorf("node bootstrap identity mismatch")
|
|
}
|
|
if bootstrap.ClusterAuthority.SchemaVersion != authority.AuthoritySchemaVersion ||
|
|
bootstrap.ClusterAuthority.ClusterID != bootstrap.ClusterID ||
|
|
bootstrap.ClusterAuthority.KeyAlgorithm != authority.AlgorithmEd25519 {
|
|
return fmt.Errorf("node bootstrap cluster authority descriptor mismatch")
|
|
}
|
|
if bootstrap.AuthoritySignature.KeyFingerprint != bootstrap.ClusterAuthority.PublicKeyFingerprint {
|
|
return fmt.Errorf("node bootstrap authority fingerprint mismatch")
|
|
}
|
|
if pinned := firstNonEmpty(identity.ClusterAuthorityFingerprint, cfg.ClusterAuthorityFingerprint); pinned != "" && pinned != bootstrap.ClusterAuthority.PublicKeyFingerprint {
|
|
return fmt.Errorf("node bootstrap pinned authority fingerprint mismatch")
|
|
}
|
|
if pinned := firstNonEmpty(identity.ClusterAuthorityPublicKey, cfg.ClusterAuthorityPublicKey); pinned != "" && pinned != bootstrap.ClusterAuthority.PublicKey {
|
|
return fmt.Errorf("node bootstrap pinned authority public key mismatch")
|
|
}
|
|
signature := authority.Signature{
|
|
SchemaVersion: bootstrap.AuthoritySignature.SchemaVersion,
|
|
Algorithm: bootstrap.AuthoritySignature.Algorithm,
|
|
KeyFingerprint: bootstrap.AuthoritySignature.KeyFingerprint,
|
|
Signature: bootstrap.AuthoritySignature.Signature,
|
|
}
|
|
if err := authority.VerifyRaw(bootstrap.ClusterAuthority.PublicKey, bootstrap.AuthorityPayload, signature); err != nil {
|
|
return fmt.Errorf("verify node bootstrap authority signature: %w", err)
|
|
}
|
|
var payload nodeApprovalAuthorityPayload
|
|
if err := json.Unmarshal(bootstrap.AuthorityPayload, &payload); err != nil {
|
|
return fmt.Errorf("decode node bootstrap authority payload: %w", err)
|
|
}
|
|
if payload.SchemaVersion != "rap.cluster.node_approval.v1" ||
|
|
payload.ClusterID != bootstrap.ClusterID ||
|
|
payload.NodeID != bootstrap.NodeID ||
|
|
payload.NodeFingerprint != identity.NodeFingerprint ||
|
|
payload.IdentityStatus != bootstrap.IdentityStatus ||
|
|
payload.HeartbeatEndpoint != bootstrap.HeartbeatEndpoint ||
|
|
!payload.ControlPlaneOnly ||
|
|
payload.ProductionForwarding {
|
|
return fmt.Errorf("node bootstrap authority payload mismatch")
|
|
}
|
|
if identity.PendingJoinRequestID != "" && payload.JoinRequestID != identity.PendingJoinRequestID {
|
|
return fmt.Errorf("node bootstrap authority payload join request mismatch")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
type syntheticMeshState struct {
|
|
Runtime *mesh.SyntheticRuntime
|
|
Routes []mesh.SyntheticRoute
|
|
RouteHealthRoutes []mesh.SyntheticRoute
|
|
Source string
|
|
PeerCache *mesh.PeerCache
|
|
RendezvousLeases []mesh.PeerRendezvousLease
|
|
RoutePathDecisions *client.RoutePathDecisionReport
|
|
ServiceChannelFeedback *client.FabricServiceChannelFeedbackReport
|
|
ServiceChannelAdaptivePolicy *client.FabricServiceChannelAdaptivePolicy
|
|
ServiceChannelRemediationCommands []client.FabricServiceChannelRemediationCommand
|
|
RouteGenerationTracker *meshRouteGenerationTracker
|
|
ConfigVersion string
|
|
PeerDirectoryVersion string
|
|
PolicyVersion string
|
|
PeerConnections *mesh.PeerConnectionTracker
|
|
PeerConnectionManager *mesh.PeerConnectionManager
|
|
LastPeerRecoveryPlan *mesh.PeerRecoveryPlan
|
|
LastPeerConnectionIntent *mesh.PeerConnectionIntentPlan
|
|
LastConfigRefreshAt time.Time
|
|
LastLeaseRefresh *meshRendezvousLeaseRefreshState
|
|
LeaseRefreshAttempts int
|
|
LeaseRefreshSuccesses int
|
|
LeaseRefreshFailures int
|
|
LastRouteHealthRefresh *meshRouteHealthFeedbackRefreshState
|
|
RouteHealthRefreshAttempts int
|
|
RouteHealthRefreshSuccesses int
|
|
RouteHealthRefreshFailures int
|
|
RouteHealthRefreshSuppressed int
|
|
ProductionObservationSink *mesh.ProductionEnvelopeObservationSink
|
|
ProductionForwardTransport mesh.ProductionForwardTransport
|
|
ProductionForwardingEnabled bool
|
|
VPNFabricInbox *vpnruntime.FabricPacketInbox
|
|
VPNFabricIngress *vpnruntime.FabricClientPacketIngress
|
|
VPNFabricSessionPeers *mesh.FabricSessionPeerManager
|
|
VPNFabricTransport *mesh.WebSocketFabricTransport
|
|
VPNFabricQUICTransport *mesh.QUICFabricTransport
|
|
VPNFabricSessionDialStats *vpnFabricSessionDialStats
|
|
VPNFabricEndpointObservations *vpnFabricEndpointObservationStore
|
|
PeerEndpoints map[string]string
|
|
PeerEndpointCandidates map[string][]mesh.PeerEndpointCandidate
|
|
PeerEndpointObservations map[string]mesh.EndpointCandidateHealthObservation
|
|
VPNGateway *vpnruntime.Gateway
|
|
ServiceChannelAccessStats *fabricServiceChannelAccessStats
|
|
RemoteWorkspaceFrameSink *mesh.RemoteWorkspaceFrameProbeSink
|
|
LastProductionSinkMetrics *mesh.ProductionEnvelopeObservationSinkMetrics
|
|
ListenerReport meshListenerReport
|
|
ListenerConfigKey string
|
|
ListenerRuntimeConfig config.Config
|
|
ListenerHandler *dynamicHTTPHandler
|
|
StopListener func()
|
|
QUICFabricServer *mesh.QUICFabricServer
|
|
QUICFabricListenAddr string
|
|
QUICFabricCertSHA256 string
|
|
QUICFabricError string
|
|
ConfigLoadError string
|
|
}
|
|
|
|
// fabricServiceChannelAccessStats accumulates lock-free counters and "last
// seen" values for fabric service channel access log entries. Observe updates
// it and Report renders it; every atomic.Value field only ever holds a string.
// The zero value is ready to use. Values must not be copied after first use.
type fabricServiceChannelAccessStats struct {
	// Total counts every observed entry; the next three split by AcceptedBy.
	Total          atomic.Int64
	Signed         atomic.Int64
	Introspection  atomic.Int64
	LegacyUnsigned atomic.Int64

	// Backend fallback and violation counters.
	BackendFallback        atomic.Int64
	BackendFallbackBlocked atomic.Int64
	FabricRouteSendFailure atomic.Int64

	// DataPlaneContract counts entries whose DataPlaneValid flag was set.
	DataPlaneContract atomic.Int64

	// LastAcceptedUnixSec is the Unix time (seconds) of the latest entry.
	LastAcceptedUnixSec atomic.Int64

	// Data-plane details of the latest entry with DataPlaneValid set.
	LastDataPlaneMode   atomic.Value
	LastWorkingData     atomic.Value
	LastSteadyState     atomic.Value
	LastBackendRelay    atomic.Value
	LastLogicalFlowMode atomic.Value

	// Most recent non-empty violation status and its reason.
	LastViolationStatus atomic.Value
	LastViolationReason atomic.Value
}
|
|
|
|
// vpnFabricSessionDialStats accumulates lock-free counters describing VPN
// fabric session dialing: selections, candidate failures by reason, and the
// most recent transport/endpoint/failure. Every atomic.Value field only ever
// holds a string. The zero value is ready to use. Values must not be copied
// after first use.
type vpnFabricSessionDialStats struct {
	// Overall attempt and selection counts.
	Attempts atomic.Int64
	Selected atomic.Int64

	// CandidateFailures counts every candidate failure; the following
	// counters split it by the reported reason.
	CandidateFailures   atomic.Int64
	TransportFailures   atomic.Int64
	SessionOpenFailures atomic.Int64
	StreamOpenFailures  atomic.Int64
	CapacityLimited     atomic.Int64
	AllCandidatesFailed atomic.Int64

	// Selections split by transport kind, plus pinned-certificate selections.
	QUICSelected       atomic.Int64
	WebSocketSelected  atomic.Int64
	LegacySelected     atomic.Int64
	PinnedCertSelected atomic.Int64

	// Latest selected target, capacity-limited target and failure reason.
	LastTransport         atomic.Value
	LastEndpoint          atomic.Value
	LastCapacityEndpoint  atomic.Value
	LastCapacityTransport atomic.Value
	LastFailureReason     atomic.Value

	// Unix timestamps (seconds) of the latest event of each kind.
	LastSelectedUnixSec atomic.Int64
	LastCapacityUnixSec atomic.Int64
	LastFailureUnixSec  atomic.Int64
}
|
|
|
|
type vpnFabricEndpointObservationStore struct {
|
|
reporterNodeID string
|
|
mu sync.Mutex
|
|
observations map[string]mesh.EndpointCandidateHealthObservation
|
|
}
|
|
|
|
func newVPNFabricSessionDialStats() *vpnFabricSessionDialStats {
|
|
return &vpnFabricSessionDialStats{}
|
|
}
|
|
|
|
func newVPNFabricEndpointObservationStore(reporterNodeID ...string) *vpnFabricEndpointObservationStore {
|
|
nodeID := ""
|
|
if len(reporterNodeID) > 0 {
|
|
nodeID = strings.TrimSpace(reporterNodeID[0])
|
|
}
|
|
return &vpnFabricEndpointObservationStore{
|
|
reporterNodeID: nodeID,
|
|
observations: map[string]mesh.EndpointCandidateHealthObservation{},
|
|
}
|
|
}
|
|
|
|
func (s *vpnFabricEndpointObservationStore) Snapshot() map[string]mesh.EndpointCandidateHealthObservation {
|
|
if s == nil {
|
|
return nil
|
|
}
|
|
s.mu.Lock()
|
|
defer s.mu.Unlock()
|
|
s.pruneLocked(time.Now().UTC(), vpnFabricEndpointObservationMaxAge, maxVPNFabricEndpointObservationEntries)
|
|
out := make(map[string]mesh.EndpointCandidateHealthObservation, len(s.observations))
|
|
for key, value := range s.observations {
|
|
out[key] = value
|
|
}
|
|
return out
|
|
}
|
|
|
|
func (s *vpnFabricEndpointObservationStore) pruneLocked(now time.Time, maxAge time.Duration, maxEntries int) {
|
|
if s == nil || len(s.observations) == 0 {
|
|
return
|
|
}
|
|
if !now.IsZero() && maxAge > 0 {
|
|
for endpointID, observation := range s.observations {
|
|
if !observation.ObservedAt.IsZero() && now.Sub(observation.ObservedAt.UTC()) > maxAge {
|
|
delete(s.observations, endpointID)
|
|
}
|
|
}
|
|
}
|
|
if maxEntries <= 0 || len(s.observations) <= maxEntries {
|
|
return
|
|
}
|
|
values := make([]mesh.EndpointCandidateHealthObservation, 0, len(s.observations))
|
|
for _, observation := range s.observations {
|
|
values = append(values, observation)
|
|
}
|
|
sort.SliceStable(values, func(i, j int) bool {
|
|
if !values[i].ObservedAt.Equal(values[j].ObservedAt) {
|
|
return values[i].ObservedAt.After(values[j].ObservedAt)
|
|
}
|
|
return values[i].EndpointID < values[j].EndpointID
|
|
})
|
|
keep := make(map[string]struct{}, maxEntries)
|
|
for _, observation := range values[:maxEntries] {
|
|
keep[observation.EndpointID] = struct{}{}
|
|
}
|
|
for endpointID := range s.observations {
|
|
if _, ok := keep[endpointID]; !ok {
|
|
delete(s.observations, endpointID)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (s *vpnFabricEndpointObservationStore) Report(observedAt time.Time, maxEntries int) map[string]any {
|
|
snapshot := s.Snapshot()
|
|
if len(snapshot) == 0 {
|
|
return map[string]any{
|
|
"schema_version": "rap.vpn_fabric_endpoint_health_report.v1",
|
|
"reporter_node_id": s.reporterNodeID,
|
|
"observed_at": observedAt.UTC().Format(time.RFC3339Nano),
|
|
"total": 0,
|
|
"reported": 0,
|
|
"dropped": 0,
|
|
"observations": []mesh.EndpointCandidateHealthObservation{},
|
|
}
|
|
}
|
|
values := make([]mesh.EndpointCandidateHealthObservation, 0, len(snapshot))
|
|
for _, observation := range snapshot {
|
|
values = append(values, observation)
|
|
}
|
|
sort.SliceStable(values, func(i, j int) bool {
|
|
if !values[i].ObservedAt.Equal(values[j].ObservedAt) {
|
|
return values[i].ObservedAt.After(values[j].ObservedAt)
|
|
}
|
|
return values[i].EndpointID < values[j].EndpointID
|
|
})
|
|
if maxEntries <= 0 || maxEntries > len(values) {
|
|
maxEntries = len(values)
|
|
}
|
|
reported := values[:maxEntries]
|
|
return map[string]any{
|
|
"schema_version": "rap.vpn_fabric_endpoint_health_report.v1",
|
|
"reporter_node_id": s.reporterNodeID,
|
|
"observed_at": observedAt.UTC().Format(time.RFC3339Nano),
|
|
"total": len(values),
|
|
"reported": len(reported),
|
|
"dropped": len(values) - len(reported),
|
|
"observations": reported,
|
|
}
|
|
}
|
|
|
|
func (s *vpnFabricEndpointObservationStore) ObserveSuccess(endpointID string, latency time.Duration) {
|
|
if s == nil || strings.TrimSpace(endpointID) == "" {
|
|
return
|
|
}
|
|
s.mu.Lock()
|
|
defer s.mu.Unlock()
|
|
observation := s.observations[endpointID]
|
|
observation.EndpointID = endpointID
|
|
observation.Source = "local_vpn_fabric_session"
|
|
observation.ReporterNodeID = s.reporterNodeID
|
|
observation.SuccessCount++
|
|
observation.LastLatencyMs = latency.Milliseconds()
|
|
observation.ReliabilityScore = 100
|
|
observation.LastFailureReason = ""
|
|
observation.ObservedAt = time.Now().UTC()
|
|
s.observations[endpointID] = observation
|
|
s.pruneLocked(observation.ObservedAt, vpnFabricEndpointObservationMaxAge, maxVPNFabricEndpointObservationEntries)
|
|
}
|
|
|
|
func (s *vpnFabricEndpointObservationStore) ObserveFailure(endpointID string, reason string) {
|
|
if s == nil || strings.TrimSpace(endpointID) == "" {
|
|
return
|
|
}
|
|
s.mu.Lock()
|
|
defer s.mu.Unlock()
|
|
observation := s.observations[endpointID]
|
|
observation.EndpointID = endpointID
|
|
observation.Source = "local_vpn_fabric_session"
|
|
observation.ReporterNodeID = s.reporterNodeID
|
|
observation.FailureCount++
|
|
observation.LastFailureReason = strings.TrimSpace(reason)
|
|
observation.ReliabilityScore = 35
|
|
observation.ObservedAt = time.Now().UTC()
|
|
s.observations[endpointID] = observation
|
|
s.pruneLocked(observation.ObservedAt, vpnFabricEndpointObservationMaxAge, maxVPNFabricEndpointObservationEntries)
|
|
}
|
|
|
|
func (s *vpnFabricEndpointObservationStore) ObserveCapacity(endpointID string) {
|
|
if s == nil || strings.TrimSpace(endpointID) == "" {
|
|
return
|
|
}
|
|
s.mu.Lock()
|
|
defer s.mu.Unlock()
|
|
observation := s.observations[endpointID]
|
|
observation.EndpointID = endpointID
|
|
observation.Source = "local_vpn_fabric_session"
|
|
observation.ReporterNodeID = s.reporterNodeID
|
|
observation.LastFailureReason = "capacity_limited"
|
|
if observation.ReliabilityScore <= 0 {
|
|
observation.ReliabilityScore = 90
|
|
}
|
|
observation.ObservedAt = time.Now().UTC()
|
|
s.observations[endpointID] = observation
|
|
s.pruneLocked(observation.ObservedAt, vpnFabricEndpointObservationMaxAge, maxVPNFabricEndpointObservationEntries)
|
|
}
|
|
|
|
// fabricTransportLabelIsQUIC reports whether label names a QUIC transport.
// The comparison ignores surrounding whitespace and letter case; accepted
// labels are "quic", "direct_quic", "udp_quic" and "quic_udp".
func fabricTransportLabelIsQUIC(label string) bool {
	normalized := strings.ToLower(strings.TrimSpace(label))
	return normalized == "quic" ||
		normalized == "direct_quic" ||
		normalized == "udp_quic" ||
		normalized == "quic_udp"
}
|
|
|
|
// fabricTransportLabelIsWebSocket reports whether label names a WebSocket
// style transport. The comparison ignores surrounding whitespace and letter
// case; accepted labels are "websocket", "ws", "wss", "direct_http",
// "direct_https" and "direct_tcp_tls".
func fabricTransportLabelIsWebSocket(label string) bool {
	normalized := strings.ToLower(strings.TrimSpace(label))
	return normalized == "websocket" ||
		normalized == "ws" ||
		normalized == "wss" ||
		normalized == "direct_http" ||
		normalized == "direct_https" ||
		normalized == "direct_tcp_tls"
}
|
|
|
|
func (s *vpnFabricSessionDialStats) ObserveCandidateFailure(reason string) {
|
|
if s == nil {
|
|
return
|
|
}
|
|
s.CandidateFailures.Add(1)
|
|
switch strings.TrimSpace(reason) {
|
|
case "transport_select_failed":
|
|
s.TransportFailures.Add(1)
|
|
case "session_open_failed":
|
|
s.SessionOpenFailures.Add(1)
|
|
case "stream_open_failed":
|
|
s.StreamOpenFailures.Add(1)
|
|
case "capacity_limited":
|
|
s.CapacityLimited.Add(1)
|
|
}
|
|
s.LastFailureReason.Store(strings.TrimSpace(reason))
|
|
s.LastFailureUnixSec.Store(time.Now().UTC().Unix())
|
|
}
|
|
|
|
func (s *vpnFabricSessionDialStats) ObserveCapacityLimited(target mesh.FabricTransportTarget) {
|
|
if s == nil {
|
|
return
|
|
}
|
|
s.ObserveCandidateFailure("capacity_limited")
|
|
s.LastCapacityEndpoint.Store(strings.TrimSpace(target.Endpoint))
|
|
transport := strings.TrimSpace(target.Transport)
|
|
if transport == "" {
|
|
transport = "legacy_peer_endpoint"
|
|
}
|
|
s.LastCapacityTransport.Store(transport)
|
|
s.LastCapacityUnixSec.Store(time.Now().UTC().Unix())
|
|
}
|
|
|
|
func (s *vpnFabricSessionDialStats) ObserveAllCandidatesFailed() {
|
|
if s == nil {
|
|
return
|
|
}
|
|
s.AllCandidatesFailed.Add(1)
|
|
s.LastFailureReason.Store("all_candidates_failed")
|
|
s.LastFailureUnixSec.Store(time.Now().UTC().Unix())
|
|
}
|
|
|
|
func (s *vpnFabricSessionDialStats) ObserveSelected(target mesh.FabricTransportTarget) {
|
|
if s == nil {
|
|
return
|
|
}
|
|
s.Selected.Add(1)
|
|
transport := strings.TrimSpace(target.Transport)
|
|
if transport == "" {
|
|
transport = "legacy_peer_endpoint"
|
|
}
|
|
s.LastTransport.Store(transport)
|
|
s.LastEndpoint.Store(strings.TrimSpace(target.Endpoint))
|
|
s.LastSelectedUnixSec.Store(time.Now().UTC().Unix())
|
|
switch {
|
|
case fabricTransportLabelIsQUIC(transport):
|
|
s.QUICSelected.Add(1)
|
|
case transport == "legacy_peer_endpoint":
|
|
s.LegacySelected.Add(1)
|
|
case fabricTransportLabelIsWebSocket(transport):
|
|
s.WebSocketSelected.Add(1)
|
|
}
|
|
if strings.TrimSpace(target.PeerCertSHA256) != "" {
|
|
s.PinnedCertSelected.Add(1)
|
|
}
|
|
}
|
|
|
|
func (s *vpnFabricSessionDialStats) Report(observedAt time.Time) map[string]any {
|
|
if s == nil {
|
|
return nil
|
|
}
|
|
report := map[string]any{
|
|
"schema_version": "rap.vpn_fabric_session_dial_stats.v1",
|
|
"observed_at": observedAt.UTC().Format(time.RFC3339Nano),
|
|
"attempts": s.Attempts.Load(),
|
|
"selected": s.Selected.Load(),
|
|
"candidate_failures": s.CandidateFailures.Load(),
|
|
"transport_failures": s.TransportFailures.Load(),
|
|
"session_open_failures": s.SessionOpenFailures.Load(),
|
|
"stream_open_failures": s.StreamOpenFailures.Load(),
|
|
"capacity_limited": s.CapacityLimited.Load(),
|
|
"all_candidates_failed": s.AllCandidatesFailed.Load(),
|
|
"quic_selected": s.QUICSelected.Load(),
|
|
"websocket_selected": s.WebSocketSelected.Load(),
|
|
"legacy_selected": s.LegacySelected.Load(),
|
|
"pinned_cert_selected": s.PinnedCertSelected.Load(),
|
|
"last_selected_unix_sec": s.LastSelectedUnixSec.Load(),
|
|
"last_capacity_unix_sec": s.LastCapacityUnixSec.Load(),
|
|
"last_failure_unix_sec": s.LastFailureUnixSec.Load(),
|
|
}
|
|
if value, ok := s.LastTransport.Load().(string); ok && value != "" {
|
|
report["last_transport"] = value
|
|
}
|
|
if value, ok := s.LastEndpoint.Load().(string); ok && value != "" {
|
|
report["last_endpoint"] = value
|
|
}
|
|
if value, ok := s.LastCapacityEndpoint.Load().(string); ok && value != "" {
|
|
report["last_capacity_endpoint"] = value
|
|
}
|
|
if value, ok := s.LastCapacityTransport.Load().(string); ok && value != "" {
|
|
report["last_capacity_transport"] = value
|
|
}
|
|
if value, ok := s.LastFailureReason.Load().(string); ok && value != "" {
|
|
report["last_failure_reason"] = value
|
|
}
|
|
return report
|
|
}
|
|
|
|
func newFabricServiceChannelAccessStats() *fabricServiceChannelAccessStats {
|
|
return &fabricServiceChannelAccessStats{}
|
|
}
|
|
|
|
func (s *fabricServiceChannelAccessStats) Observe(entry mesh.FabricServiceChannelAccessLogEntry) {
|
|
if s == nil {
|
|
return
|
|
}
|
|
s.Total.Add(1)
|
|
switch strings.TrimSpace(entry.AcceptedBy) {
|
|
case "signed":
|
|
s.Signed.Add(1)
|
|
case "introspection":
|
|
s.Introspection.Add(1)
|
|
case "legacy_unsigned":
|
|
s.LegacyUnsigned.Add(1)
|
|
}
|
|
if entry.ForceBackendFallback && strings.TrimSpace(entry.BackendRelayPolicy) != "disabled" {
|
|
s.BackendFallback.Add(1)
|
|
}
|
|
switch strings.TrimSpace(entry.ViolationStatus) {
|
|
case "backend_fallback_blocked_by_policy":
|
|
s.BackendFallbackBlocked.Add(1)
|
|
case "fabric_route_send_failed_backend_fallback_blocked":
|
|
s.BackendFallbackBlocked.Add(1)
|
|
s.FabricRouteSendFailure.Add(1)
|
|
}
|
|
if strings.TrimSpace(entry.ViolationStatus) != "" {
|
|
s.LastViolationStatus.Store(strings.TrimSpace(entry.ViolationStatus))
|
|
s.LastViolationReason.Store(strings.TrimSpace(entry.ViolationReason))
|
|
}
|
|
if entry.DataPlaneValid {
|
|
s.DataPlaneContract.Add(1)
|
|
s.LastDataPlaneMode.Store(strings.TrimSpace(entry.DataPlaneMode))
|
|
s.LastWorkingData.Store(strings.TrimSpace(entry.WorkingDataTransport))
|
|
s.LastSteadyState.Store(strings.TrimSpace(entry.SteadyStateTransport))
|
|
s.LastBackendRelay.Store(strings.TrimSpace(entry.BackendRelayPolicy))
|
|
s.LastLogicalFlowMode.Store(strings.TrimSpace(entry.LogicalFlowMode))
|
|
}
|
|
occurredAt := entry.OccurredAt
|
|
if occurredAt.IsZero() {
|
|
occurredAt = time.Now().UTC()
|
|
}
|
|
s.LastAcceptedUnixSec.Store(occurredAt.Unix())
|
|
}
|
|
|
|
func (s *fabricServiceChannelAccessStats) Report(observedAt time.Time) map[string]any {
|
|
if s == nil {
|
|
return nil
|
|
}
|
|
if observedAt.IsZero() {
|
|
observedAt = time.Now().UTC()
|
|
}
|
|
report := map[string]any{
|
|
"schema_version": "c18z52.fabric_service_channel_access_report.v1",
|
|
"observed_at": observedAt.UTC().Format(time.RFC3339Nano),
|
|
"total": s.Total.Load(),
|
|
"signed": s.Signed.Load(),
|
|
"introspection": s.Introspection.Load(),
|
|
"legacy_unsigned": s.LegacyUnsigned.Load(),
|
|
"backend_fallback": s.BackendFallback.Load(),
|
|
"backend_fallback_blocked": s.BackendFallbackBlocked.Load(),
|
|
"fabric_route_send_failure": s.FabricRouteSendFailure.Load(),
|
|
"data_plane_contract": s.DataPlaneContract.Load(),
|
|
"accepted_by_signed": s.Signed.Load(),
|
|
"accepted_by_introspection": s.Introspection.Load(),
|
|
"accepted_by_legacy_unsigned": s.LegacyUnsigned.Load(),
|
|
}
|
|
if value, ok := s.LastDataPlaneMode.Load().(string); ok && value != "" {
|
|
report["last_data_plane_mode"] = value
|
|
}
|
|
if value, ok := s.LastWorkingData.Load().(string); ok && value != "" {
|
|
report["last_working_data_transport"] = value
|
|
}
|
|
if value, ok := s.LastSteadyState.Load().(string); ok && value != "" {
|
|
report["last_steady_state_transport"] = value
|
|
}
|
|
if value, ok := s.LastBackendRelay.Load().(string); ok && value != "" {
|
|
report["last_backend_relay_policy"] = value
|
|
}
|
|
if value, ok := s.LastLogicalFlowMode.Load().(string); ok && value != "" {
|
|
report["last_logical_flow_mode"] = value
|
|
}
|
|
if value, ok := s.LastViolationStatus.Load().(string); ok && value != "" {
|
|
report["last_data_plane_violation_status"] = value
|
|
}
|
|
if value, ok := s.LastViolationReason.Load().(string); ok && value != "" {
|
|
report["last_data_plane_violation_reason"] = value
|
|
}
|
|
if last := s.LastAcceptedUnixSec.Load(); last > 0 {
|
|
report["last_accepted_at"] = time.Unix(last, 0).UTC().Format(time.RFC3339Nano)
|
|
}
|
|
return report
|
|
}
|
|
|
|
// dynamicHTTPHandler is an http.Handler whose delegate can be replaced while
// the server that owns it keeps running.
type dynamicHTTPHandler struct {
	// current always holds a dynamicHTTPHandlerEntry; see that type for why
	// the handler is not stored directly.
	current atomic.Value
}

// dynamicHTTPHandlerEntry gives every value stored in
// dynamicHTTPHandler.current the same concrete type. atomic.Value panics when
// consecutive Store calls use different concrete types, which would otherwise
// happen as soon as two different http.Handler implementations (for example a
// *http.ServeMux followed by an http.HandlerFunc) are installed.
type dynamicHTTPHandlerEntry struct {
	handler http.Handler
}

// newDynamicHTTPHandler returns a dynamicHTTPHandler that initially delegates
// to handler. A nil handler is treated as http.NotFoundHandler().
func newDynamicHTTPHandler(handler http.Handler) *dynamicHTTPHandler {
	out := &dynamicHTTPHandler{}
	out.Update(handler)
	return out
}

// Update atomically replaces the delegate. A nil handler installs
// http.NotFoundHandler(). Calling Update on a nil receiver is a no-op.
func (h *dynamicHTTPHandler) Update(handler http.Handler) {
	if h == nil {
		return
	}
	if handler == nil {
		handler = http.NotFoundHandler()
	}
	h.current.Store(dynamicHTTPHandlerEntry{handler: handler})
}

// ServeHTTP forwards the request to the currently installed delegate and
// answers 404 when the receiver is nil or no delegate has been installed.
func (h *dynamicHTTPHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	if h == nil {
		http.NotFound(w, r)
		return
	}
	entry, _ := h.current.Load().(dynamicHTTPHandlerEntry)
	if entry.handler == nil {
		http.NotFound(w, r)
		return
	}
	entry.handler.ServeHTTP(w, r)
}
|
|
|
|
// meshListenerReport is the JSON-serialisable status record for the inbound
// synthetic mesh listener. startSyntheticMeshHTTPServer fills it in with the
// configured and effective listen addresses, the bind outcome and the time of
// observation. Field order is the order in which keys are emitted as JSON.
type meshListenerReport struct {
	SchemaVersion         string `json:"schema_version"`
	ConfiguredListenAddr  string `json:"configured_listen_addr,omitempty"`
	EffectiveListenAddr   string `json:"effective_listen_addr,omitempty"`
	ListenPortMode        string `json:"listen_port_mode"`
	Status                string `json:"status"`
	InboundReachability   string `json:"inbound_reachability"`
	ControlPlaneReachable bool   `json:"control_plane_reachable"`
	OneWayConnectivity    bool   `json:"one_way_connectivity"`
	FailureReason         string `json:"failure_reason,omitempty"`
	FailureError          string `json:"failure_error,omitempty"`
	PortConflict          bool   `json:"port_conflict,omitempty"`
	AutoPortSelected      bool   `json:"auto_port_selected,omitempty"`
	ObservedAt            string `json:"observed_at"`
}
|
|
|
|
// meshOutboundSessionReport is the JSON shape of the outbound mesh session
// status record. It is populated outside this section of the file; field order
// is the order in which keys are emitted as JSON.
type meshOutboundSessionReport struct {
	SchemaVersion            string `json:"schema_version"`
	Status                   string `json:"status"`
	Direction                string `json:"direction"`
	Transport                string `json:"transport"`
	ControlPlaneURL          string `json:"control_plane_url,omitempty"`
	ConnectivityMode         string `json:"connectivity_mode,omitempty"`
	InboundListenerRequired  bool   `json:"inbound_listener_required"`
	UsableForInboundControl  bool   `json:"usable_for_inbound_control"`
	ListenerStatus           string `json:"listener_status,omitempty"`
	ListenerFailureReason    string `json:"listener_failure_reason,omitempty"`
	ListenerPortConflict     bool   `json:"listener_port_conflict,omitempty"`
	ConfigLoadError          string `json:"config_load_error,omitempty"`
	PeerConnectionReady      int    `json:"peer_connection_ready"`
	PeerConnectionRelayReady int    `json:"peer_connection_relay_ready"`
	PeerConnectionWaiting    int    `json:"peer_connection_waiting"`
	RendezvousLeaseCount     int    `json:"rendezvous_lease_count"`
	ProductionForwarding     bool   `json:"production_forwarding"`
	ServiceWorkloadTraffic   bool   `json:"service_workload_traffic"`
	ObservedAt               string `json:"observed_at"`
}
|
|
|
|
// meshRendezvousLeaseRefreshState records one rendezvous lease refresh
// attempt: its outcome strings, when it was attempted and completed, the lease
// counters captured around it, and the config version involved. It is
// populated outside this section of the file.
type meshRendezvousLeaseRefreshState struct {
	Status, Reason, Error string

	AttemptedAt, CompletedAt time.Time

	PreviousLeaseCount, RefreshedLeaseCount, RefreshNeededCount, RenewalNeededCount, ExpiredCount, StaleRelayCount int

	ConfigVersion string
}
|
|
|
|
// meshRouteHealthFeedbackTrigger carries the route, peer, relay and link
// details, plus a drift flag and observation time, attached to a route-health
// feedback event. It is produced and consumed outside this section of the file.
type meshRouteHealthFeedbackTrigger struct {
	Reason, RouteID, PeerNodeID, SelectedRelayID, LinkStatus, FailureReason string

	DriftDetected bool

	ObservedAt time.Time
}
|
|
|
|
// meshRouteHealthFeedbackRefreshState records one config refresh tied to
// route-health feedback: outcome strings, attempt and completion times, the
// route/peer/relay/link details of the feedback, and the config versions and
// route-health route counts before and after. It is populated outside this
// section of the file.
type meshRouteHealthFeedbackRefreshState struct {
	Status, Reason, Error string

	AttemptedAt, CompletedAt time.Time

	RouteID, PeerNodeID, SelectedRelayID, LinkStatus, FailureReason string

	DriftDetected bool

	PreviousConfigVersion, RefreshedConfigVersion string

	PreviousRouteHealthRouteCount, RefreshedRouteHealthRouteCount int
}
|
|
|
|
type loadedSyntheticMeshConfig struct {
|
|
PeerEndpoints map[string]string
|
|
PeerEndpointCandidates map[string][]mesh.PeerEndpointCandidate
|
|
PeerEndpointObservations map[string]mesh.EndpointCandidateHealthObservation
|
|
PeerDirectory []mesh.PeerDirectoryEntry
|
|
RecoverySeeds []mesh.PeerRecoverySeed
|
|
RendezvousLeases []mesh.PeerRendezvousLease
|
|
RoutePathDecisions *client.RoutePathDecisionReport
|
|
ServiceChannelFeedback *client.FabricServiceChannelFeedbackReport
|
|
ServiceChannelRemediationCommands []client.FabricServiceChannelRemediationCommand
|
|
ServiceChannelAdaptivePolicy *client.FabricServiceChannelAdaptivePolicy
|
|
MeshListener *client.MeshListenerConfig
|
|
Routes []mesh.SyntheticRoute
|
|
Source string
|
|
ConfigVersion string
|
|
PeerDirectoryVersion string
|
|
PolicyVersion string
|
|
ProductionForwarding bool
|
|
}
|
|
|
|
func startSyntheticMeshEndpoint(ctx context.Context, _ context.CancelFunc, cfg config.Config, identity state.Identity, api *client.Client, vpnGateway *vpnruntime.Gateway) (*syntheticMeshState, func(), error) {
|
|
noop := func() {}
|
|
if !cfg.MeshSyntheticRuntimeEnabled {
|
|
return nil, noop, nil
|
|
}
|
|
local := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID}
|
|
loadedConfig, err := loadSyntheticMeshConfig(ctx, cfg, identity, api)
|
|
if err != nil {
|
|
log.Printf("synthetic mesh config load failed; starting diagnostics-only mesh state: %v", err)
|
|
loadedConfig = loadedSyntheticMeshConfig{
|
|
PeerEndpoints: map[string]string{},
|
|
PeerEndpointCandidates: map[string][]mesh.PeerEndpointCandidate{},
|
|
PeerEndpointObservations: map[string]mesh.EndpointCandidateHealthObservation{},
|
|
PeerDirectory: []mesh.PeerDirectoryEntry{},
|
|
RecoverySeeds: []mesh.PeerRecoverySeed{},
|
|
RendezvousLeases: []mesh.PeerRendezvousLease{},
|
|
Routes: []mesh.SyntheticRoute{},
|
|
Source: "config_load_failed",
|
|
}
|
|
}
|
|
peerEndpoints := loadedConfig.PeerEndpoints
|
|
routes := loadedConfig.Routes
|
|
productionForwardingEnabled := cfg.MeshProductionForwardingEnabled || loadedConfig.ProductionForwarding
|
|
routeHealthRoutes := routeHealthRoutesFromPathDecisions(routes, loadedConfig.RoutePathDecisions)
|
|
peerCache := mesh.NewPeerCache(mesh.PeerCacheConfig{
|
|
Local: local,
|
|
PeerEndpoints: loadedConfig.PeerEndpoints,
|
|
PeerEndpointCandidates: loadedConfig.PeerEndpointCandidates,
|
|
PeerEndpointObservations: loadedConfig.PeerEndpointObservations,
|
|
PeerDirectory: loadedConfig.PeerDirectory,
|
|
RecoverySeeds: loadedConfig.RecoverySeeds,
|
|
RendezvousLeases: loadedConfig.RendezvousLeases,
|
|
Routes: loadedConfig.Routes,
|
|
WarmPeerLimit: mesh.DefaultWarmPeerLimit,
|
|
PreferredRegion: cfg.MeshRegion,
|
|
Now: time.Now().UTC(),
|
|
})
|
|
peerCacheSnapshot := peerCache.Snapshot()
|
|
peerConnections := mesh.NewPeerConnectionTracker(peerCacheSnapshot, time.Now().UTC())
|
|
peerConnectionSnapshot := peerConnections.Snapshot()
|
|
peerRecoveryPlan := mesh.PlanPeerRecovery(mesh.PeerRecoveryPlanConfig{
|
|
PeerCache: peerCacheSnapshot,
|
|
Connections: peerConnectionSnapshot,
|
|
TargetReadyPeers: mesh.DefaultStablePeerTarget,
|
|
MaxProbeCandidates: mesh.DefaultRecoveryProbeLimit,
|
|
Now: time.Now().UTC(),
|
|
})
|
|
peerConnectionIntentPlan := mesh.PlanPeerConnectionIntents(mesh.PeerConnectionIntentPlanConfig{
|
|
PeerCache: peerCacheSnapshot,
|
|
RecoveryPlan: peerRecoveryPlan,
|
|
RendezvousLeases: loadedConfig.RendezvousLeases,
|
|
Now: time.Now().UTC(),
|
|
})
|
|
peerConnectionManager := mesh.NewPeerConnectionManager(mesh.PeerConnectionManagerConfig{
|
|
Local: local,
|
|
PeerCache: peerCache,
|
|
Tracker: peerConnections,
|
|
RendezvousLeases: loadedConfig.RendezvousLeases,
|
|
})
|
|
routeGenerationTracker := newMeshRouteGenerationTracker(loadedConfig.RoutePathDecisions, time.Now().UTC())
|
|
gateEnabled, runtimeEnabled := productionForwardingLogState(cfg, loadedConfig.ProductionForwarding)
|
|
log.Printf(
|
|
"synthetic mesh config loaded: source=%s node_id=%s cluster_id=%s peers=%d routes=%d peer_cache_peers=%d warm_peers=%d recovery_seeds=%d rendezvous_leases=%d peer_connection_states=%d peer_recovery_mode=%s peer_recovery_target_ready_peers=%d peer_connection_intents=%d rendezvous_required=%d rendezvous_resolved=%d production_forwarding_gate_enabled=%t production_forwarding_runtime_enabled=%t",
|
|
loadedConfig.Source,
|
|
identity.NodeID,
|
|
identity.ClusterID,
|
|
len(peerEndpoints),
|
|
len(routes),
|
|
peerCacheSnapshot.PeerCount,
|
|
peerCacheSnapshot.WarmPeerCount,
|
|
peerCacheSnapshot.RecoverySeedCount,
|
|
peerCacheSnapshot.RendezvousLeaseCount,
|
|
peerConnectionSnapshot.Total,
|
|
peerRecoveryPlan.Mode,
|
|
peerRecoveryPlan.TargetReadyPeers,
|
|
peerConnectionIntentPlan.IntentCount,
|
|
peerConnectionIntentPlan.RendezvousRequiredCount,
|
|
peerConnectionIntentPlan.RendezvousResolvedCount,
|
|
gateEnabled,
|
|
runtimeEnabled,
|
|
)
|
|
runtime := mesh.NewSyntheticRuntime(mesh.SyntheticRuntimeConfig{
|
|
Enabled: true,
|
|
Local: local,
|
|
Routes: routes,
|
|
RouteHealthRoutes: routeHealthRoutes,
|
|
Transport: mesh.NewHTTPPeerTransport(peerEndpoints),
|
|
Logger: func(entry mesh.SyntheticLogEntry) {
|
|
payload, err := json.Marshal(entry)
|
|
if err != nil {
|
|
log.Printf("mesh synthetic event marshal failed: %v", err)
|
|
return
|
|
}
|
|
log.Printf("mesh_synthetic_event=%s", string(payload))
|
|
},
|
|
})
|
|
productionObservationSink := productionEnvelopeObservationSinkFromConfig(cfg)
|
|
var productionEnvelopeObserver mesh.ProductionEnvelopeObserver
|
|
if productionObservationSink != nil {
|
|
productionEnvelopeObserver = productionObservationSink.Observe
|
|
}
|
|
var productionForwardTransport mesh.ProductionForwardTransport
|
|
if productionForwardingEnabled {
|
|
productionForwardTransport = mesh.NewHTTPProductionForwardTransport(peerEndpoints)
|
|
}
|
|
vpnFabricInbox := vpnruntime.NewFabricPacketInbox(4096)
|
|
serviceChannelAccessStats := newFabricServiceChannelAccessStats()
|
|
remoteWorkspaceFrameSink := mesh.NewRemoteWorkspaceFrameProbeSink()
|
|
vpnFabricIngress := &vpnruntime.FabricClientPacketIngress{
|
|
ForwardTransport: productionForwardTransport,
|
|
Inbox: vpnFabricInbox,
|
|
FlowScheduler: vpnruntime.NewFabricFlowScheduler(0, 0),
|
|
MaxParallelFlowSends: 4,
|
|
ClusterID: identity.ClusterID,
|
|
LocalNodeID: identity.NodeID,
|
|
LocalGateway: func(vpnConnectionID string) bool {
|
|
return vpnGateway != nil && vpnGateway.IsReadyForConnection(vpnConnectionID)
|
|
},
|
|
Routes: func() []mesh.SyntheticRoute {
|
|
return routes
|
|
},
|
|
}
|
|
initialRouteManagerAt := time.Now().UTC()
|
|
vpnFabricIngress.UpdateRouteManager(routeManagerDecisionsFromControlPlane(loadedConfig.RoutePathDecisions, loadedConfig.ServiceChannelRemediationCommands), loadedConfig.ConfigVersion, initialRouteManagerAt)
|
|
vpnFabricIngress.UpdateRouteQualityPreferences(routeQualityPreferencesFromServiceChannelFeedback(loadedConfig.ServiceChannelFeedback, initialRouteManagerAt), initialRouteManagerAt)
|
|
serverHandler := mesh.Server{
|
|
Local: local,
|
|
SyntheticRuntime: runtime,
|
|
ProductionForwardingEnabled: productionForwardingEnabled,
|
|
ProductionEnvelopeObserver: productionEnvelopeObserver,
|
|
ProductionEnvelopeDelivery: vpnFabricInbox.DeliverProductionEnvelope,
|
|
ProductionForwardTransport: productionForwardTransport,
|
|
ProductionForwardLogger: func(entry mesh.ProductionForwardLogEntry) {
|
|
payload, err := json.Marshal(entry)
|
|
if err != nil {
|
|
log.Printf("mesh production forward event marshal failed: %v", err)
|
|
return
|
|
}
|
|
log.Printf("mesh_production_forward_event=%s", string(payload))
|
|
},
|
|
FabricServiceChannelLogger: func(entry mesh.FabricServiceChannelAccessLogEntry) {
|
|
serviceChannelAccessStats.Observe(entry)
|
|
payload, err := json.Marshal(entry)
|
|
if err != nil {
|
|
log.Printf("fabric service channel access event marshal failed: %v", err)
|
|
return
|
|
}
|
|
log.Printf("fabric_service_channel_access_event=%s", string(payload))
|
|
},
|
|
FabricSessionEnabled: cfg.MeshFabricSessionEnabled,
|
|
FabricSessionLogger: func(entry mesh.FabricSessionEventLogEntry) {
|
|
payload, err := json.Marshal(entry)
|
|
if err != nil {
|
|
log.Printf("fabric session event marshal failed: %v", err)
|
|
return
|
|
}
|
|
log.Printf("fabric_session_event=%s", string(payload))
|
|
},
|
|
RemoteWorkspaceFrameSink: remoteWorkspaceFrameSink,
|
|
ProductionRoutes: routes,
|
|
VPNPacketIngress: vpnFabricIngress,
|
|
BackendProxyBaseURL: cfg.BackendURL,
|
|
ClusterAuthorityPublicKey: firstNonEmpty(identity.ClusterAuthorityPublicKey, cfg.ClusterAuthorityPublicKey),
|
|
}.Handler()
|
|
dynamicListenerHandler := newDynamicHTTPHandler(serverHandler)
|
|
listenerCfg := meshListenerRuntimeConfig(cfg, loadedConfig.MeshListener)
|
|
listenerReport, stopListener := startSyntheticMeshHTTPServer(ctx, listenerCfg, identity, dynamicListenerHandler, len(peerEndpoints), len(routes), gateEnabled, runtimeEnabled)
|
|
vpnFabricSessionPeers := mesh.NewFabricSessionPeerManager()
|
|
quicFabricServer, quicFabricAddr, quicFabricCertSHA256, quicFabricErr := startQUICFabricEndpoint(ctx, cfg, identity)
|
|
return &syntheticMeshState{
|
|
Runtime: runtime,
|
|
Routes: routes,
|
|
RouteHealthRoutes: routeHealthRoutes,
|
|
Source: loadedConfig.Source,
|
|
PeerCache: peerCache,
|
|
RendezvousLeases: loadedConfig.RendezvousLeases,
|
|
RoutePathDecisions: loadedConfig.RoutePathDecisions,
|
|
ServiceChannelFeedback: loadedConfig.ServiceChannelFeedback,
|
|
ServiceChannelRemediationCommands: append([]client.FabricServiceChannelRemediationCommand{}, loadedConfig.ServiceChannelRemediationCommands...),
|
|
RouteGenerationTracker: routeGenerationTracker,
|
|
ConfigVersion: loadedConfig.ConfigVersion,
|
|
PeerDirectoryVersion: loadedConfig.PeerDirectoryVersion,
|
|
PolicyVersion: loadedConfig.PolicyVersion,
|
|
LastConfigRefreshAt: time.Now().UTC(),
|
|
PeerConnections: peerConnections,
|
|
PeerConnectionManager: peerConnectionManager,
|
|
LastPeerRecoveryPlan: &peerRecoveryPlan,
|
|
LastPeerConnectionIntent: &peerConnectionIntentPlan,
|
|
ProductionObservationSink: productionObservationSink,
|
|
ProductionForwardTransport: productionForwardTransport,
|
|
ProductionForwardingEnabled: productionForwardingEnabled,
|
|
VPNFabricInbox: vpnFabricInbox,
|
|
VPNFabricIngress: vpnFabricIngress,
|
|
VPNFabricSessionPeers: vpnFabricSessionPeers,
|
|
VPNFabricTransport: mesh.NewWebSocketFabricTransport(vpnFabricSessionPeers),
|
|
VPNFabricQUICTransport: newVPNFabricQUICTransport(cfg),
|
|
VPNFabricSessionDialStats: newVPNFabricSessionDialStats(),
|
|
VPNFabricEndpointObservations: newVPNFabricEndpointObservationStore(identity.NodeID),
|
|
PeerEndpoints: copyStringMap(peerEndpoints),
|
|
PeerEndpointCandidates: copyPeerEndpointCandidatesMap(loadedConfig.PeerEndpointCandidates),
|
|
PeerEndpointObservations: copyEndpointCandidateObservations(loadedConfig.PeerEndpointObservations),
|
|
VPNGateway: vpnGateway,
|
|
ServiceChannelAccessStats: serviceChannelAccessStats,
|
|
RemoteWorkspaceFrameSink: remoteWorkspaceFrameSink,
|
|
ListenerReport: listenerReport,
|
|
ListenerConfigKey: meshListenerConfigKey(listenerCfg),
|
|
ListenerRuntimeConfig: listenerCfg,
|
|
ListenerHandler: dynamicListenerHandler,
|
|
StopListener: stopListener,
|
|
QUICFabricServer: quicFabricServer,
|
|
QUICFabricListenAddr: quicFabricAddr,
|
|
QUICFabricCertSHA256: quicFabricCertSHA256,
|
|
QUICFabricError: errorString(quicFabricErr),
|
|
ConfigLoadError: errorString(err),
|
|
}, stopListener, nil
|
|
}
|
|
|
|
func productionForwardingLogState(cfg config.Config, signedControlPlaneEnabled bool) (gateEnabled bool, runtimeEnabled bool) {
|
|
enabled := cfg.MeshProductionForwardingEnabled || signedControlPlaneEnabled
|
|
return enabled, enabled
|
|
}
|
|
|
|
func newVPNFabricIngress(meshState *syntheticMeshState, identity state.Identity, routes []mesh.SyntheticRoute, decisions *client.RoutePathDecisionReport, remediationCommands []client.FabricServiceChannelRemediationCommand, serviceChannelFeedback *client.FabricServiceChannelFeedbackReport, adaptivePolicy *client.FabricServiceChannelAdaptivePolicy, configVersion string, vpnGateway *vpnruntime.Gateway) *vpnruntime.FabricClientPacketIngress {
|
|
if meshState == nil || meshState.VPNFabricInbox == nil {
|
|
return nil
|
|
}
|
|
ingress := meshState.VPNFabricIngress
|
|
if ingress == nil {
|
|
ingress = &vpnruntime.FabricClientPacketIngress{}
|
|
}
|
|
ingress.PreventLastRouteWithdrawal = true
|
|
ingress.UpdateRuntime(
|
|
meshState.ProductionForwardTransport,
|
|
meshState.VPNFabricInbox,
|
|
identity.ClusterID,
|
|
identity.NodeID,
|
|
func(vpnConnectionID string) bool {
|
|
return vpnGateway != nil && vpnGateway.IsReadyForConnection(vpnConnectionID)
|
|
},
|
|
func() []mesh.SyntheticRoute {
|
|
return routes
|
|
},
|
|
serviceChannelRecoveryPolicyFingerprint(serviceChannelFeedback),
|
|
vpnruntimeAdaptivePolicy(adaptivePolicy),
|
|
)
|
|
appliedAt := time.Now().UTC()
|
|
ingress.UpdateRouteManager(routeManagerDecisionsFromControlPlane(decisions, remediationCommands), configVersion, appliedAt)
|
|
ingress.UpdateRouteQualityPreferences(routeQualityPreferencesFromServiceChannelFeedback(serviceChannelFeedback, appliedAt), appliedAt)
|
|
return ingress
|
|
}
|
|
|
|
func newVPNFabricQUICTransport(cfg config.Config) *mesh.QUICFabricTransport {
|
|
transport := mesh.NewQUICFabricTransport(nil)
|
|
if cfg.VPNFabricQUICMaxStreamsPerConn > 0 {
|
|
transport.MaxStreamsPerConn = cfg.VPNFabricQUICMaxStreamsPerConn
|
|
}
|
|
if cfg.VPNFabricQUICIdleTTL > 0 {
|
|
transport.IdleTTL = cfg.VPNFabricQUICIdleTTL
|
|
}
|
|
return transport
|
|
}
|
|
|
|
func vpnruntimeAdaptivePolicy(policy *client.FabricServiceChannelAdaptivePolicy) vpnruntime.FabricServiceChannelAdaptivePolicy {
|
|
if policy == nil {
|
|
return vpnruntime.FabricServiceChannelAdaptivePolicy{}
|
|
}
|
|
return vpnruntime.FabricServiceChannelAdaptivePolicy{
|
|
SchemaVersion: policy.SchemaVersion,
|
|
Fingerprint: policy.Fingerprint,
|
|
MaxParallelWindow: policy.MaxParallelWindow,
|
|
BulkPressureChannelThreshold: policy.BulkPressureChannelThreshold,
|
|
QueuePressureHighWatermark: policy.QueuePressureHighWatermark,
|
|
QueuePressureMaxInFlight: policy.QueuePressureMaxInFlight,
|
|
ClassWindows: policy.ClassWindows,
|
|
}
|
|
}
|
|
|
|
func serviceChannelRecoveryPolicyFingerprint(report *client.FabricServiceChannelFeedbackReport) string {
|
|
if report == nil || report.RecoveryPolicy == nil {
|
|
return ""
|
|
}
|
|
return strings.TrimSpace(report.RecoveryPolicy.Fingerprint)
|
|
}
|
|
|
|
func routeQualityPreferencesFromServiceChannelFeedback(report *client.FabricServiceChannelFeedbackReport, observedAt time.Time) []vpnruntime.FabricServiceChannelRouteQualityPreference {
|
|
if report == nil {
|
|
return nil
|
|
}
|
|
now := observedAt.UTC()
|
|
if now.IsZero() {
|
|
now = time.Now().UTC()
|
|
}
|
|
out := make([]vpnruntime.FabricServiceChannelRouteQualityPreference, 0, len(report.Observations))
|
|
for _, observation := range report.Observations {
|
|
effectiveScore := observation.EffectiveScoreAdjustment
|
|
if effectiveScore <= 0 {
|
|
effectiveScore = observation.ScoreAdjustment
|
|
}
|
|
if strings.TrimSpace(observation.RouteID) == "" || strings.TrimSpace(observation.FeedbackStatus) != "healthy" || effectiveScore <= 0 {
|
|
continue
|
|
}
|
|
if !observation.ExpiresAt.IsZero() && !observation.ExpiresAt.After(now) {
|
|
continue
|
|
}
|
|
out = append(out, vpnruntime.FabricServiceChannelRouteQualityPreference{
|
|
RouteID: observation.RouteID,
|
|
FeedbackStatus: observation.FeedbackStatus,
|
|
ScoreAdjustment: effectiveScore,
|
|
RawScoreAdjustment: observation.ScoreAdjustment,
|
|
Reasons: append([]string{}, observation.Reasons...),
|
|
LastSendDurationMs: observation.LastSendDurationMs,
|
|
ObservedAt: observation.ObservedAt.UTC().Format(time.RFC3339Nano),
|
|
ExpiresAt: observation.ExpiresAt.UTC().Format(time.RFC3339Nano),
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func routeManagerDecisionsFromPathDecisions(report *client.RoutePathDecisionReport) []vpnruntime.FabricServiceChannelRouteManagerDecision {
|
|
if report == nil {
|
|
return nil
|
|
}
|
|
out := make([]vpnruntime.FabricServiceChannelRouteManagerDecision, 0, len(report.Decisions))
|
|
for _, decision := range report.Decisions {
|
|
if strings.TrimSpace(decision.RebuildStatus) == "" {
|
|
continue
|
|
}
|
|
out = append(out, vpnruntime.FabricServiceChannelRouteManagerDecision{
|
|
RouteID: decision.RouteID,
|
|
ReplacementRouteID: decision.ReplacementRouteID,
|
|
RebuildRequestID: decision.RebuildRequestID,
|
|
RebuildStatus: decision.RebuildStatus,
|
|
RebuildReason: decision.RebuildReason,
|
|
RebuildAttempt: decision.RebuildAttempt,
|
|
DecisionSource: decision.DecisionSource,
|
|
Generation: decision.Generation,
|
|
EffectiveHops: append([]string{}, decision.EffectiveHops...),
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func routeManagerDecisionsFromControlPlane(report *client.RoutePathDecisionReport, commands []client.FabricServiceChannelRemediationCommand) []vpnruntime.FabricServiceChannelRouteManagerDecision {
|
|
out := routeManagerDecisionsFromPathDecisions(report)
|
|
if len(commands) == 0 {
|
|
return out
|
|
}
|
|
decisionByRequestID := map[string]struct{}{}
|
|
for _, decision := range out {
|
|
if requestID := strings.TrimSpace(decision.RebuildRequestID); requestID != "" {
|
|
decisionByRequestID[requestID] = struct{}{}
|
|
}
|
|
}
|
|
now := time.Now().UTC()
|
|
for _, command := range commands {
|
|
action := strings.TrimSpace(command.Action)
|
|
if action != "prefer_alternate_route" && action != "rebuild_route" {
|
|
continue
|
|
}
|
|
guardStatus := strings.TrimSpace(command.GuardStatus)
|
|
if guardStatus != "" && guardStatus != "allowed" {
|
|
continue
|
|
}
|
|
primaryRouteID := strings.TrimSpace(command.PrimaryRouteID)
|
|
replacementRouteID := strings.TrimSpace(command.ReplacementRouteID)
|
|
if primaryRouteID == "" {
|
|
continue
|
|
}
|
|
if !command.ExpiresAt.IsZero() && !command.ExpiresAt.After(now) {
|
|
continue
|
|
}
|
|
if commandID := strings.TrimSpace(command.CommandID); commandID != "" {
|
|
if _, ok := decisionByRequestID[commandID]; ok {
|
|
continue
|
|
}
|
|
}
|
|
rebuildStatus := "pending_degraded_fallback"
|
|
if action == "prefer_alternate_route" {
|
|
if replacementRouteID == "" || primaryRouteID == replacementRouteID {
|
|
continue
|
|
}
|
|
rebuildStatus = "applied"
|
|
}
|
|
out = append(out, vpnruntime.FabricServiceChannelRouteManagerDecision{
|
|
RouteID: primaryRouteID,
|
|
ReplacementRouteID: replacementRouteID,
|
|
RebuildRequestID: strings.TrimSpace(command.CommandID),
|
|
RebuildStatus: rebuildStatus,
|
|
RebuildReason: firstNonEmpty(command.Reason, "service_channel_remediation_"+action),
|
|
DecisionSource: "service_channel_remediation_command",
|
|
Generation: strings.TrimSpace(command.CommandID),
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
// errorString returns err's message, or the empty string for a nil error.
func errorString(err error) string {
	if err != nil {
		return err.Error()
	}
	return ""
}
|
|
|
|
func startSyntheticMeshHTTPServer(ctx context.Context, cfg config.Config, identity state.Identity, handler http.Handler, peerCount int, routeCount int, gateEnabled bool, runtimeEnabled bool) (meshListenerReport, func()) {
|
|
now := time.Now().UTC()
|
|
mode := defaultString(cfg.MeshListenPortMode, "manual")
|
|
baseReport := meshListenerReport{
|
|
SchemaVersion: "c17z21.mesh_listener_report.v1",
|
|
ConfiguredListenAddr: cfg.MeshListenAddr,
|
|
ListenPortMode: mode,
|
|
Status: "disabled",
|
|
InboundReachability: "unavailable",
|
|
ControlPlaneReachable: true,
|
|
OneWayConnectivity: true,
|
|
ObservedAt: now.Format(time.RFC3339Nano),
|
|
}
|
|
if mode == "disabled" || strings.TrimSpace(cfg.MeshListenAddr) == "" {
|
|
if strings.TrimSpace(cfg.MeshListenAddr) == "" {
|
|
baseReport.FailureReason = "listen_addr_empty"
|
|
log.Print("synthetic mesh runtime enabled, but RAP_MESH_LISTEN_ADDR is empty; inbound endpoint disabled")
|
|
} else {
|
|
baseReport.FailureReason = "listen_disabled"
|
|
log.Printf("synthetic mesh endpoint disabled by listen port mode: node_id=%s cluster_id=%s", identity.NodeID, identity.ClusterID)
|
|
}
|
|
return baseReport, func() {}
|
|
}
|
|
|
|
listener, effectiveAddr, autoSelected, bindErr := bindSyntheticMeshListener(cfg)
|
|
if bindErr != nil {
|
|
baseReport.Status = "listen_failed"
|
|
baseReport.FailureReason = "bind_failed"
|
|
baseReport.FailureError = bindErr.Error()
|
|
baseReport.PortConflict = isAddressInUse(bindErr)
|
|
log.Printf("synthetic mesh endpoint unavailable: listen_addr=%s mode=%s node_id=%s cluster_id=%s err=%v", cfg.MeshListenAddr, mode, identity.NodeID, identity.ClusterID, bindErr)
|
|
return baseReport, func() {}
|
|
}
|
|
|
|
report := baseReport
|
|
report.Status = "listening"
|
|
if autoSelected {
|
|
report.Status = "auto_rebound"
|
|
}
|
|
report.EffectiveListenAddr = effectiveAddr
|
|
report.InboundReachability = reachabilityFromConnectivityMode(cfg.MeshConnectivityMode)
|
|
report.OneWayConnectivity = cfg.MeshConnectivityMode == "outbound_only"
|
|
report.AutoPortSelected = autoSelected
|
|
server := &http.Server{
|
|
Addr: effectiveAddr,
|
|
Handler: handler,
|
|
ReadHeaderTimeout: 5 * time.Second,
|
|
}
|
|
go func() {
|
|
log.Printf(
|
|
"synthetic mesh endpoint starting: listen_addr=%s effective_listen_addr=%s mode=%s node_id=%s cluster_id=%s peers=%d routes=%d production_forwarding_gate_enabled=%t production_forwarding_runtime_enabled=%t",
|
|
cfg.MeshListenAddr,
|
|
effectiveAddr,
|
|
mode,
|
|
identity.NodeID,
|
|
identity.ClusterID,
|
|
peerCount,
|
|
routeCount,
|
|
gateEnabled,
|
|
runtimeEnabled,
|
|
)
|
|
if err := server.Serve(listener); err != nil && err != http.ErrServerClosed {
|
|
log.Printf("synthetic mesh endpoint stopped unexpectedly: %v", err)
|
|
}
|
|
}()
|
|
go func() {
|
|
<-ctx.Done()
|
|
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second)
|
|
defer shutdownCancel()
|
|
if err := server.Shutdown(shutdownCtx); err != nil {
|
|
log.Printf("synthetic mesh endpoint shutdown failed: %v", err)
|
|
}
|
|
}()
|
|
return report, func() {
|
|
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second)
|
|
defer shutdownCancel()
|
|
_ = server.Shutdown(shutdownCtx)
|
|
}
|
|
}
|
|
|
|
func meshListenerRuntimeConfig(base config.Config, desired *client.MeshListenerConfig) config.Config {
|
|
out := base
|
|
if desired == nil {
|
|
return out
|
|
}
|
|
if desired.ListenAddr != "" {
|
|
out.MeshListenAddr = strings.TrimSpace(desired.ListenAddr)
|
|
}
|
|
if desired.ListenPortMode != "" {
|
|
out.MeshListenPortMode = strings.ToLower(strings.TrimSpace(desired.ListenPortMode))
|
|
}
|
|
if desired.DesiredState != "" && desired.DesiredState != "enabled" {
|
|
out.MeshListenPortMode = "disabled"
|
|
}
|
|
if desired.AutoPortStart > 0 {
|
|
out.MeshListenAutoPortStart = desired.AutoPortStart
|
|
}
|
|
if desired.AutoPortEnd > 0 {
|
|
out.MeshListenAutoPortEnd = desired.AutoPortEnd
|
|
}
|
|
if desired.AdvertiseEndpoint != "" {
|
|
out.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(desired.AdvertiseEndpoint), "/")
|
|
}
|
|
if desired.AdvertiseTransport != "" {
|
|
out.MeshAdvertiseTransport = strings.TrimSpace(desired.AdvertiseTransport)
|
|
}
|
|
if desired.ConnectivityMode != "" {
|
|
out.MeshConnectivityMode = strings.TrimSpace(desired.ConnectivityMode)
|
|
}
|
|
if desired.NATType != "" {
|
|
out.MeshNATType = strings.TrimSpace(desired.NATType)
|
|
}
|
|
if desired.Region != "" {
|
|
out.MeshRegion = strings.TrimSpace(desired.Region)
|
|
}
|
|
out.MeshProductionForwardingEnabled = base.MeshProductionForwardingEnabled || desired.ProductionForwarding
|
|
return out
|
|
}
|
|
|
|
func meshListenerConfigKey(cfg config.Config) string {
|
|
return strings.Join([]string{
|
|
strings.TrimSpace(cfg.MeshListenAddr),
|
|
strings.ToLower(strings.TrimSpace(cfg.MeshListenPortMode)),
|
|
fmt.Sprintf("%d", cfg.MeshListenAutoPortStart),
|
|
fmt.Sprintf("%d", cfg.MeshListenAutoPortEnd),
|
|
strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/"),
|
|
strings.TrimSpace(cfg.MeshAdvertiseTransport),
|
|
strings.TrimSpace(cfg.MeshConnectivityMode),
|
|
strings.TrimSpace(cfg.MeshNATType),
|
|
strings.TrimSpace(cfg.MeshRegion),
|
|
fmt.Sprintf("%t", cfg.MeshProductionForwardingEnabled),
|
|
fmt.Sprintf("%t", cfg.VPNFabricSessionTransportEnabled),
|
|
fmt.Sprintf("%t", cfg.MeshQUICFabricEnabled),
|
|
strings.TrimSpace(cfg.MeshQUICFabricListenAddr),
|
|
}, "|")
|
|
}
|
|
|
|
func bindSyntheticMeshListener(cfg config.Config) (net.Listener, string, bool, error) {
|
|
listener, err := net.Listen("tcp", cfg.MeshListenAddr)
|
|
if err == nil {
|
|
return listener, listener.Addr().String(), false, nil
|
|
}
|
|
if cfg.MeshListenPortMode != "auto" {
|
|
return nil, "", false, err
|
|
}
|
|
host, _, splitErr := net.SplitHostPort(cfg.MeshListenAddr)
|
|
if splitErr != nil {
|
|
host = ""
|
|
}
|
|
for port := cfg.MeshListenAutoPortStart; port <= cfg.MeshListenAutoPortEnd; port++ {
|
|
addr := net.JoinHostPort(host, fmt.Sprintf("%d", port))
|
|
listener, listenErr := net.Listen("tcp", addr)
|
|
if listenErr == nil {
|
|
return listener, listener.Addr().String(), true, nil
|
|
}
|
|
}
|
|
return nil, "", false, err
|
|
}
|
|
|
|
func startQUICFabricEndpoint(ctx context.Context, cfg config.Config, identity state.Identity) (*mesh.QUICFabricServer, string, string, error) {
|
|
if !cfg.MeshQUICFabricEnabled {
|
|
return nil, "", "", nil
|
|
}
|
|
if strings.TrimSpace(cfg.MeshQUICFabricListenAddr) == "" {
|
|
return nil, "", "", fmt.Errorf("quic fabric enabled but listen addr is empty")
|
|
}
|
|
tlsConfig, certSHA256, err := quicFabricTLSConfig(identity)
|
|
if err != nil {
|
|
return nil, "", "", err
|
|
}
|
|
server, err := mesh.StartQUICFabricServer(ctx, mesh.QUICFabricServerConfig{
|
|
ListenAddr: cfg.MeshQUICFabricListenAddr,
|
|
TLSConfig: tlsConfig,
|
|
Logger: func(entry mesh.FabricSessionEventLogEntry) {
|
|
payload, err := json.Marshal(entry)
|
|
if err != nil {
|
|
log.Printf("fabric quic event marshal failed: %v", err)
|
|
return
|
|
}
|
|
log.Printf("fabric_quic_event=%s", string(payload))
|
|
},
|
|
})
|
|
if err != nil {
|
|
return nil, "", "", err
|
|
}
|
|
addr := ""
|
|
if server.Addr() != nil {
|
|
addr = server.Addr().String()
|
|
}
|
|
log.Printf("quic fabric endpoint enabled: listen_addr=%s effective_addr=%s node_id=%s cluster_id=%s", cfg.MeshQUICFabricListenAddr, addr, identity.NodeID, identity.ClusterID)
|
|
return server, addr, certSHA256, nil
|
|
}
|
|
|
|
func quicFabricTLSConfig(identity state.Identity) (*tls.Config, string, error) {
|
|
key, err := rsa.GenerateKey(rand.Reader, 2048)
|
|
if err != nil {
|
|
return nil, "", err
|
|
}
|
|
commonName := firstNonEmpty(identity.NodeID, "rap-fabric-node")
|
|
template := x509.Certificate{
|
|
SerialNumber: big.NewInt(time.Now().UnixNano()),
|
|
Subject: pkix.Name{CommonName: commonName},
|
|
NotBefore: time.Now().Add(-time.Minute),
|
|
NotAfter: time.Now().Add(24 * time.Hour),
|
|
KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature,
|
|
ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
|
|
DNSNames: []string{commonName, "localhost"},
|
|
}
|
|
certDER, err := x509.CreateCertificate(rand.Reader, &template, &template, &key.PublicKey, key)
|
|
if err != nil {
|
|
return nil, "", err
|
|
}
|
|
sum := sha256.Sum256(certDER)
|
|
return &tls.Config{
|
|
Certificates: []tls.Certificate{{
|
|
Certificate: [][]byte{certDER},
|
|
PrivateKey: key,
|
|
}},
|
|
NextProtos: []string{"rap-fabric-data-session-v1"},
|
|
}, hex.EncodeToString(sum[:]), nil
|
|
}
|
|
|
|
// isAddressInUse reports whether err indicates that a listen address is
// already bound by another socket.
//
// The error chain is checked for the address-in-use errno first (EADDRINUSE,
// or Winsock's WSAEADDRINUSE), which works even when the message text is
// wrapped or localized. The English message text is then matched as a
// fallback for errors that do not carry an errno.
func isAddressInUse(err error) bool {
	if err == nil {
		return false
	}

	// WSAEADDRINUSE as documented in the Windows Sockets error code list.
	const wsaAddrInUse = syscall.Errno(10048)

	if errors.Is(err, syscall.EADDRINUSE) {
		return true
	}
	var errno syscall.Errno
	if errors.As(err, &errno) && errno == wsaAddrInUse {
		return true
	}

	text := strings.ToLower(err.Error())
	return strings.Contains(text, "address already in use") || strings.Contains(text, "only one usage of each socket address")
}
|
|
|
|
func productionEnvelopeObservationSinkFromConfig(cfg config.Config) *mesh.ProductionEnvelopeObservationSink {
|
|
if cfg.MeshProductionObservationSinkCapacity <= 0 {
|
|
return nil
|
|
}
|
|
sink := mesh.NewProductionEnvelopeObservationSink(cfg.MeshProductionObservationSinkCapacity)
|
|
log.Printf("production envelope observation sink enabled: capacity=%d payload_storage=false", sink.Capacity())
|
|
return sink
|
|
}
|
|
|
|
func logProductionObservationSinkMetrics(meshState *syntheticMeshState) {
|
|
if meshState == nil || meshState.ProductionObservationSink == nil {
|
|
return
|
|
}
|
|
metrics := meshState.ProductionObservationSink.Metrics()
|
|
if meshState.LastProductionSinkMetrics != nil && productionObservationSinkMetricsEqual(*meshState.LastProductionSinkMetrics, metrics) {
|
|
return
|
|
}
|
|
meshState.LastProductionSinkMetrics = &metrics
|
|
log.Printf(
|
|
"production envelope observation sink metrics: capacity=%d current_depth=%d accepted_total=%d dropped_oldest=%d payload_storage=false",
|
|
metrics.Capacity,
|
|
metrics.CurrentDepth,
|
|
metrics.AcceptedTotal,
|
|
metrics.DroppedOldest,
|
|
)
|
|
}
|
|
|
|
func productionObservationSinkMetricsEqual(a, b mesh.ProductionEnvelopeObservationSinkMetrics) bool {
|
|
return a.Capacity == b.Capacity &&
|
|
a.CurrentDepth == b.CurrentDepth &&
|
|
a.AcceptedTotal == b.AcceptedTotal &&
|
|
a.DroppedOldest == b.DroppedOldest
|
|
}
|
|
|
|
func loadSyntheticMeshConfig(ctx context.Context, cfg config.Config, identity state.Identity, api *client.Client) (loadedSyntheticMeshConfig, error) {
|
|
local := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID}
|
|
if cfg.MeshSyntheticConfigPath != "" {
|
|
scoped, err := mesh.LoadScopedSyntheticConfig(cfg.MeshSyntheticConfigPath, local)
|
|
if err != nil {
|
|
return loadedSyntheticMeshConfig{}, err
|
|
}
|
|
return loadedSyntheticMeshConfig{
|
|
PeerEndpoints: scoped.PeerEndpoints,
|
|
PeerEndpointCandidates: scoped.PeerEndpointCandidates,
|
|
PeerEndpointObservations: scoped.PeerEndpointObservations,
|
|
PeerDirectory: scoped.PeerDirectory,
|
|
RecoverySeeds: scoped.RecoverySeeds,
|
|
RendezvousLeases: scoped.RendezvousLeases,
|
|
RoutePathDecisions: nil,
|
|
Routes: scoped.Routes,
|
|
Source: "scoped_config",
|
|
ConfigVersion: scoped.ConfigVersion,
|
|
PeerDirectoryVersion: scoped.PeerDirectoryVersion,
|
|
PolicyVersion: scoped.PolicyVersion,
|
|
ProductionForwarding: false,
|
|
}, nil
|
|
}
|
|
if api != nil {
|
|
remote, err := api.SyntheticMeshConfig(ctx, local.ClusterID, local.NodeID)
|
|
if err == nil {
|
|
if verifyErr := verifyControlPlaneSyntheticMeshConfig(remote, identity, cfg); verifyErr != nil {
|
|
return loadedSyntheticMeshConfig{}, verifyErr
|
|
}
|
|
}
|
|
if err == nil && remote.Enabled {
|
|
return loadedSyntheticMeshConfig{
|
|
PeerEndpoints: remote.PeerEndpoints,
|
|
PeerEndpointCandidates: peerEndpointCandidatesFromControlPlane(remote.PeerEndpointCandidates),
|
|
PeerEndpointObservations: endpointCandidateObservationsFromControlPlane(remote.PeerEndpointObservations),
|
|
PeerDirectory: peerDirectoryFromControlPlane(remote.PeerDirectory),
|
|
RecoverySeeds: recoverySeedsFromControlPlane(remote.RecoverySeeds),
|
|
RendezvousLeases: rendezvousLeasesFromControlPlane(remote.RendezvousLeases),
|
|
RoutePathDecisions: remote.RoutePathDecisions,
|
|
ServiceChannelFeedback: remote.ServiceChannelFeedback,
|
|
ServiceChannelAdaptivePolicy: remote.ServiceChannelAdaptivePolicy,
|
|
ServiceChannelRemediationCommands: append([]client.FabricServiceChannelRemediationCommand{}, remote.ServiceChannelRemediationCommands...),
|
|
MeshListener: remote.MeshListener,
|
|
Routes: syntheticRoutesFromControlPlane(remote.Routes),
|
|
Source: "control_plane",
|
|
ConfigVersion: remote.ConfigVersion,
|
|
PeerDirectoryVersion: remote.PeerDirectoryVersion,
|
|
PolicyVersion: remote.PolicyVersion,
|
|
ProductionForwarding: remote.ProductionForwarding,
|
|
}, nil
|
|
}
|
|
if err != nil {
|
|
log.Printf("control-plane synthetic mesh config unavailable, falling back to debug JSON: %v", err)
|
|
}
|
|
}
|
|
peerEndpoints, err := parseMeshPeerEndpoints(cfg.MeshPeerEndpointsJSON)
|
|
if err != nil {
|
|
return loadedSyntheticMeshConfig{}, err
|
|
}
|
|
routes, err := parseMeshSyntheticRoutes(cfg.MeshSyntheticRoutesJSON)
|
|
if err != nil {
|
|
return loadedSyntheticMeshConfig{}, err
|
|
}
|
|
return loadedSyntheticMeshConfig{
|
|
PeerEndpoints: peerEndpoints,
|
|
Routes: routes,
|
|
Source: "debug_json",
|
|
}, nil
|
|
}
|
|
|
|
// controlPlaneMeshConfigAuthorityPayload is the JSON document decoded from
// the authority payload of a control-plane synthetic mesh config. Its fields
// are cross-checked against the delivered config by
// verifyControlPlaneSyntheticMeshConfig.
type controlPlaneMeshConfigAuthorityPayload struct {
	// SchemaVersion identifies the payload schema.
	SchemaVersion string `json:"schema_version"`
	// ClusterID is the cluster the signed config belongs to.
	ClusterID string `json:"cluster_id"`
	// LocalNodeID is the node the signed config was issued for.
	LocalNodeID string `json:"local_node_id"`
	// ConfigVersion is the version of the signed config.
	ConfigVersion string `json:"config_version"`
	// ConfigSHA256 is the expected hash of the config without its authority
	// payload and signature.
	ConfigSHA256 string `json:"config_sha256"`
	// IssuedAt is the issuance timestamp carried by the payload.
	IssuedAt time.Time `json:"issued_at"`
	// ExpiresAt is the expiry timestamp; a zero value is treated as "no expiry"
	// by the verifier.
	ExpiresAt time.Time `json:"expires_at"`
	// ControlPlaneOnly and ProductionForwarding must differ for the payload to
	// be accepted by the verifier.
	ControlPlaneOnly     bool `json:"control_plane_only"`
	ProductionForwarding bool `json:"production_forwarding"`
}
|
|
|
|
func verifyControlPlaneSyntheticMeshConfig(remote client.SyntheticMeshConfig, identity state.Identity, cfg config.Config) error {
|
|
signaturePresent := remote.ClusterAuthority != nil || len(remote.AuthorityPayload) > 0 || remote.AuthoritySignature != nil
|
|
if !remote.AuthorityRequired && !signaturePresent {
|
|
return nil
|
|
}
|
|
if remote.ClusterAuthority == nil {
|
|
return fmt.Errorf("control-plane synthetic mesh config requires cluster authority")
|
|
}
|
|
if remote.AuthoritySignature == nil || rawMessageEmpty(remote.AuthorityPayload) {
|
|
return fmt.Errorf("control-plane synthetic mesh config requires authority payload and signature")
|
|
}
|
|
if remote.ClusterAuthority.SchemaVersion != authority.AuthoritySchemaVersion {
|
|
return fmt.Errorf("control-plane synthetic mesh config authority schema mismatch")
|
|
}
|
|
if remote.ClusterAuthority.ClusterID != identity.ClusterID || remote.ClusterAuthority.ClusterID != remote.ClusterID {
|
|
return fmt.Errorf("control-plane synthetic mesh config authority cluster mismatch")
|
|
}
|
|
if remote.ClusterAuthority.KeyAlgorithm != authority.AlgorithmEd25519 {
|
|
return fmt.Errorf("control-plane synthetic mesh config authority algorithm mismatch")
|
|
}
|
|
if remote.AuthoritySignature.KeyFingerprint != remote.ClusterAuthority.PublicKeyFingerprint {
|
|
return fmt.Errorf("control-plane synthetic mesh config signature fingerprint mismatch")
|
|
}
|
|
if pinned := firstNonEmpty(identity.ClusterAuthorityFingerprint, cfg.ClusterAuthorityFingerprint); pinned != "" && pinned != remote.ClusterAuthority.PublicKeyFingerprint {
|
|
return fmt.Errorf("control-plane synthetic mesh config authority fingerprint mismatch")
|
|
}
|
|
if pinned := firstNonEmpty(identity.ClusterAuthorityPublicKey, cfg.ClusterAuthorityPublicKey); pinned != "" && pinned != remote.ClusterAuthority.PublicKey {
|
|
return fmt.Errorf("control-plane synthetic mesh config authority public key mismatch")
|
|
}
|
|
signature := authority.Signature{
|
|
SchemaVersion: remote.AuthoritySignature.SchemaVersion,
|
|
Algorithm: remote.AuthoritySignature.Algorithm,
|
|
KeyFingerprint: remote.AuthoritySignature.KeyFingerprint,
|
|
Signature: remote.AuthoritySignature.Signature,
|
|
}
|
|
if err := authority.VerifyRaw(remote.ClusterAuthority.PublicKey, remote.AuthorityPayload, signature); err != nil {
|
|
return fmt.Errorf("verify control-plane synthetic mesh config authority signature: %w", err)
|
|
}
|
|
var payload controlPlaneMeshConfigAuthorityPayload
|
|
if err := json.Unmarshal(remote.AuthorityPayload, &payload); err != nil {
|
|
return fmt.Errorf("decode control-plane synthetic mesh config authority payload: %w", err)
|
|
}
|
|
if payload.SchemaVersion != "rap.cluster.mesh_config_snapshot.v1" {
|
|
return fmt.Errorf("control-plane synthetic mesh config authority payload schema mismatch")
|
|
}
|
|
if payload.ClusterID != identity.ClusterID || payload.ClusterID != remote.ClusterID {
|
|
return fmt.Errorf("control-plane synthetic mesh config authority payload cluster mismatch")
|
|
}
|
|
if payload.LocalNodeID != identity.NodeID || payload.LocalNodeID != remote.LocalNodeID {
|
|
return fmt.Errorf("control-plane synthetic mesh config authority payload node mismatch")
|
|
}
|
|
if payload.ConfigVersion != remote.ConfigVersion {
|
|
return fmt.Errorf("control-plane synthetic mesh config authority payload version mismatch")
|
|
}
|
|
if payload.ControlPlaneOnly == payload.ProductionForwarding {
|
|
return fmt.Errorf("synthetic mesh config authority payload control-plane/production forwarding flags mismatch")
|
|
}
|
|
if payload.ProductionForwarding != remote.ProductionForwarding {
|
|
return fmt.Errorf("synthetic mesh config authority payload production forwarding mismatch")
|
|
}
|
|
if !payload.ExpiresAt.IsZero() && !payload.ExpiresAt.After(time.Now().UTC()) {
|
|
return fmt.Errorf("control-plane synthetic mesh config authority payload expired")
|
|
}
|
|
configHash, err := syntheticMeshConfigAuthorityHash(remote)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if payload.ConfigSHA256 != configHash {
|
|
return fmt.Errorf("control-plane synthetic mesh config authority payload hash mismatch")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func syntheticMeshConfigAuthorityHash(remote client.SyntheticMeshConfig) (string, error) {
|
|
if !rawMessageEmpty(remote.Raw) {
|
|
var unsigned map[string]json.RawMessage
|
|
if err := json.Unmarshal(remote.Raw, &unsigned); err != nil {
|
|
return "", fmt.Errorf("decode raw control-plane synthetic mesh config for authority hash: %w", err)
|
|
}
|
|
delete(unsigned, "authority_payload")
|
|
delete(unsigned, "authority_signature")
|
|
raw, err := json.Marshal(unsigned)
|
|
if err != nil {
|
|
return "", fmt.Errorf("marshal raw control-plane synthetic mesh config for authority hash: %w", err)
|
|
}
|
|
hash, err := authority.HashRaw(raw)
|
|
if err != nil {
|
|
return "", fmt.Errorf("hash raw control-plane synthetic mesh config authority payload: %w", err)
|
|
}
|
|
return hash, nil
|
|
}
|
|
unsigned := remote
|
|
unsigned.Raw = nil
|
|
unsigned.AuthorityPayload = nil
|
|
unsigned.AuthoritySignature = nil
|
|
raw, err := json.Marshal(unsigned)
|
|
if err != nil {
|
|
return "", fmt.Errorf("marshal control-plane synthetic mesh config for authority hash: %w", err)
|
|
}
|
|
hash, err := authority.HashRaw(raw)
|
|
if err != nil {
|
|
return "", fmt.Errorf("hash control-plane synthetic mesh config authority payload: %w", err)
|
|
}
|
|
return hash, nil
|
|
}
|
|
|
|
// rawMessageEmpty reports whether raw, after trimming surrounding
// whitespace, is empty, the literal "{}", or the literal "null".
func rawMessageEmpty(raw json.RawMessage) bool {
	switch strings.TrimSpace(string(raw)) {
	case "", "{}", "null":
		return true
	default:
		return false
	}
}
|
|
|
|
// firstNonEmpty returns the first value that is non-empty after trimming
// surrounding whitespace, in its trimmed form. It returns "" when every value
// is blank or no values are given.
func firstNonEmpty(values ...string) string {
	for i := range values {
		if trimmed := strings.TrimSpace(values[i]); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
|
|
|
|
// meshRendezvousLeasePosture summarises the rendezvous leases held in mesh
// state, as computed by meshRendezvousLeasePostureForState.
type meshRendezvousLeasePosture struct {
	// RefreshNeeded is true when at least one lease needs a refresh and the
	// local node ID is known.
	RefreshNeeded bool
	// Reason is the refresh reason of the first lease that needs one, or
	// "none" when no lease does.
	Reason string
	// RefreshNeededCount is the number of leases that are invalid, expired,
	// due for renewal, or bound to a stale relay.
	RefreshNeededCount int
	// RenewalNeededCount counts usable leases that are due for renewal.
	RenewalNeededCount int
	// ExpiredCount counts valid leases whose expiry is not after the
	// observation time.
	ExpiredCount int
	// InvalidCount counts leases that fail base validation.
	InvalidCount int
	// StaleRelayCount counts usable leases flagged as using a stale relay
	// (and not already counted as needing renewal).
	StaleRelayCount int
}
|
|
|
|
func refreshRendezvousLeasesIfNeeded(ctx context.Context, cfg config.Config, identity state.Identity, api *client.Client, meshState *syntheticMeshState, observedAt time.Time) error {
|
|
if meshState == nil || meshState.PeerCache == nil {
|
|
return nil
|
|
}
|
|
observedAt = observedAt.UTC()
|
|
posture := meshRendezvousLeasePostureForState(meshState, identity, observedAt)
|
|
if !posture.RefreshNeeded {
|
|
return nil
|
|
}
|
|
if meshState.LastLeaseRefresh != nil && meshState.LastLeaseRefresh.AttemptedAt.Add(meshRendezvousLeaseRefreshBackoff).After(observedAt) {
|
|
return nil
|
|
}
|
|
|
|
refresh := &meshRendezvousLeaseRefreshState{
|
|
Status: "attempted",
|
|
Reason: posture.Reason,
|
|
AttemptedAt: observedAt,
|
|
PreviousLeaseCount: len(meshState.RendezvousLeases),
|
|
RefreshNeededCount: posture.RefreshNeededCount,
|
|
RenewalNeededCount: posture.RenewalNeededCount,
|
|
ExpiredCount: posture.ExpiredCount,
|
|
StaleRelayCount: posture.StaleRelayCount,
|
|
ConfigVersion: meshState.ConfigVersion,
|
|
}
|
|
meshState.LastLeaseRefresh = refresh
|
|
meshState.LeaseRefreshAttempts++
|
|
|
|
if api == nil || meshState.Source != "control_plane" || cfg.MeshSyntheticConfigPath != "" {
|
|
refresh.Status = "unsupported"
|
|
refresh.Error = "control_plane_synthetic_config_required"
|
|
refresh.CompletedAt = observedAt
|
|
meshState.LeaseRefreshFailures++
|
|
return nil
|
|
}
|
|
|
|
local := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID}
|
|
loadedConfig, err := loadSyntheticMeshConfig(ctx, cfg, identity, api)
|
|
completedAt := time.Now().UTC()
|
|
refresh.CompletedAt = completedAt
|
|
if err != nil {
|
|
refresh.Status = "failed"
|
|
refresh.Error = err.Error()
|
|
meshState.LeaseRefreshFailures++
|
|
return err
|
|
}
|
|
applyRefreshedSyntheticMeshConfig(ctx, cfg, identity, meshState, loadedConfig, local, cfg.MeshRegion, completedAt)
|
|
refresh.Status = "succeeded"
|
|
refresh.RefreshedLeaseCount = len(loadedConfig.RendezvousLeases)
|
|
refresh.ConfigVersion = loadedConfig.ConfigVersion
|
|
meshState.LeaseRefreshSuccesses++
|
|
log.Printf(
|
|
"mesh rendezvous lease refresh succeeded: reason=%s previous_leases=%d refreshed_leases=%d config_version=%s",
|
|
refresh.Reason,
|
|
refresh.PreviousLeaseCount,
|
|
refresh.RefreshedLeaseCount,
|
|
refresh.ConfigVersion,
|
|
)
|
|
return nil
|
|
}
|
|
|
|
func refreshSyntheticMeshConfigIfDue(ctx context.Context, cfg config.Config, identity state.Identity, api *client.Client, meshState *syntheticMeshState, observedAt time.Time) error {
|
|
if meshState == nil || meshState.PeerCache == nil {
|
|
return nil
|
|
}
|
|
observedAt = observedAt.UTC()
|
|
if !meshState.LastConfigRefreshAt.IsZero() && meshState.LastConfigRefreshAt.Add(meshSyntheticConfigRefreshInterval).After(observedAt) {
|
|
return nil
|
|
}
|
|
if api == nil || cfg.MeshSyntheticConfigPath != "" {
|
|
meshState.LastConfigRefreshAt = observedAt
|
|
return nil
|
|
}
|
|
if identity.NodeID == "" || identity.ClusterID == "" {
|
|
return nil
|
|
}
|
|
local := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID}
|
|
loadedConfig, err := loadSyntheticMeshConfig(ctx, cfg, identity, api)
|
|
completedAt := time.Now().UTC()
|
|
if err != nil {
|
|
meshState.LastConfigRefreshAt = observedAt
|
|
return err
|
|
}
|
|
previousVersion := meshState.ConfigVersion
|
|
applyRefreshedSyntheticMeshConfig(ctx, cfg, identity, meshState, loadedConfig, local, cfg.MeshRegion, completedAt)
|
|
log.Printf(
|
|
"mesh synthetic config refreshed: previous_config_version=%s refreshed_config_version=%s route_health_routes=%d",
|
|
previousVersion,
|
|
loadedConfig.ConfigVersion,
|
|
len(meshState.RouteHealthRoutes),
|
|
)
|
|
return nil
|
|
}
|
|
|
|
func refreshSyntheticMeshConfigForRouteHealthFeedback(ctx context.Context, cfg config.Config, identity state.Identity, api *client.Client, meshState *syntheticMeshState, trigger meshRouteHealthFeedbackTrigger, observedAt time.Time) error {
|
|
if meshState == nil || meshState.PeerCache == nil {
|
|
return nil
|
|
}
|
|
observedAt = observedAt.UTC()
|
|
if trigger.RouteID == "" {
|
|
return nil
|
|
}
|
|
if meshState.LastRouteHealthRefresh != nil && meshState.LastRouteHealthRefresh.AttemptedAt.Add(meshRouteHealthFeedbackRefreshBackoff).After(observedAt) {
|
|
meshState.RouteHealthRefreshSuppressed++
|
|
return nil
|
|
}
|
|
|
|
refresh := &meshRouteHealthFeedbackRefreshState{
|
|
Status: "attempted",
|
|
Reason: trigger.Reason,
|
|
AttemptedAt: observedAt,
|
|
RouteID: trigger.RouteID,
|
|
PeerNodeID: trigger.PeerNodeID,
|
|
SelectedRelayID: trigger.SelectedRelayID,
|
|
LinkStatus: trigger.LinkStatus,
|
|
FailureReason: trigger.FailureReason,
|
|
DriftDetected: trigger.DriftDetected,
|
|
PreviousConfigVersion: meshState.ConfigVersion,
|
|
PreviousRouteHealthRouteCount: len(meshState.RouteHealthRoutes),
|
|
}
|
|
meshState.LastRouteHealthRefresh = refresh
|
|
meshState.RouteHealthRefreshAttempts++
|
|
|
|
if api == nil || meshState.Source != "control_plane" || cfg.MeshSyntheticConfigPath != "" {
|
|
refresh.Status = "unsupported"
|
|
refresh.Error = "control_plane_synthetic_config_required"
|
|
refresh.CompletedAt = observedAt
|
|
meshState.RouteHealthRefreshFailures++
|
|
return nil
|
|
}
|
|
if identity.NodeID == "" || identity.ClusterID == "" {
|
|
refresh.Status = "unsupported"
|
|
refresh.Error = "approved_identity_required"
|
|
refresh.CompletedAt = observedAt
|
|
meshState.RouteHealthRefreshFailures++
|
|
return nil
|
|
}
|
|
|
|
local := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID}
|
|
loadedConfig, err := loadSyntheticMeshConfig(ctx, cfg, identity, api)
|
|
completedAt := time.Now().UTC()
|
|
refresh.CompletedAt = completedAt
|
|
if err != nil {
|
|
refresh.Status = "failed"
|
|
refresh.Error = err.Error()
|
|
meshState.RouteHealthRefreshFailures++
|
|
return err
|
|
}
|
|
applyRefreshedSyntheticMeshConfig(ctx, cfg, identity, meshState, loadedConfig, local, cfg.MeshRegion, completedAt)
|
|
refresh.Status = "succeeded"
|
|
refresh.RefreshedConfigVersion = loadedConfig.ConfigVersion
|
|
refresh.RefreshedRouteHealthRouteCount = len(meshState.RouteHealthRoutes)
|
|
meshState.RouteHealthRefreshSuccesses++
|
|
log.Printf(
|
|
"mesh route-health feedback refresh succeeded: reason=%s route_id=%s previous_config_version=%s refreshed_config_version=%s route_health_routes=%d",
|
|
refresh.Reason,
|
|
refresh.RouteID,
|
|
refresh.PreviousConfigVersion,
|
|
refresh.RefreshedConfigVersion,
|
|
refresh.RefreshedRouteHealthRouteCount,
|
|
)
|
|
return nil
|
|
}
|
|
|
|
func applyRefreshedSyntheticMeshConfig(ctx context.Context, cfg config.Config, identity state.Identity, meshState *syntheticMeshState, loadedConfig loadedSyntheticMeshConfig, local mesh.PeerIdentity, preferredRegion string, observedAt time.Time) {
|
|
routeHealthRoutes := routeHealthRoutesFromPathDecisions(loadedConfig.Routes, loadedConfig.RoutePathDecisions)
|
|
peerCache := mesh.NewPeerCache(mesh.PeerCacheConfig{
|
|
Local: local,
|
|
PeerEndpoints: loadedConfig.PeerEndpoints,
|
|
PeerEndpointCandidates: loadedConfig.PeerEndpointCandidates,
|
|
PeerEndpointObservations: loadedConfig.PeerEndpointObservations,
|
|
PeerDirectory: loadedConfig.PeerDirectory,
|
|
RecoverySeeds: loadedConfig.RecoverySeeds,
|
|
RendezvousLeases: loadedConfig.RendezvousLeases,
|
|
Routes: loadedConfig.Routes,
|
|
WarmPeerLimit: mesh.DefaultWarmPeerLimit,
|
|
PreferredRegion: preferredRegion,
|
|
Now: observedAt,
|
|
})
|
|
if meshState.PeerConnections == nil {
|
|
meshState.PeerConnections = mesh.NewPeerConnectionTracker(peerCache.Snapshot(), observedAt)
|
|
}
|
|
peerConnectionSnapshot := meshState.PeerConnections.Snapshot()
|
|
peerRecoveryPlan := mesh.PlanPeerRecovery(mesh.PeerRecoveryPlanConfig{
|
|
PeerCache: peerCache.Snapshot(),
|
|
Connections: peerConnectionSnapshot,
|
|
TargetReadyPeers: mesh.DefaultStablePeerTarget,
|
|
MaxProbeCandidates: mesh.DefaultRecoveryProbeLimit,
|
|
Now: observedAt,
|
|
})
|
|
peerConnectionIntentPlan := mesh.PlanPeerConnectionIntents(mesh.PeerConnectionIntentPlanConfig{
|
|
PeerCache: peerCache.Snapshot(),
|
|
RecoveryPlan: peerRecoveryPlan,
|
|
RendezvousLeases: loadedConfig.RendezvousLeases,
|
|
Now: observedAt,
|
|
})
|
|
if meshState.PeerConnectionManager == nil {
|
|
meshState.PeerConnectionManager = mesh.NewPeerConnectionManager(mesh.PeerConnectionManagerConfig{
|
|
Local: local,
|
|
PeerCache: peerCache,
|
|
Tracker: meshState.PeerConnections,
|
|
RendezvousLeases: loadedConfig.RendezvousLeases,
|
|
})
|
|
} else {
|
|
meshState.PeerConnectionManager.UpdatePeerConfig(peerCache, loadedConfig.RendezvousLeases)
|
|
}
|
|
if meshState.Runtime != nil {
|
|
meshState.Runtime.UpdateConfig(loadedConfig.Routes, mesh.NewHTTPPeerTransport(loadedConfig.PeerEndpoints))
|
|
meshState.Runtime.UpdateRouteHealthConfig(routeHealthRoutes)
|
|
}
|
|
if meshState.RouteGenerationTracker == nil {
|
|
meshState.RouteGenerationTracker = newMeshRouteGenerationTracker(loadedConfig.RoutePathDecisions, observedAt)
|
|
} else {
|
|
meshState.RouteGenerationTracker.Apply(loadedConfig.RoutePathDecisions, observedAt)
|
|
}
|
|
productionForwardingEnabled := cfg.MeshProductionForwardingEnabled || loadedConfig.ProductionForwarding
|
|
meshState.ProductionForwardingEnabled = productionForwardingEnabled
|
|
if (!sameStringMap(meshState.PeerEndpoints, loadedConfig.PeerEndpoints) || !samePeerEndpointCandidatesMap(meshState.PeerEndpointCandidates, loadedConfig.PeerEndpointCandidates)) && meshState.VPNFabricSessionPeers != nil {
|
|
_ = meshState.VPNFabricSessionPeers.Close()
|
|
if meshState.VPNFabricQUICTransport != nil {
|
|
_ = meshState.VPNFabricQUICTransport.Close()
|
|
}
|
|
meshState.VPNFabricSessionPeers = mesh.NewFabricSessionPeerManager()
|
|
meshState.VPNFabricTransport = mesh.NewWebSocketFabricTransport(meshState.VPNFabricSessionPeers)
|
|
meshState.VPNFabricQUICTransport = newVPNFabricQUICTransport(cfg)
|
|
}
|
|
if meshState.VPNFabricSessionPeers == nil {
|
|
meshState.VPNFabricSessionPeers = mesh.NewFabricSessionPeerManager()
|
|
}
|
|
if meshState.VPNFabricTransport == nil {
|
|
meshState.VPNFabricTransport = mesh.NewWebSocketFabricTransport(meshState.VPNFabricSessionPeers)
|
|
}
|
|
if meshState.VPNFabricQUICTransport == nil {
|
|
meshState.VPNFabricQUICTransport = newVPNFabricQUICTransport(cfg)
|
|
} else if cfg.VPNFabricQUICMaxStreamsPerConn > 0 {
|
|
meshState.VPNFabricQUICTransport.MaxStreamsPerConn = cfg.VPNFabricQUICMaxStreamsPerConn
|
|
}
|
|
if meshState.VPNFabricQUICTransport != nil && cfg.VPNFabricQUICIdleTTL > 0 {
|
|
meshState.VPNFabricQUICTransport.IdleTTL = cfg.VPNFabricQUICIdleTTL
|
|
}
|
|
if meshState.VPNFabricSessionDialStats == nil {
|
|
meshState.VPNFabricSessionDialStats = newVPNFabricSessionDialStats()
|
|
}
|
|
if meshState.VPNFabricEndpointObservations == nil {
|
|
meshState.VPNFabricEndpointObservations = newVPNFabricEndpointObservationStore(identity.NodeID)
|
|
}
|
|
meshState.PeerEndpoints = copyStringMap(loadedConfig.PeerEndpoints)
|
|
meshState.PeerEndpointCandidates = copyPeerEndpointCandidatesMap(loadedConfig.PeerEndpointCandidates)
|
|
meshState.PeerEndpointObservations = copyEndpointCandidateObservations(loadedConfig.PeerEndpointObservations)
|
|
if productionForwardingEnabled {
|
|
meshState.ProductionForwardTransport = mesh.NewHTTPProductionForwardTransport(loadedConfig.PeerEndpoints)
|
|
} else {
|
|
meshState.ProductionForwardTransport = nil
|
|
}
|
|
vpnFabricIngress := newVPNFabricIngress(meshState, identity, loadedConfig.Routes, loadedConfig.RoutePathDecisions, loadedConfig.ServiceChannelRemediationCommands, loadedConfig.ServiceChannelFeedback, loadedConfig.ServiceChannelAdaptivePolicy, loadedConfig.ConfigVersion, meshState.VPNGateway)
|
|
meshState.VPNFabricIngress = vpnFabricIngress
|
|
if meshState.ServiceChannelAccessStats == nil {
|
|
meshState.ServiceChannelAccessStats = newFabricServiceChannelAccessStats()
|
|
}
|
|
if meshState.RemoteWorkspaceFrameSink == nil {
|
|
meshState.RemoteWorkspaceFrameSink = mesh.NewRemoteWorkspaceFrameProbeSink()
|
|
}
|
|
nextListenerHandler := mesh.Server{
|
|
Local: local,
|
|
SyntheticRuntime: meshState.Runtime,
|
|
ProductionForwardingEnabled: productionForwardingEnabled,
|
|
ProductionEnvelopeDelivery: func() mesh.ProductionEnvelopeDelivery {
|
|
if meshState.VPNFabricInbox == nil {
|
|
return nil
|
|
}
|
|
return meshState.VPNFabricInbox.DeliverProductionEnvelope
|
|
}(),
|
|
ProductionForwardTransport: meshState.ProductionForwardTransport,
|
|
ProductionForwardLogger: func(entry mesh.ProductionForwardLogEntry) {
|
|
payload, err := json.Marshal(entry)
|
|
if err != nil {
|
|
log.Printf("mesh production forward event marshal failed: %v", err)
|
|
return
|
|
}
|
|
log.Printf("mesh_production_forward_event=%s", string(payload))
|
|
},
|
|
FabricServiceChannelLogger: func(entry mesh.FabricServiceChannelAccessLogEntry) {
|
|
meshState.ServiceChannelAccessStats.Observe(entry)
|
|
payload, err := json.Marshal(entry)
|
|
if err != nil {
|
|
log.Printf("fabric service channel access event marshal failed: %v", err)
|
|
return
|
|
}
|
|
log.Printf("fabric_service_channel_access_event=%s", string(payload))
|
|
},
|
|
FabricSessionEnabled: cfg.MeshFabricSessionEnabled,
|
|
FabricSessionLogger: func(entry mesh.FabricSessionEventLogEntry) {
|
|
payload, err := json.Marshal(entry)
|
|
if err != nil {
|
|
log.Printf("fabric session event marshal failed: %v", err)
|
|
return
|
|
}
|
|
log.Printf("fabric_session_event=%s", string(payload))
|
|
},
|
|
RemoteWorkspaceFrameSink: meshState.RemoteWorkspaceFrameSink,
|
|
ProductionRoutes: loadedConfig.Routes,
|
|
VPNPacketIngress: vpnFabricIngress,
|
|
BackendProxyBaseURL: cfg.BackendURL,
|
|
ClusterAuthorityPublicKey: firstNonEmpty(identity.ClusterAuthorityPublicKey, cfg.ClusterAuthorityPublicKey),
|
|
}.Handler()
|
|
if meshState.ListenerHandler == nil {
|
|
meshState.ListenerHandler = newDynamicHTTPHandler(nextListenerHandler)
|
|
} else {
|
|
meshState.ListenerHandler.Update(nextListenerHandler)
|
|
}
|
|
applyQUICFabricConfigIfChanged(ctx, cfg, identity, meshState)
|
|
applyMeshListenerConfigIfChanged(ctx, cfg, identity, meshState, loadedConfig, observedAt)
|
|
meshState.Routes = loadedConfig.Routes
|
|
meshState.RouteHealthRoutes = routeHealthRoutes
|
|
meshState.Source = loadedConfig.Source
|
|
meshState.PeerCache = peerCache
|
|
meshState.RendezvousLeases = loadedConfig.RendezvousLeases
|
|
meshState.RoutePathDecisions = loadedConfig.RoutePathDecisions
|
|
meshState.ServiceChannelFeedback = loadedConfig.ServiceChannelFeedback
|
|
meshState.ServiceChannelRemediationCommands = append([]client.FabricServiceChannelRemediationCommand{}, loadedConfig.ServiceChannelRemediationCommands...)
|
|
meshState.ConfigVersion = loadedConfig.ConfigVersion
|
|
meshState.PeerDirectoryVersion = loadedConfig.PeerDirectoryVersion
|
|
meshState.PolicyVersion = loadedConfig.PolicyVersion
|
|
meshState.ConfigLoadError = ""
|
|
meshState.LastConfigRefreshAt = observedAt
|
|
meshState.LastPeerRecoveryPlan = &peerRecoveryPlan
|
|
meshState.LastPeerConnectionIntent = &peerConnectionIntentPlan
|
|
}
|
|
|
|
func applyMeshListenerConfigIfChanged(ctx context.Context, base config.Config, identity state.Identity, meshState *syntheticMeshState, loadedConfig loadedSyntheticMeshConfig, observedAt time.Time) {
|
|
if meshState == nil || meshState.ListenerHandler == nil {
|
|
return
|
|
}
|
|
nextCfg := meshListenerRuntimeConfig(base, loadedConfig.MeshListener)
|
|
nextKey := meshListenerConfigKey(nextCfg)
|
|
if nextKey == meshState.ListenerConfigKey {
|
|
return
|
|
}
|
|
if meshState.StopListener != nil {
|
|
meshState.StopListener()
|
|
}
|
|
gateEnabled, runtimeEnabled := productionForwardingLogState(nextCfg, loadedConfig.ProductionForwarding)
|
|
report, stop := startSyntheticMeshHTTPServer(ctx, nextCfg, identity, meshState.ListenerHandler, len(loadedConfig.PeerEndpoints), len(loadedConfig.Routes), gateEnabled, runtimeEnabled)
|
|
meshState.ListenerReport = report
|
|
meshState.ListenerConfigKey = nextKey
|
|
meshState.ListenerRuntimeConfig = nextCfg
|
|
meshState.StopListener = stop
|
|
log.Printf(
|
|
"mesh listener config applied: mode=%s listen_addr=%s status=%s config_version=%s observed_at=%s",
|
|
nextCfg.MeshListenPortMode,
|
|
nextCfg.MeshListenAddr,
|
|
report.Status,
|
|
loadedConfig.ConfigVersion,
|
|
observedAt.Format(time.RFC3339Nano),
|
|
)
|
|
}
|
|
|
|
func applyQUICFabricConfigIfChanged(ctx context.Context, cfg config.Config, identity state.Identity, meshState *syntheticMeshState) {
|
|
if meshState == nil {
|
|
return
|
|
}
|
|
desiredAddr := strings.TrimSpace(cfg.MeshQUICFabricListenAddr)
|
|
if meshState.QUICFabricServer != nil && (!cfg.MeshQUICFabricEnabled || meshState.QUICFabricListenAddr != desiredAddr) {
|
|
_ = meshState.QUICFabricServer.Close()
|
|
meshState.QUICFabricServer = nil
|
|
meshState.QUICFabricListenAddr = ""
|
|
meshState.QUICFabricCertSHA256 = ""
|
|
}
|
|
if !cfg.MeshQUICFabricEnabled {
|
|
meshState.QUICFabricError = ""
|
|
meshState.QUICFabricCertSHA256 = ""
|
|
return
|
|
}
|
|
if meshState.QUICFabricServer != nil {
|
|
return
|
|
}
|
|
server, addr, certSHA256, err := startQUICFabricEndpoint(ctx, cfg, identity)
|
|
meshState.QUICFabricServer = server
|
|
meshState.QUICFabricListenAddr = addr
|
|
meshState.QUICFabricCertSHA256 = certSHA256
|
|
meshState.QUICFabricError = errorString(err)
|
|
if err != nil {
|
|
log.Printf("quic fabric endpoint unavailable: listen_addr=%s node_id=%s cluster_id=%s err=%v", cfg.MeshQUICFabricListenAddr, identity.NodeID, identity.ClusterID, err)
|
|
}
|
|
}
|
|
|
|
func meshRendezvousLeasePostureForState(meshState *syntheticMeshState, identity state.Identity, observedAt time.Time) meshRendezvousLeasePosture {
|
|
posture := meshRendezvousLeasePosture{}
|
|
if meshState == nil {
|
|
return posture
|
|
}
|
|
connectionByPeer := meshRendezvousConnectionsByPeer(meshState)
|
|
for _, lease := range meshState.RendezvousLeases {
|
|
valid := meshRendezvousLeaseBaseValid(lease)
|
|
expired := valid && !lease.ExpiresAt.After(observedAt)
|
|
usable := valid && !expired
|
|
renewalNeeded := meshRendezvousLeaseRenewalNeeded(lease, observedAt, usable)
|
|
staleRelay := usable && meshRendezvousLeaseStaleRelay(lease, connectionByPeer[lease.PeerNodeID])
|
|
switch {
|
|
case !valid:
|
|
posture.InvalidCount++
|
|
case expired:
|
|
posture.ExpiredCount++
|
|
case renewalNeeded:
|
|
posture.RenewalNeededCount++
|
|
case staleRelay:
|
|
posture.StaleRelayCount++
|
|
}
|
|
if !valid || expired || renewalNeeded || staleRelay {
|
|
posture.RefreshNeededCount++
|
|
if posture.Reason == "" {
|
|
posture.Reason = meshRendezvousLeaseRefreshReason(valid, expired, renewalNeeded, staleRelay)
|
|
}
|
|
}
|
|
}
|
|
posture.RefreshNeeded = posture.RefreshNeededCount > 0 && identity.NodeID != ""
|
|
if posture.Reason == "" {
|
|
posture.Reason = "none"
|
|
}
|
|
return posture
|
|
}
|
|
|
|
// meshRendezvousLeaseRefreshReason maps a lease's condition flags to a reason
// string. The first matching condition wins, in this order: "invalid_lease"
// (not valid), "expired_lease", "stale_relay", "renewal_needed"; otherwise
// "none".
func meshRendezvousLeaseRefreshReason(valid bool, expired bool, renewalNeeded bool, staleRelay bool) string {
	if !valid {
		return "invalid_lease"
	}
	if expired {
		return "expired_lease"
	}
	if staleRelay {
		return "stale_relay"
	}
	if renewalNeeded {
		return "renewal_needed"
	}
	return "none"
}
|
|
|
|
func syntheticRoutesFromControlPlane(routes []client.SyntheticMeshRouteConfig) []mesh.SyntheticRoute {
|
|
out := make([]mesh.SyntheticRoute, 0, len(routes))
|
|
for _, route := range routes {
|
|
out = append(out, mesh.SyntheticRoute{
|
|
RouteID: route.RouteID,
|
|
ClusterID: route.ClusterID,
|
|
SourceNodeID: route.SourceNodeID,
|
|
DestinationNodeID: route.DestinationNodeID,
|
|
Hops: route.Hops,
|
|
AllowedChannels: route.AllowedChannels,
|
|
ExpiresAt: route.ExpiresAt,
|
|
MaxTTL: route.MaxTTL,
|
|
MaxHops: route.MaxHops,
|
|
RouteVersion: route.RouteVersion,
|
|
PolicyVersion: route.PolicyVersion,
|
|
PeerDirectoryVersion: route.PeerDirectoryVersion,
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func routeHealthRoutesFromPathDecisions(routes []mesh.SyntheticRoute, report *client.RoutePathDecisionReport) []mesh.SyntheticRoute {
|
|
out := make([]mesh.SyntheticRoute, 0, len(routes))
|
|
routeIndex := map[string]int{}
|
|
for _, route := range routes {
|
|
if route.RouteID == "" {
|
|
continue
|
|
}
|
|
routeIndex[route.RouteID] = len(out)
|
|
out = append(out, cloneSyntheticRoute(route))
|
|
}
|
|
if report == nil {
|
|
return out
|
|
}
|
|
for _, decision := range report.Decisions {
|
|
if strings.TrimSpace(decision.RouteID) == "" || decision.ProductionForwarding || !decision.ControlPlaneOnly {
|
|
continue
|
|
}
|
|
hops := cleanNodePath(decision.EffectiveHops)
|
|
if len(hops) < 2 {
|
|
continue
|
|
}
|
|
route, ok := mesh.SyntheticRoute{}, false
|
|
if index, exists := routeIndex[decision.RouteID]; exists {
|
|
route = out[index]
|
|
ok = true
|
|
}
|
|
if !ok {
|
|
route = mesh.SyntheticRoute{
|
|
RouteID: decision.RouteID,
|
|
ClusterID: decision.ClusterID,
|
|
AllowedChannels: []string{mesh.SyntheticChannelFabricControl, mesh.SyntheticChannelRouteControl},
|
|
}
|
|
routeIndex[decision.RouteID] = len(out)
|
|
out = append(out, route)
|
|
}
|
|
route.Hops = hops
|
|
route.SourceNodeID = defaultString(decision.SourceNodeID, hops[0])
|
|
route.DestinationNodeID = defaultString(decision.DestinationNodeID, hops[len(hops)-1])
|
|
route.ClusterID = defaultString(decision.ClusterID, route.ClusterID)
|
|
if !decision.ExpiresAt.IsZero() {
|
|
route.ExpiresAt = decision.ExpiresAt
|
|
}
|
|
if strings.TrimSpace(decision.Generation) != "" {
|
|
route.RouteVersion = strings.TrimSpace(decision.Generation)
|
|
}
|
|
if route.MaxTTL < len(hops) {
|
|
route.MaxTTL = len(hops)
|
|
}
|
|
if route.MaxHops < len(hops)-1 {
|
|
route.MaxHops = len(hops) - 1
|
|
}
|
|
out[routeIndex[decision.RouteID]] = route
|
|
}
|
|
return out
|
|
}
|
|
|
|
func cloneSyntheticRoute(route mesh.SyntheticRoute) mesh.SyntheticRoute {
|
|
route.Hops = append([]string{}, route.Hops...)
|
|
route.AllowedChannels = append([]string{}, route.AllowedChannels...)
|
|
return route
|
|
}
|
|
|
|
// cleanNodePath trims surrounding whitespace from every entry of items and
// drops entries that are empty after trimming. Order is preserved and the
// result is always non-nil.
func cleanNodePath(items []string) []string {
	cleaned := make([]string, 0, len(items))
	for i := range items {
		trimmed := strings.TrimSpace(items[i])
		if trimmed == "" {
			continue
		}
		cleaned = append(cleaned, trimmed)
	}
	return cleaned
}
|
|
|
|
func peerEndpointCandidatesFromControlPlane(candidates map[string][]client.PeerEndpointCandidate) map[string][]mesh.PeerEndpointCandidate {
|
|
out := make(map[string][]mesh.PeerEndpointCandidate, len(candidates))
|
|
for nodeID, items := range candidates {
|
|
for _, item := range items {
|
|
out[nodeID] = append(out[nodeID], mesh.PeerEndpointCandidate{
|
|
EndpointID: item.EndpointID,
|
|
NodeID: item.NodeID,
|
|
Transport: item.Transport,
|
|
Address: item.Address,
|
|
AddressFamily: item.AddressFamily,
|
|
Reachability: item.Reachability,
|
|
NATType: item.NATType,
|
|
ConnectivityMode: item.ConnectivityMode,
|
|
Region: item.Region,
|
|
Priority: item.Priority,
|
|
PolicyTags: item.PolicyTags,
|
|
LastVerifiedAt: item.LastVerifiedAt,
|
|
Metadata: item.Metadata,
|
|
})
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func endpointCandidateObservationsFromControlPlane(observations map[string]client.EndpointCandidateHealthObservation) map[string]mesh.EndpointCandidateHealthObservation {
|
|
out := make(map[string]mesh.EndpointCandidateHealthObservation, len(observations))
|
|
for endpointID, item := range observations {
|
|
endpointID = strings.TrimSpace(endpointID)
|
|
if endpointID == "" {
|
|
continue
|
|
}
|
|
out[endpointID] = mesh.EndpointCandidateHealthObservation{
|
|
EndpointID: firstNonEmpty(strings.TrimSpace(item.EndpointID), endpointID),
|
|
Source: item.Source,
|
|
ReporterNodeID: item.ReporterNodeID,
|
|
LastLatencyMs: item.LastLatencyMs,
|
|
SuccessCount: item.SuccessCount,
|
|
FailureCount: item.FailureCount,
|
|
LastFailureReason: item.LastFailureReason,
|
|
ReliabilityScore: item.ReliabilityScore,
|
|
ObservedAt: item.ObservedAt,
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func peerDirectoryFromControlPlane(entries []client.PeerDirectoryEntry) []mesh.PeerDirectoryEntry {
|
|
out := make([]mesh.PeerDirectoryEntry, 0, len(entries))
|
|
for _, item := range entries {
|
|
out = append(out, mesh.PeerDirectoryEntry{
|
|
NodeID: item.NodeID,
|
|
RouteIDs: item.RouteIDs,
|
|
EndpointCount: item.EndpointCount,
|
|
CandidateCount: item.CandidateCount,
|
|
ConnectivityModes: item.ConnectivityModes,
|
|
RecoverySeed: item.RecoverySeed,
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func recoverySeedsFromControlPlane(seeds []client.PeerRecoverySeed) []mesh.PeerRecoverySeed {
|
|
out := make([]mesh.PeerRecoverySeed, 0, len(seeds))
|
|
for _, item := range seeds {
|
|
out = append(out, mesh.PeerRecoverySeed{
|
|
NodeID: item.NodeID,
|
|
Endpoint: item.Endpoint,
|
|
Transport: item.Transport,
|
|
ConnectivityMode: item.ConnectivityMode,
|
|
Region: item.Region,
|
|
Priority: item.Priority,
|
|
LastVerifiedAt: item.LastVerifiedAt,
|
|
Metadata: item.Metadata,
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func rendezvousLeasesFromControlPlane(leases []client.PeerRendezvousLease) []mesh.PeerRendezvousLease {
|
|
out := make([]mesh.PeerRendezvousLease, 0, len(leases))
|
|
for _, item := range leases {
|
|
out = append(out, mesh.PeerRendezvousLease{
|
|
LeaseID: item.LeaseID,
|
|
PeerNodeID: item.PeerNodeID,
|
|
RelayNodeID: item.RelayNodeID,
|
|
RelayEndpoint: item.RelayEndpoint,
|
|
Transport: item.Transport,
|
|
ConnectivityMode: item.ConnectivityMode,
|
|
RouteIDs: item.RouteIDs,
|
|
AllowedChannels: item.AllowedChannels,
|
|
Priority: item.Priority,
|
|
ControlPlaneOnly: item.ControlPlaneOnly,
|
|
IssuedAt: item.IssuedAt,
|
|
ExpiresAt: item.ExpiresAt,
|
|
Reason: item.Reason,
|
|
Metadata: item.Metadata,
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
// parseMeshPeerEndpoints decodes raw as a JSON object mapping string keys to
// endpoint strings.
//
// An empty raw string yields an empty map. A JSON `null` document also yields
// an empty map: encoding/json sets the destination map to nil for null, and
// returning that nil map would be inconsistent with the empty-input case and
// would panic on the first write by a caller. Any decoding failure is wrapped
// and returned with a nil map.
func parseMeshPeerEndpoints(raw string) (map[string]string, error) {
	if raw == "" {
		return map[string]string{}, nil
	}
	var peerEndpoints map[string]string
	if err := json.Unmarshal([]byte(raw), &peerEndpoints); err != nil {
		return nil, fmt.Errorf("parse synthetic mesh peer endpoints: %w", err)
	}
	if peerEndpoints == nil {
		// Only reachable for a literal JSON null; normalise to the same
		// shape as the empty-input case.
		peerEndpoints = map[string]string{}
	}
	return peerEndpoints, nil
}
|
|
|
|
func parseMeshSyntheticRoutes(raw string) ([]mesh.SyntheticRoute, error) {
|
|
if raw == "" {
|
|
return nil, nil
|
|
}
|
|
var routes []mesh.SyntheticRoute
|
|
if err := json.Unmarshal([]byte(raw), &routes); err != nil {
|
|
return nil, fmt.Errorf("parse synthetic mesh routes: %w", err)
|
|
}
|
|
return routes, nil
|
|
}
|
|
|
|
func reportSyntheticRouteHealth(ctx context.Context, cfg config.Config, api *client.Client, identity state.Identity, meshState *syntheticMeshState) error {
|
|
if meshState == nil || meshState.Runtime == nil || api == nil {
|
|
return nil
|
|
}
|
|
routes := meshState.RouteHealthRoutes
|
|
if len(routes) == 0 {
|
|
routes = meshState.Routes
|
|
}
|
|
decisionsByRoute := routePathDecisionsByRoute(meshState.RoutePathDecisions)
|
|
var refreshTrigger *meshRouteHealthFeedbackTrigger
|
|
for _, route := range routes {
|
|
if route.SourceNodeID != identity.NodeID {
|
|
continue
|
|
}
|
|
decision, decisionApplied := decisionsByRoute[route.RouteID]
|
|
probeCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
|
result, err := meshState.Runtime.SendRouteHealthProbe(probeCtx, route.RouteID, mesh.SyntheticChannelFabricControl, "route-health-"+route.RouteID)
|
|
cancel()
|
|
if err != nil {
|
|
metadata := routeHealthObservationMetadata(meshState, route, decision, decisionApplied, nil)
|
|
metadata["failure_reason"] = err.Error()
|
|
if reportErr := api.ReportMeshLink(ctx, identity.ClusterID, client.MeshLinkObservationRequest{
|
|
SourceNodeID: identity.NodeID,
|
|
TargetNodeID: route.DestinationNodeID,
|
|
LinkStatus: "unreachable",
|
|
Metadata: metadata,
|
|
}); reportErr != nil {
|
|
return reportErr
|
|
}
|
|
if trigger, ok := routeHealthFeedbackTriggerFromObservation(route, decision, decisionApplied, "unreachable", metadata, time.Now().UTC()); ok && refreshTrigger == nil {
|
|
refreshTrigger = &trigger
|
|
}
|
|
continue
|
|
}
|
|
latency := int(result.Observation.LastLatencyMs)
|
|
qualityScore := syntheticQualityScore(latency)
|
|
ackPath := routeHealthAckPath(result.Ack)
|
|
metadata := routeHealthObservationMetadata(meshState, route, decision, decisionApplied, ackPath)
|
|
metadata["selected_route_id"] = result.SelectedRouteID
|
|
metadata["fallback_used"] = result.FallbackUsed
|
|
metadata["route_version"] = result.Observation.RouteVersion
|
|
metadata["policy_version"] = result.Observation.PolicyVersion
|
|
metadata["peer_directory_version"] = result.Observation.PeerDirectoryVersion
|
|
metadata["synthetic_message_type"] = result.Ack.MessageType
|
|
if err := api.ReportMeshLink(ctx, identity.ClusterID, client.MeshLinkObservationRequest{
|
|
SourceNodeID: identity.NodeID,
|
|
TargetNodeID: route.DestinationNodeID,
|
|
LinkStatus: "reachable",
|
|
LatencyMs: &latency,
|
|
QualityScore: &qualityScore,
|
|
Metadata: metadata,
|
|
}); err != nil {
|
|
return err
|
|
}
|
|
if trigger, ok := routeHealthFeedbackTriggerFromObservation(route, decision, decisionApplied, "reachable", metadata, time.Now().UTC()); ok && refreshTrigger == nil {
|
|
refreshTrigger = &trigger
|
|
}
|
|
}
|
|
if refreshTrigger != nil {
|
|
return refreshSyntheticMeshConfigForRouteHealthFeedback(ctx, cfg, identity, api, meshState, *refreshTrigger, time.Now().UTC())
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func routePathDecisionsByRoute(report *client.RoutePathDecisionReport) map[string]client.RoutePathDecision {
|
|
out := map[string]client.RoutePathDecision{}
|
|
if report == nil {
|
|
return out
|
|
}
|
|
for _, decision := range report.Decisions {
|
|
if strings.TrimSpace(decision.RouteID) == "" {
|
|
continue
|
|
}
|
|
previous, exists := out[decision.RouteID]
|
|
if !exists || (previous.DecisionSource != "stale_relay_replacement" && decision.DecisionSource == "stale_relay_replacement") {
|
|
out[decision.RouteID] = decision
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func routeHealthObservationMetadata(meshState *syntheticMeshState, route mesh.SyntheticRoute, decision client.RoutePathDecision, decisionApplied bool, ackPath []string) map[string]any {
|
|
driftDetected := false
|
|
if len(ackPath) > 0 {
|
|
driftDetected = !sameStringSlice(ackPath, route.Hops)
|
|
}
|
|
metadata := map[string]any{
|
|
"stage": "c17z20",
|
|
"traffic_forwarding": false,
|
|
"production_forwarding": false,
|
|
"production_payload_forwarding": false,
|
|
"service_workload_traffic": false,
|
|
"observation_type": "synthetic_route_health",
|
|
"route_id": route.RouteID,
|
|
"config_source": meshState.Source,
|
|
"route_health_route_config_contract": "control_plane_route_path_decisions_to_synthetic_route_health",
|
|
"route_health_only": true,
|
|
"synthetic_route_health_route_path_runtime": true,
|
|
"production_route_path_forwarding_runtime": false,
|
|
"route_path_decision_applied": decisionApplied,
|
|
"expected_effective_hops": append([]string{}, route.Hops...),
|
|
"observed_ack_path": append([]string{}, ackPath...),
|
|
"route_path_drift_detected": driftDetected,
|
|
"control_plane_only": true,
|
|
"route_health_service_payload_forwarding": false,
|
|
"route_health_production_payload_forwarding": false,
|
|
}
|
|
if decisionApplied {
|
|
metadata["route_path_decision_id"] = decision.DecisionID
|
|
metadata["route_path_decision_generation"] = decision.Generation
|
|
metadata["route_path_decision_source"] = decision.DecisionSource
|
|
metadata["route_path_decision_next_hop_id"] = decision.NextHopID
|
|
metadata["route_path_decision_selected_relay_id"] = decision.SelectedRelayID
|
|
metadata["route_path_decision_stale_relay_node_id"] = decision.StaleRelayNodeID
|
|
metadata["route_path_decision_rendezvous_peer_node_id"] = decision.RendezvousPeerNodeID
|
|
metadata["route_path_decision_rendezvous_lease_id"] = decision.RendezvousLeaseID
|
|
metadata["route_path_decision_rendezvous_lease_reason"] = decision.RendezvousLeaseReason
|
|
metadata["route_path_decision_effective_hops"] = append([]string{}, decision.EffectiveHops...)
|
|
metadata["route_path_decision_original_hops"] = append([]string{}, decision.OriginalHops...)
|
|
}
|
|
return metadata
|
|
}
|
|
|
|
func routeHealthFeedbackTriggerFromObservation(route mesh.SyntheticRoute, decision client.RoutePathDecision, decisionApplied bool, linkStatus string, metadata map[string]any, observedAt time.Time) (meshRouteHealthFeedbackTrigger, bool) {
|
|
if strings.TrimSpace(route.RouteID) == "" {
|
|
return meshRouteHealthFeedbackTrigger{}, false
|
|
}
|
|
linkStatus = strings.TrimSpace(linkStatus)
|
|
failureReason, _ := metadata["failure_reason"].(string)
|
|
driftDetected, _ := metadata["route_path_drift_detected"].(bool)
|
|
reason := ""
|
|
switch {
|
|
case strings.TrimSpace(failureReason) != "":
|
|
reason = "synthetic_route_health_failure"
|
|
case linkStatus != "" && linkStatus != "reachable":
|
|
reason = "synthetic_route_health_unreachable"
|
|
case driftDetected:
|
|
reason = "synthetic_route_health_drift"
|
|
default:
|
|
return meshRouteHealthFeedbackTrigger{}, false
|
|
}
|
|
trigger := meshRouteHealthFeedbackTrigger{
|
|
Reason: reason,
|
|
RouteID: route.RouteID,
|
|
PeerNodeID: route.DestinationNodeID,
|
|
LinkStatus: linkStatus,
|
|
FailureReason: failureReason,
|
|
DriftDetected: driftDetected,
|
|
ObservedAt: observedAt.UTC(),
|
|
}
|
|
if decisionApplied {
|
|
if decision.RendezvousPeerNodeID != "" {
|
|
trigger.PeerNodeID = decision.RendezvousPeerNodeID
|
|
}
|
|
trigger.SelectedRelayID = decision.SelectedRelayID
|
|
}
|
|
return trigger, true
|
|
}
|
|
|
|
func routeHealthAckPath(ack mesh.SyntheticEnvelope) []string {
|
|
if len(ack.Payload) == 0 {
|
|
return nil
|
|
}
|
|
var payload mesh.SyntheticProbeAckPayload
|
|
if err := json.Unmarshal(ack.Payload, &payload); err != nil {
|
|
return nil
|
|
}
|
|
return append([]string{}, payload.Path...)
|
|
}
|
|
|
|
func probeWarmPeerHealth(ctx context.Context, api *client.Client, identity state.Identity, meshState *syntheticMeshState) error {
|
|
if meshState == nil || meshState.PeerCache == nil {
|
|
return nil
|
|
}
|
|
if meshState.PeerConnectionManager != nil {
|
|
cycle := meshState.PeerConnectionManager.ProbeOnce(ctx)
|
|
meshState.LastPeerRecoveryPlan = &cycle.RecoveryPlan
|
|
meshState.LastPeerConnectionIntent = &cycle.IntentPlan
|
|
for _, result := range cycle.Results {
|
|
metadata := map[string]any{
|
|
"stage": "c17z20",
|
|
"traffic_forwarding": false,
|
|
"observation_type": "peer_connection_manager",
|
|
"config_source": meshState.Source,
|
|
"manager_probe_status": result.LinkStatus,
|
|
"manager_mode": cycle.Mode,
|
|
"manager_attempted": cycle.Attempted,
|
|
"manager_succeeded": cycle.Succeeded,
|
|
"manager_failed": cycle.Failed,
|
|
"manager_deferred": cycle.Deferred,
|
|
"manager_rendezvous_required": cycle.RendezvousRequiredCount,
|
|
"manager_rendezvous_resolved": cycle.RendezvousResolvedCount,
|
|
"manager_relay_control": cycle.RelayControlCount,
|
|
"connection_intent_action": result.Action,
|
|
"connection_intent_reason": result.Reason,
|
|
"transport_mode": result.TransportMode,
|
|
"requires_rendezvous": result.RequiresRendezvous,
|
|
"rendezvous_resolved": result.RendezvousResolved,
|
|
"direct_candidate": result.DirectCandidate,
|
|
"relay_candidate": result.RelayCandidate,
|
|
"rendezvous_lease_id": result.RendezvousLeaseID,
|
|
"relay_node_id": result.RelayNodeID,
|
|
"relay_endpoint": result.RelayEndpoint,
|
|
"connection_state": result.ConnectionState.State,
|
|
"consecutive_successes": result.ConnectionState.ConsecutiveSuccesses,
|
|
"consecutive_failures": result.ConnectionState.ConsecutiveFailures,
|
|
"backoff_until": result.ConnectionState.BackoffUntil,
|
|
"service_workload_traffic": false,
|
|
"persistent_connection_manager": true,
|
|
"persistent_connection_kind": "http_keepalive_control_health_or_relay_control_health",
|
|
}
|
|
if result.FailureReason != "" {
|
|
metadata["failure_reason"] = result.FailureReason
|
|
}
|
|
var latency *int
|
|
var qualityScore *int
|
|
if result.LinkStatus == mesh.PeerConnectionProbeReachable {
|
|
latency = &result.LatencyMs
|
|
score := syntheticQualityScore(result.LatencyMs)
|
|
qualityScore = &score
|
|
}
|
|
if err := api.ReportMeshLink(ctx, identity.ClusterID, client.MeshLinkObservationRequest{
|
|
SourceNodeID: identity.NodeID,
|
|
TargetNodeID: result.NodeID,
|
|
LinkStatus: meshLinkStatusFromPeerProbe(result.LinkStatus),
|
|
LatencyMs: latency,
|
|
QualityScore: qualityScore,
|
|
Metadata: metadata,
|
|
}); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
local := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID}
|
|
plan := peerRecoveryPlan(meshState, time.Now().UTC())
|
|
meshState.LastPeerRecoveryPlan = &plan
|
|
intentPlan := peerConnectionIntentPlan(meshState, plan, time.Now().UTC())
|
|
meshState.LastPeerConnectionIntent = &intentPlan
|
|
intentsByNode := peerConnectionIntentsByNode(intentPlan)
|
|
for _, candidate := range plan.Candidates {
|
|
if strings.TrimSpace(candidate.Endpoint) == "" {
|
|
continue
|
|
}
|
|
intent := intentsByNode[candidate.NodeID]
|
|
now := time.Now().UTC()
|
|
if meshState.PeerConnections != nil && !meshState.PeerConnections.ShouldProbe(candidate.NodeID, now) {
|
|
continue
|
|
}
|
|
entry := mesh.PeerCacheEntry{
|
|
NodeID: candidate.NodeID,
|
|
Endpoint: candidate.Endpoint,
|
|
Warm: candidate.Warm,
|
|
WarmReason: candidate.WarmReason,
|
|
RecoverySeed: candidate.RecoverySeed,
|
|
BestCandidateID: candidate.BestCandidateID,
|
|
BestTransport: candidate.BestTransport,
|
|
}
|
|
if meshState.PeerConnections != nil {
|
|
meshState.PeerConnections.BeginProbe(entry, now)
|
|
}
|
|
startedAt := time.Now()
|
|
probeCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
|
|
_, err := mesh.NewClient(strings.TrimRight(entry.Endpoint, "/")).SendHealth(probeCtx, mesh.NewHealthMessage(local, mesh.PeerIdentity{
|
|
ClusterID: identity.ClusterID,
|
|
NodeID: candidate.NodeID,
|
|
}))
|
|
cancel()
|
|
if err != nil {
|
|
connectionState := mesh.PeerConnectionState{}
|
|
if meshState.PeerConnections != nil {
|
|
connectionState = meshState.PeerConnections.RecordFailure(candidate.NodeID, err.Error(), time.Now().UTC())
|
|
}
|
|
if reportErr := api.ReportMeshLink(ctx, identity.ClusterID, client.MeshLinkObservationRequest{
|
|
SourceNodeID: identity.NodeID,
|
|
TargetNodeID: candidate.NodeID,
|
|
LinkStatus: "unreachable",
|
|
Metadata: map[string]any{
|
|
"stage": "c17z10",
|
|
"traffic_forwarding": false,
|
|
"observation_type": "warm_peer_health",
|
|
"config_source": meshState.Source,
|
|
"warm_reason": entry.WarmReason,
|
|
"best_candidate_id": entry.BestCandidateID,
|
|
"best_transport": entry.BestTransport,
|
|
"recovery_seed": entry.RecoverySeed,
|
|
"recovery_plan_mode": plan.Mode,
|
|
"recovery_probe_reason": candidate.Reason,
|
|
"recovery_target_ready": plan.TargetReadyPeers,
|
|
"recovery_ready_peers": plan.ReadyPeerCount,
|
|
"recovery_deficit": plan.Deficit,
|
|
"connection_intent_action": intent.Action,
|
|
"transport_mode": intent.TransportMode,
|
|
"requires_rendezvous": intent.RequiresRendezvous,
|
|
"direct_candidate": intent.DirectCandidate,
|
|
"connection_state": connectionState.State,
|
|
"consecutive_failures": connectionState.ConsecutiveFailures,
|
|
"backoff_until": connectionState.BackoffUntil,
|
|
"failure_reason": err.Error(),
|
|
"service_workload_traffic": false,
|
|
},
|
|
}); reportErr != nil {
|
|
return reportErr
|
|
}
|
|
continue
|
|
}
|
|
latency := int(time.Since(startedAt).Milliseconds())
|
|
qualityScore := syntheticQualityScore(latency)
|
|
connectionState := mesh.PeerConnectionState{}
|
|
if meshState.PeerConnections != nil {
|
|
connectionState = meshState.PeerConnections.RecordSuccess(candidate.NodeID, latency, time.Now().UTC())
|
|
}
|
|
if err := api.ReportMeshLink(ctx, identity.ClusterID, client.MeshLinkObservationRequest{
|
|
SourceNodeID: identity.NodeID,
|
|
TargetNodeID: candidate.NodeID,
|
|
LinkStatus: "reachable",
|
|
LatencyMs: &latency,
|
|
QualityScore: &qualityScore,
|
|
Metadata: map[string]any{
|
|
"stage": "c17z10",
|
|
"traffic_forwarding": false,
|
|
"observation_type": "warm_peer_health",
|
|
"config_source": meshState.Source,
|
|
"warm_reason": entry.WarmReason,
|
|
"best_candidate_id": entry.BestCandidateID,
|
|
"best_transport": entry.BestTransport,
|
|
"recovery_seed": entry.RecoverySeed,
|
|
"recovery_plan_mode": plan.Mode,
|
|
"recovery_probe_reason": candidate.Reason,
|
|
"recovery_target_ready": plan.TargetReadyPeers,
|
|
"recovery_ready_peers": plan.ReadyPeerCount,
|
|
"recovery_deficit": plan.Deficit,
|
|
"connection_intent_action": intent.Action,
|
|
"transport_mode": intent.TransportMode,
|
|
"requires_rendezvous": intent.RequiresRendezvous,
|
|
"direct_candidate": intent.DirectCandidate,
|
|
"connection_state": connectionState.State,
|
|
"consecutive_successes": connectionState.ConsecutiveSuccesses,
|
|
"service_workload_traffic": false,
|
|
},
|
|
}); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func meshLinkStatusFromPeerProbe(status string) string {
|
|
switch status {
|
|
case mesh.PeerConnectionProbeReachable:
|
|
return "reachable"
|
|
case mesh.PeerConnectionProbeUnreachable:
|
|
return "unreachable"
|
|
case mesh.PeerConnectionProbeDeferred:
|
|
return "degraded"
|
|
case mesh.PeerConnectionProbeSkipped:
|
|
return "unknown"
|
|
default:
|
|
return "unknown"
|
|
}
|
|
}
|
|
|
|
func peerRecoveryPlan(meshState *syntheticMeshState, now time.Time) mesh.PeerRecoveryPlan {
|
|
if meshState == nil || meshState.PeerCache == nil {
|
|
return mesh.PeerRecoveryPlan{}
|
|
}
|
|
var connections mesh.PeerConnectionSnapshot
|
|
if meshState.PeerConnections != nil {
|
|
connections = meshState.PeerConnections.Snapshot()
|
|
}
|
|
return mesh.PlanPeerRecovery(mesh.PeerRecoveryPlanConfig{
|
|
PeerCache: meshState.PeerCache.Snapshot(),
|
|
Connections: connections,
|
|
TargetReadyPeers: mesh.DefaultStablePeerTarget,
|
|
MaxProbeCandidates: mesh.DefaultRecoveryProbeLimit,
|
|
Now: now,
|
|
})
|
|
}
|
|
|
|
func peerConnectionIntentPlan(meshState *syntheticMeshState, recoveryPlan mesh.PeerRecoveryPlan, now time.Time) mesh.PeerConnectionIntentPlan {
|
|
if meshState == nil || meshState.PeerCache == nil {
|
|
return mesh.PeerConnectionIntentPlan{}
|
|
}
|
|
return mesh.PlanPeerConnectionIntents(mesh.PeerConnectionIntentPlanConfig{
|
|
PeerCache: meshState.PeerCache.Snapshot(),
|
|
RecoveryPlan: recoveryPlan,
|
|
RendezvousLeases: meshState.RendezvousLeases,
|
|
Now: now,
|
|
})
|
|
}
|
|
|
|
func peerConnectionIntentsByNode(plan mesh.PeerConnectionIntentPlan) map[string]mesh.PeerConnectionIntent {
|
|
out := map[string]mesh.PeerConnectionIntent{}
|
|
for _, intent := range plan.Intents {
|
|
if strings.TrimSpace(intent.NodeID) != "" {
|
|
out[intent.NodeID] = intent
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// syntheticQualityScore converts a latency in milliseconds into a quality
// score between 1 and 100. Latencies of 10 ms or less (including negative
// values) score 100, latencies of 1000 ms or more score 1, and everything in
// between loses one point per full 10 ms, never dropping below 1.
func syntheticQualityScore(latencyMs int) int {
	if latencyMs <= 10 {
		return 100
	}
	if latencyMs >= 1000 {
		return 1
	}
	if score := 100 - latencyMs/10; score >= 1 {
		return score
	}
	return 1
}
|
|
|
|
func sendHeartbeat(ctx context.Context, api *client.Client, cfg config.Config, identity state.Identity, meshState *syntheticMeshState) (client.EffectiveTestingFlags, error) {
|
|
if identity.NodeID == "" || identity.ClusterID == "" {
|
|
return client.EffectiveTestingFlags{}, fmt.Errorf("node identity is not approved")
|
|
}
|
|
response, err := api.Heartbeat(ctx, identity.ClusterID, identity.NodeID, heartbeatPayload(cfg, identity, meshState, time.Now().UTC()))
|
|
if err == nil {
|
|
log.Printf("heartbeat sent: node_id=%s cluster_id=%s", identity.NodeID, identity.ClusterID)
|
|
if err := persistUpdateHintTrigger(cfg.StateDir, response.UpdateHint); err != nil {
|
|
log.Printf("update hint trigger failed: %v", err)
|
|
}
|
|
}
|
|
return response.TestingFlags, err
|
|
}
|
|
|
|
func persistUpdateHintTrigger(stateDir string, hint *client.NodeUpdateHint) error {
|
|
if hint == nil || !hint.CheckNow || strings.TrimSpace(hint.Generation) == "" {
|
|
return nil
|
|
}
|
|
current := hostagent.CurrentUpdateTriggerGenerationForNodeAgent(stateDir)
|
|
if current == strings.TrimSpace(hint.Generation) {
|
|
return nil
|
|
}
|
|
return hostagent.SaveUpdateTrigger(stateDir, hostagent.UpdateTrigger{
|
|
SchemaVersion: "rap.node_update_trigger.v1",
|
|
Generation: strings.TrimSpace(hint.Generation),
|
|
Products: hint.Products,
|
|
Reason: hint.Reason,
|
|
DeliveryMode: hint.DeliveryMode,
|
|
SubscriptionStatus: hint.SubscriptionStatus,
|
|
FallbackPollSeconds: hint.FallbackPollSeconds,
|
|
UpdateServiceNodeID: func() string {
|
|
if hint.UpdateService == nil {
|
|
return ""
|
|
}
|
|
return hint.UpdateService.NodeID
|
|
}(),
|
|
UpdateServiceStatus: func() string {
|
|
if hint.UpdateService == nil {
|
|
return ""
|
|
}
|
|
return hint.UpdateService.Status
|
|
}(),
|
|
ObservedAt: time.Now().UTC(),
|
|
})
|
|
}
|
|
|
|
func heartbeatPayload(cfg config.Config, identity state.Identity, meshState *syntheticMeshState, observedAt time.Time) client.HeartbeatRequest {
|
|
if meshState != nil && meshState.ListenerRuntimeConfig.BackendURL != "" {
|
|
cfg = meshState.ListenerRuntimeConfig
|
|
}
|
|
payload := agent.HeartbeatPayload()
|
|
candidates, err := advertisedEndpointCandidates(cfg, identity, meshState, observedAt)
|
|
if err != nil {
|
|
log.Printf("mesh endpoint report skipped: %v", err)
|
|
return payload
|
|
}
|
|
if len(candidates) == 0 && (meshState == nil || (meshState.PeerCache == nil && meshState.ListenerReport.SchemaVersion == "")) {
|
|
return payload
|
|
}
|
|
if payload.Metadata == nil {
|
|
payload.Metadata = map[string]any{}
|
|
}
|
|
if payload.Capabilities == nil {
|
|
payload.Capabilities = map[string]any{}
|
|
}
|
|
payload.Metadata["stage"] = "c17z20"
|
|
if meshState != nil && meshState.ListenerReport.SchemaVersion != "" {
|
|
report := meshState.ListenerReport
|
|
report.ObservedAt = observedAt.UTC().Format(time.RFC3339Nano)
|
|
payload.Metadata["mesh_listener_report"] = report
|
|
payload.Capabilities["mesh_listener_diagnostics"] = true
|
|
if report.OneWayConnectivity {
|
|
payload.Capabilities["mesh_one_way_connectivity"] = true
|
|
}
|
|
if report.Status == "listen_failed" && cfg.MeshConnectivityMode != "outbound_only" {
|
|
payload.HealthStatus = "warning"
|
|
}
|
|
}
|
|
if cfg.MeshSyntheticRuntimeEnabled {
|
|
payload.Metadata["mesh_outbound_session_report"] = meshOutboundSessionReportFromState(cfg, meshState, observedAt)
|
|
payload.Capabilities["mesh_outbound_control_session"] = true
|
|
payload.Capabilities["mesh_reverse_control_channel_contract"] = true
|
|
if meshState != nil && meshState.ServiceChannelAccessStats != nil {
|
|
payload.Metadata["fabric_service_channel_access_report"] = meshState.ServiceChannelAccessStats.Report(observedAt)
|
|
payload.Capabilities["fabric_service_channel_access_telemetry"] = true
|
|
}
|
|
if cfg.MeshProductionForwardingEnabled || (meshState != nil && meshState.ProductionForwardingEnabled) {
|
|
payload.Capabilities["mesh_production_forwarding"] = true
|
|
}
|
|
if cfg.MeshFabricSessionEnabled {
|
|
report := map[string]any{
|
|
"schema_version": "rap.fabric_session_endpoint_report.v1",
|
|
"enabled": true,
|
|
"transport": "websocket_binary_frames",
|
|
"path": "/mesh/v1/fabric/session/ws",
|
|
"auth": "rap_fsn_token_with_optional_signed_authority",
|
|
"protocol": "rap.fabric_data_session.v1",
|
|
"service_neutral": true,
|
|
"traffic_isolation": "logical_streams",
|
|
"observed_at": observedAt.UTC().Format(time.RFC3339Nano),
|
|
}
|
|
if meshState != nil && cfg.MeshQUICFabricEnabled {
|
|
report["quic"] = map[string]any{
|
|
"enabled": meshState.QUICFabricServer != nil,
|
|
"listen_addr": cfg.MeshQUICFabricListenAddr,
|
|
"effective_listen_addr": meshState.QUICFabricListenAddr,
|
|
"tls_cert_sha256": meshState.QUICFabricCertSHA256,
|
|
"error": meshState.QUICFabricError,
|
|
}
|
|
}
|
|
payload.Metadata["fabric_session_endpoint_report"] = report
|
|
payload.Capabilities["fabric_session_websocket_endpoint"] = true
|
|
payload.Capabilities["fabric_data_session_v1"] = true
|
|
if cfg.MeshQUICFabricEnabled {
|
|
payload.Capabilities["fabric_quic_endpoint"] = true
|
|
}
|
|
}
|
|
if cfg.VPNFabricSessionTransportEnabled {
|
|
report := map[string]any{
|
|
"schema_version": "rap.vpn_fabric_session_transport_report.v1",
|
|
"enabled": true,
|
|
"transport": "fabric_session_binary_frames",
|
|
"carriers": []string{"quic", "websocket"},
|
|
"packet_payload": "rap.vpn_packet_batch.fabric.v1",
|
|
"gated": true,
|
|
"observed_at": observedAt.UTC().Format(time.RFC3339Nano),
|
|
}
|
|
if meshState != nil && meshState.VPNFabricTransport != nil {
|
|
report["peer_sessions"] = meshState.VPNFabricTransport.Snapshot()
|
|
} else if meshState != nil && meshState.VPNFabricSessionPeers != nil {
|
|
report["peer_sessions"] = meshState.VPNFabricSessionPeers.Snapshot()
|
|
}
|
|
if meshState != nil && meshState.VPNFabricQUICTransport != nil {
|
|
quicSnapshot := meshState.VPNFabricQUICTransport.Snapshot()
|
|
report["quic_sessions"] = quicSnapshot
|
|
report["quic_max_streams_per_conn"] = meshState.VPNFabricQUICTransport.MaxStreamsPerConn
|
|
report["quic_idle_ttl_seconds"] = int(meshState.VPNFabricQUICTransport.IdleTTL.Seconds())
|
|
}
|
|
if meshState != nil && meshState.VPNFabricSessionDialStats != nil {
|
|
report["dial_stats"] = meshState.VPNFabricSessionDialStats.Report(observedAt)
|
|
}
|
|
payload.Metadata["vpn_fabric_session_transport_report"] = report
|
|
payload.Capabilities["vpn_fabric_session_transport"] = true
|
|
payload.Capabilities["vpn_packet_batch_binary_frames"] = true
|
|
if meshState != nil && meshState.VPNFabricEndpointObservations != nil {
|
|
payload.Metadata["vpn_fabric_endpoint_health_report"] = meshState.VPNFabricEndpointObservations.Report(observedAt, maxVPNFabricEndpointHealthReportEntries)
|
|
} else {
|
|
payload.Metadata["vpn_fabric_endpoint_health_report"] = newVPNFabricEndpointObservationStore(identity.NodeID).Report(observedAt, maxVPNFabricEndpointHealthReportEntries)
|
|
}
|
|
payload.Capabilities["vpn_fabric_endpoint_health_feedback"] = true
|
|
}
|
|
if meshState != nil && meshState.ConfigLoadError != "" {
|
|
payload.HealthStatus = "warning"
|
|
}
|
|
}
|
|
if len(candidates) > 0 {
|
|
payload.Metadata["mesh_endpoint_report"] = meshEndpointReport(cfg, identity, meshState, observedAt, candidates)
|
|
payload.Capabilities["mesh_dynamic_endpoint_reporting"] = true
|
|
}
|
|
if meshState != nil && meshState.PeerCache != nil {
|
|
payload.Metadata["mesh_peer_recovery_report"] = meshPeerRecoveryReport(meshState, observedAt)
|
|
payload.Metadata["mesh_peer_connection_intent_report"] = meshPeerConnectionIntentReport(meshState, observedAt)
|
|
payload.Metadata["mesh_peer_connection_manager_report"] = meshPeerConnectionManagerReport(meshState, observedAt)
|
|
payload.Capabilities["mesh_peer_cache_endpoint_health_ranking"] = true
|
|
payload.Metadata["mesh_rendezvous_lease_report"] = meshRendezvousLeaseReport(meshState, identity, observedAt)
|
|
payload.Metadata["mesh_route_path_decision_report"] = meshRoutePathDecisionReport(meshState, identity, observedAt)
|
|
payload.Metadata["mesh_route_generation_report"] = meshRouteGenerationReport(meshState, identity, observedAt)
|
|
payload.Metadata["mesh_route_health_config_report"] = meshRouteHealthConfigReport(meshState, identity, observedAt)
|
|
payload.Metadata["mesh_route_health_feedback_refresh_report"] = meshRouteHealthFeedbackRefreshReport(meshState, identity, observedAt)
|
|
payload.Capabilities["mesh_peer_recovery_planning"] = true
|
|
payload.Capabilities["mesh_peer_connection_intent_planning"] = true
|
|
payload.Capabilities["mesh_peer_connection_manager"] = true
|
|
payload.Capabilities["mesh_per_peer_endpoint_probe_fallback"] = true
|
|
payload.Capabilities["mesh_rendezvous_relay_control_contract"] = true
|
|
payload.Capabilities[meshRendezvousLeaseTelemetryCapability] = true
|
|
payload.Capabilities[meshRendezvousLeaseRefreshCapability] = true
|
|
payload.Capabilities[meshRendezvousRelayReplacementCapability] = true
|
|
payload.Capabilities[meshRoutePathDecisionCapability] = true
|
|
payload.Capabilities[meshRouteGenerationTrackerCapability] = true
|
|
payload.Capabilities[meshRouteHealthConfigCapability] = true
|
|
payload.Capabilities[meshRouteHealthFeedbackRefreshCapability] = true
|
|
}
|
|
if meshState != nil && (meshState.VPNFabricIngress != nil || meshState.VPNFabricInbox != nil) {
|
|
payload.Metadata["fabric_service_channel_runtime_report"] = fabricServiceChannelRuntimeReport(meshState, identity, observedAt)
|
|
payload.Capabilities["fabric_service_channel_runtime"] = true
|
|
payload.Capabilities["fabric_service_channel_route_manager"] = true
|
|
}
|
|
return payload
|
|
}
|
|
|
|
func fabricServiceChannelRuntimeReport(meshState *syntheticMeshState, identity state.Identity, observedAt time.Time) map[string]any {
|
|
report := map[string]any{
|
|
"schema_version": "c18l.fabric_service_channel_runtime_report.v1",
|
|
"cluster_id": identity.ClusterID,
|
|
"node_id": identity.NodeID,
|
|
"service_class": "vpn_packets",
|
|
"channel_class": mesh.ProductionChannelVPNPacket,
|
|
"route_manager": "primary_sticky_with_alternate_route_failover",
|
|
"backend_relay_fallback": false,
|
|
"backend_relay_fallback_position": "disabled_farm_owned_dataplane",
|
|
"route_authority": "fabric_farm",
|
|
"application_protocol_agnostic": true,
|
|
"observed_at": observedAt.UTC().Format(time.RFC3339Nano),
|
|
}
|
|
if meshState == nil {
|
|
report["enabled"] = false
|
|
return report
|
|
}
|
|
report["enabled"] = meshState.VPNFabricIngress != nil
|
|
report["production_payload_forwarding"] = meshState.ProductionForwardingEnabled
|
|
report["route_candidate_total"] = countVPNPacketRoutes(meshState.Routes, identity.ClusterID, identity.NodeID)
|
|
report["config_source"] = meshState.Source
|
|
report["config_version"] = meshState.ConfigVersion
|
|
if meshState.VPNFabricIngress != nil {
|
|
report["ingress"] = meshState.VPNFabricIngress.Snapshot(identity.ClusterID)
|
|
}
|
|
if meshState.VPNFabricInbox != nil {
|
|
report["inbox"] = meshState.VPNFabricInbox.Snapshot()
|
|
}
|
|
return report
|
|
}
|
|
|
|
func countVPNPacketRoutes(routes []mesh.SyntheticRoute, clusterID string, localNodeID string) int {
|
|
count := 0
|
|
now := time.Now().UTC()
|
|
for _, route := range routes {
|
|
if route.ClusterID != clusterID || route.SourceNodeID != localNodeID || !containsString(route.AllowedChannels, mesh.ProductionChannelVPNPacket) {
|
|
continue
|
|
}
|
|
if !route.ExpiresAt.IsZero() && !route.ExpiresAt.After(now) {
|
|
continue
|
|
}
|
|
nextHop := serviceChannelNextHopAfter(route.Hops, localNodeID, route.DestinationNodeID)
|
|
if nextHop == "" || nextHop == localNodeID {
|
|
continue
|
|
}
|
|
count++
|
|
}
|
|
return count
|
|
}
|
|
|
|
// serviceChannelNextHopAfter resolves the node that follows localNodeID on
// path.
//
// It returns:
//   - destinationNodeID when path is empty or does not contain localNodeID;
//   - the element right after the first occurrence of localNodeID;
//   - localNodeID itself when that first occurrence is the last path element.
func serviceChannelNextHopAfter(path []string, localNodeID string, destinationNodeID string) string {
	position := -1
	for i := range path {
		if path[i] == localNodeID {
			position = i
			break
		}
	}

	switch {
	case position < 0:
		return destinationNodeID
	case position == len(path)-1:
		return localNodeID
	default:
		return path[position+1]
	}
}
|
|
|
|
func meshOutboundSessionReportFromState(cfg config.Config, meshState *syntheticMeshState, observedAt time.Time) meshOutboundSessionReport {
|
|
report := meshOutboundSessionReport{
|
|
SchemaVersion: "c17z22.mesh_outbound_session_report.v1",
|
|
Status: "ready",
|
|
Direction: "node_to_control_plane",
|
|
Transport: "heartbeat_keepalive",
|
|
ControlPlaneURL: cfg.BackendURL,
|
|
ConnectivityMode: defaultString(cfg.MeshConnectivityMode, "direct"),
|
|
InboundListenerRequired: false,
|
|
ProductionForwarding: false,
|
|
ServiceWorkloadTraffic: false,
|
|
ObservedAt: observedAt.UTC().Format(time.RFC3339Nano),
|
|
}
|
|
if meshState != nil {
|
|
listener := meshState.ListenerReport
|
|
report.ListenerStatus = listener.Status
|
|
report.ListenerFailureReason = listener.FailureReason
|
|
report.ListenerPortConflict = listener.PortConflict
|
|
report.ConfigLoadError = meshState.ConfigLoadError
|
|
report.UsableForInboundControl = listener.Status == "listening" ||
|
|
listener.Status == "auto_rebound" ||
|
|
listener.OneWayConnectivity ||
|
|
listener.Status == "listen_failed" ||
|
|
cfg.MeshConnectivityMode == "outbound_only"
|
|
if meshState.PeerConnections != nil {
|
|
snapshot := meshState.PeerConnections.Snapshot()
|
|
report.PeerConnectionReady = snapshot.Ready
|
|
report.PeerConnectionRelayReady = snapshot.RelayReady
|
|
report.PeerConnectionWaiting = snapshot.Waiting
|
|
}
|
|
report.RendezvousLeaseCount = len(meshState.RendezvousLeases)
|
|
if meshState.ConfigLoadError != "" {
|
|
report.Status = "degraded"
|
|
report.ListenerFailureReason = firstNonEmpty(report.ListenerFailureReason, "mesh_config_load_failed")
|
|
}
|
|
} else {
|
|
report.UsableForInboundControl = cfg.MeshConnectivityMode == "outbound_only"
|
|
}
|
|
return report
|
|
}
|
|
|
|
func meshEndpointReport(cfg config.Config, identity state.Identity, meshState *syntheticMeshState, observedAt time.Time, candidates []mesh.PeerEndpointCandidate) map[string]any {
|
|
transport := strings.TrimSpace(candidates[0].Transport)
|
|
if transport == "" {
|
|
transport = strings.TrimSpace(cfg.MeshAdvertiseTransport)
|
|
}
|
|
if transport == "" {
|
|
transport = "direct_tcp_tls"
|
|
}
|
|
connectivityMode := cfg.MeshConnectivityMode
|
|
if connectivityMode == "" {
|
|
connectivityMode = "direct"
|
|
}
|
|
natType := cfg.MeshNATType
|
|
if natType == "" {
|
|
natType = "unknown"
|
|
}
|
|
report := map[string]any{
|
|
"schema_version": "c17z6.mesh_endpoint_report.v1",
|
|
"cluster_id": identity.ClusterID,
|
|
"node_id": identity.NodeID,
|
|
"peer_endpoint": candidates[0].Address,
|
|
"transport": transport,
|
|
"connectivity_mode": connectivityMode,
|
|
"nat_type": natType,
|
|
"observed_at": observedAt.UTC().Format(time.RFC3339Nano),
|
|
"endpoint_candidates": candidates,
|
|
}
|
|
if meshState != nil && meshState.PeerCache != nil {
|
|
snapshot := meshState.PeerCache.Snapshot()
|
|
report["peer_cache_peers"] = snapshot.PeerCount
|
|
report["warm_peers"] = snapshot.WarmPeerCount
|
|
report["recovery_seeds"] = snapshot.RecoverySeedCount
|
|
report["rendezvous_leases"] = snapshot.RendezvousLeaseCount
|
|
}
|
|
if meshState != nil && meshState.PeerConnections != nil {
|
|
snapshot := meshState.PeerConnections.Snapshot()
|
|
report["peer_connection_total"] = snapshot.Total
|
|
report["peer_connection_ready"] = snapshot.Ready
|
|
report["peer_connection_relay_ready"] = snapshot.RelayReady
|
|
report["peer_connection_degraded"] = snapshot.Degraded
|
|
report["peer_connection_backoff"] = snapshot.Backoff
|
|
report["peer_connection_waiting_rendezvous"] = snapshot.Waiting
|
|
report["peer_connection_connecting"] = snapshot.Connecting
|
|
report["peer_connection_disconnected"] = snapshot.Disconnected
|
|
}
|
|
if meshState != nil && meshState.PeerCache != nil {
|
|
plan := peerRecoveryPlan(meshState, observedAt)
|
|
meshState.LastPeerRecoveryPlan = &plan
|
|
intentPlan := peerConnectionIntentPlan(meshState, plan, observedAt)
|
|
meshState.LastPeerConnectionIntent = &intentPlan
|
|
report["peer_recovery_mode"] = plan.Mode
|
|
report["peer_recovery_healthy"] = plan.Healthy
|
|
report["peer_recovery_target_ready"] = plan.TargetReadyPeers
|
|
report["peer_recovery_ready"] = plan.ReadyPeerCount
|
|
report["peer_recovery_deficit"] = plan.Deficit
|
|
report["peer_recovery_probe_candidates"] = plan.ProbeCandidateCount
|
|
report["peer_recovery_seed_candidates"] = plan.RecoverySeedCandidateCount
|
|
report["peer_connection_intents"] = intentPlan.IntentCount
|
|
report["peer_connection_intent_direct"] = intentPlan.DirectCount
|
|
report["peer_connection_intent_private_lan"] = intentPlan.PrivateLANCount
|
|
report["peer_connection_intent_corp_lan"] = intentPlan.CorporateLANCount
|
|
report["peer_connection_intent_outbound_only"] = intentPlan.OutboundOnlyCount
|
|
report["peer_connection_intent_relay_required"] = intentPlan.RelayRequiredCount
|
|
report["peer_connection_intent_relay_control"] = intentPlan.RelayControlCount
|
|
report["peer_connection_intent_rendezvous_required"] = intentPlan.RendezvousRequiredCount
|
|
report["peer_connection_intent_rendezvous_resolved"] = intentPlan.RendezvousResolvedCount
|
|
report["rendezvous_lease_count"] = intentPlan.RendezvousLeaseCount
|
|
}
|
|
if cfg.MeshRegion != "" {
|
|
report["region"] = cfg.MeshRegion
|
|
}
|
|
return report
|
|
}
|
|
|
|
func meshPeerRecoveryReport(meshState *syntheticMeshState, observedAt time.Time) map[string]any {
|
|
plan := peerRecoveryPlan(meshState, observedAt)
|
|
meshState.LastPeerRecoveryPlan = &plan
|
|
intentPlan := peerConnectionIntentPlan(meshState, plan, observedAt)
|
|
meshState.LastPeerConnectionIntent = &intentPlan
|
|
report := map[string]any{
|
|
"schema_version": "c17z9.mesh_peer_recovery_report.v1",
|
|
"mode": plan.Mode,
|
|
"healthy": plan.Healthy,
|
|
"target_ready_peers": plan.TargetReadyPeers,
|
|
"ready_peer_count": plan.ReadyPeerCount,
|
|
"degraded_peer_count": plan.DegradedPeerCount,
|
|
"backoff_peer_count": plan.BackoffPeerCount,
|
|
"connectable_peer_count": plan.ConnectablePeerCount,
|
|
"deficit": plan.Deficit,
|
|
"probe_candidate_count": plan.ProbeCandidateCount,
|
|
"recovery_seed_candidate_count": plan.RecoverySeedCandidateCount,
|
|
"service_workload_traffic": false,
|
|
"production_payload_forwarding": false,
|
|
"persistent_connection_transport": false,
|
|
"observed_at": observedAt.UTC().Format(time.RFC3339Nano),
|
|
"connection_intent_count": intentPlan.IntentCount,
|
|
"rendezvous_required_count": intentPlan.RendezvousRequiredCount,
|
|
"rendezvous_resolved_count": intentPlan.RendezvousResolvedCount,
|
|
"rendezvous_lease_count": intentPlan.RendezvousLeaseCount,
|
|
"relay_control_count": intentPlan.RelayControlCount,
|
|
}
|
|
if meshState != nil && meshState.PeerConnections != nil {
|
|
snapshot := meshState.PeerConnections.Snapshot()
|
|
report["peer_connection_total"] = snapshot.Total
|
|
report["peer_connection_ready"] = snapshot.Ready
|
|
report["peer_connection_relay_ready"] = snapshot.RelayReady
|
|
report["peer_connection_degraded"] = snapshot.Degraded
|
|
report["peer_connection_backoff"] = snapshot.Backoff
|
|
report["peer_connection_waiting_rendezvous"] = snapshot.Waiting
|
|
report["peer_connection_connecting"] = snapshot.Connecting
|
|
report["peer_connection_disconnected"] = snapshot.Disconnected
|
|
}
|
|
return report
|
|
}
|
|
|
|
func meshPeerConnectionIntentReport(meshState *syntheticMeshState, observedAt time.Time) map[string]any {
|
|
recoveryPlan := peerRecoveryPlan(meshState, observedAt)
|
|
meshState.LastPeerRecoveryPlan = &recoveryPlan
|
|
intentPlan := peerConnectionIntentPlan(meshState, recoveryPlan, observedAt)
|
|
meshState.LastPeerConnectionIntent = &intentPlan
|
|
return map[string]any{
|
|
"schema_version": "c17z12.mesh_peer_connection_intent_report.v1",
|
|
"mode": intentPlan.Mode,
|
|
"intent_count": intentPlan.IntentCount,
|
|
"maintain_count": intentPlan.MaintainCount,
|
|
"probe_count": intentPlan.ProbeCount,
|
|
"recover_count": intentPlan.RecoverCount,
|
|
"direct_count": intentPlan.DirectCount,
|
|
"private_lan_count": intentPlan.PrivateLANCount,
|
|
"corporate_lan_count": intentPlan.CorporateLANCount,
|
|
"outbound_only_count": intentPlan.OutboundOnlyCount,
|
|
"relay_required_count": intentPlan.RelayRequiredCount,
|
|
"relay_control_count": intentPlan.RelayControlCount,
|
|
"rendezvous_required_count": intentPlan.RendezvousRequiredCount,
|
|
"rendezvous_resolved_count": intentPlan.RendezvousResolvedCount,
|
|
"rendezvous_lease_count": intentPlan.RendezvousLeaseCount,
|
|
"service_workload_traffic": false,
|
|
"production_payload_forwarding": false,
|
|
"persistent_connection_transport": false,
|
|
"observed_at": observedAt.UTC().Format(time.RFC3339Nano),
|
|
}
|
|
}
|
|
|
|
func meshPeerConnectionManagerReport(meshState *syntheticMeshState, observedAt time.Time) map[string]any {
|
|
report := map[string]any{
|
|
"schema_version": "c17z25.mesh_peer_connection_manager_report.v1",
|
|
"service_workload_traffic": false,
|
|
"production_payload_forwarding": false,
|
|
"persistent_connection_transport": true,
|
|
"persistent_connection_kind": "http_keepalive_control_health_or_relay_control_health",
|
|
"observed_at": observedAt.UTC().Format(time.RFC3339Nano),
|
|
}
|
|
if meshState == nil || meshState.PeerConnectionManager == nil {
|
|
report["enabled"] = false
|
|
return report
|
|
}
|
|
report["enabled"] = true
|
|
snapshot := meshState.PeerConnectionManager.Snapshot()
|
|
cycle := snapshot.LastCycle
|
|
report["mode"] = cycle.Mode
|
|
report["intent_count"] = cycle.IntentCount
|
|
report["attempted"] = cycle.Attempted
|
|
report["succeeded"] = cycle.Succeeded
|
|
report["failed"] = cycle.Failed
|
|
report["deferred"] = cycle.Deferred
|
|
report["skipped"] = cycle.Skipped
|
|
report["rendezvous_required_count"] = cycle.RendezvousRequiredCount
|
|
report["rendezvous_resolved_count"] = cycle.RendezvousResolvedCount
|
|
report["relay_control_count"] = cycle.RelayControlCount
|
|
report["last_started_at"] = cycle.StartedAt
|
|
report["last_completed_at"] = cycle.CompletedAt
|
|
report["probe_results"] = cycle.Results
|
|
if meshState.PeerConnections != nil {
|
|
connectionSnapshot := meshState.PeerConnections.Snapshot()
|
|
report["peer_connection_ready"] = connectionSnapshot.Ready
|
|
report["peer_connection_relay_ready"] = connectionSnapshot.RelayReady
|
|
report["peer_connection_degraded"] = connectionSnapshot.Degraded
|
|
report["peer_connection_backoff"] = connectionSnapshot.Backoff
|
|
report["peer_connection_waiting_rendezvous"] = connectionSnapshot.Waiting
|
|
}
|
|
return report
|
|
}
|
|
|
|
func meshRendezvousLeaseReport(meshState *syntheticMeshState, identity state.Identity, observedAt time.Time) map[string]any {
|
|
observedAt = observedAt.UTC()
|
|
posture := meshRendezvousLeasePostureForState(meshState, identity, observedAt)
|
|
report := map[string]any{
|
|
"schema_version": meshRendezvousLeaseReportSchema,
|
|
"cluster_id": identity.ClusterID,
|
|
"node_id": identity.NodeID,
|
|
"lease_count": len(meshState.RendezvousLeases),
|
|
"config_source": meshState.Source,
|
|
"config_version": meshState.ConfigVersion,
|
|
"peer_directory_version": meshState.PeerDirectoryVersion,
|
|
"policy_version": meshState.PolicyVersion,
|
|
"renewal_window_ms": int64(meshRendezvousLeaseRenewalWindow / time.Millisecond),
|
|
"refresh_backoff_ms": int64(meshRendezvousLeaseRefreshBackoff / time.Millisecond),
|
|
"refresh_contract": "node_scoped_synthetic_config_get",
|
|
"refresh_supported": meshState.Source == "control_plane",
|
|
"control_plane_only": true,
|
|
"relay_payload_forwarding": false,
|
|
"service_workload_traffic": false,
|
|
"production_payload_forwarding": false,
|
|
"persistent_connection_transport": true,
|
|
"observed_at": observedAt.Format(time.RFC3339Nano),
|
|
}
|
|
connectionByPeer := map[string]mesh.PeerConnectionState{}
|
|
if meshState != nil && meshState.PeerConnections != nil {
|
|
snapshot := meshState.PeerConnections.Snapshot()
|
|
report["peer_connection_total"] = snapshot.Total
|
|
report["peer_connection_ready"] = snapshot.Ready
|
|
report["peer_connection_relay_ready"] = snapshot.RelayReady
|
|
report["peer_connection_degraded"] = snapshot.Degraded
|
|
report["peer_connection_backoff"] = snapshot.Backoff
|
|
report["peer_connection_waiting_rendezvous"] = snapshot.Waiting
|
|
report["peer_connection_connecting"] = snapshot.Connecting
|
|
report["peer_connection_disconnected"] = snapshot.Disconnected
|
|
for _, entry := range snapshot.Entries {
|
|
connectionByPeer[entry.NodeID] = entry
|
|
}
|
|
}
|
|
if meshState == nil {
|
|
report["leases"] = []map[string]any{}
|
|
return report
|
|
}
|
|
|
|
leaseDetails := make([]map[string]any, 0, minInt(len(meshState.RendezvousLeases), maxMeshRendezvousLeaseReportEntries))
|
|
activeCount := 0
|
|
usableCount := 0
|
|
controlPlaneOnlyCount := 0
|
|
invalidCount := 0
|
|
expiredCount := 0
|
|
expiringSoonCount := 0
|
|
renewalNeededCount := 0
|
|
admittedAsRelayCount := 0
|
|
admittedAsPeerCount := 0
|
|
entryObserverCount := 0
|
|
relayControlReadyCount := 0
|
|
staleRelayCount := 0
|
|
withdrawalNeededCount := 0
|
|
reselectionNeededCount := 0
|
|
for index, lease := range meshState.RendezvousLeases {
|
|
role := meshRendezvousLeaseRole(lease, identity.NodeID)
|
|
valid := meshRendezvousLeaseBaseValid(lease)
|
|
expired := valid && !lease.ExpiresAt.After(observedAt)
|
|
usable := valid && !expired
|
|
if lease.ControlPlaneOnly {
|
|
controlPlaneOnlyCount++
|
|
}
|
|
if !valid {
|
|
invalidCount++
|
|
}
|
|
if expired {
|
|
expiredCount++
|
|
}
|
|
if usable {
|
|
activeCount++
|
|
usableCount++
|
|
switch role {
|
|
case "relay":
|
|
admittedAsRelayCount++
|
|
case "peer":
|
|
admittedAsPeerCount++
|
|
default:
|
|
entryObserverCount++
|
|
}
|
|
}
|
|
ttlRemaining := lease.ExpiresAt.Sub(observedAt)
|
|
renewalAfter := meshRendezvousLeaseRenewalAfter(lease)
|
|
expiringSoon := usable && ttlRemaining <= meshRendezvousLeaseRenewalWindow
|
|
renewalNeeded := meshRendezvousLeaseRenewalNeeded(lease, observedAt, usable)
|
|
if expiringSoon {
|
|
expiringSoonCount++
|
|
}
|
|
if renewalNeeded {
|
|
renewalNeededCount++
|
|
}
|
|
connectionState := connectionByPeer[lease.PeerNodeID]
|
|
staleRelay := usable && meshRendezvousLeaseStaleRelay(lease, connectionState)
|
|
withdrawalNeeded := staleRelay && role == "relay"
|
|
reselectionNeeded := staleRelay && role != "relay"
|
|
if staleRelay {
|
|
staleRelayCount++
|
|
}
|
|
if withdrawalNeeded {
|
|
withdrawalNeededCount++
|
|
}
|
|
if reselectionNeeded {
|
|
reselectionNeededCount++
|
|
}
|
|
relayReady := usable && connectionState.State == mesh.PeerConnectionRelayReady
|
|
if relayReady {
|
|
relayControlReadyCount++
|
|
}
|
|
if index < maxMeshRendezvousLeaseReportEntries {
|
|
leaseDetails = append(leaseDetails, map[string]any{
|
|
"lease_id": lease.LeaseID,
|
|
"peer_node_id": lease.PeerNodeID,
|
|
"relay_node_id": lease.RelayNodeID,
|
|
"relay_endpoint": strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/"),
|
|
"transport": defaultString(lease.Transport, "relay_control"),
|
|
"connectivity_mode": defaultString(lease.ConnectivityMode, "relay_required"),
|
|
"route_ids": append([]string{}, lease.RouteIDs...),
|
|
"allowed_channels": append([]string{}, lease.AllowedChannels...),
|
|
"priority": lease.Priority,
|
|
"role": role,
|
|
"status": meshRendezvousLeaseStatus(valid, expired, renewalNeeded, role),
|
|
"usable": usable,
|
|
"admitted": usable && role == "relay",
|
|
"renewal_needed": renewalNeeded,
|
|
"expiring_soon": expiringSoon,
|
|
"stale_relay": staleRelay,
|
|
"withdrawal_needed": withdrawalNeeded,
|
|
"reselection_needed": reselectionNeeded,
|
|
"relay_ready": relayReady,
|
|
"connection_state": connectionState.State,
|
|
"ttl_remaining_ms": int64(ttlRemaining / time.Millisecond),
|
|
"issued_at": formatOptionalTime(lease.IssuedAt),
|
|
"expires_at": formatOptionalTime(lease.ExpiresAt),
|
|
"renewal_after": formatOptionalTime(renewalAfter),
|
|
"reason": lease.Reason,
|
|
})
|
|
}
|
|
}
|
|
report["active_count"] = activeCount
|
|
report["usable_count"] = usableCount
|
|
report["control_plane_only_count"] = controlPlaneOnlyCount
|
|
report["invalid_count"] = invalidCount
|
|
report["expired_count"] = expiredCount
|
|
report["expiring_soon_count"] = expiringSoonCount
|
|
report["renewal_needed_count"] = renewalNeededCount
|
|
report["admitted_as_relay_count"] = admittedAsRelayCount
|
|
report["admitted_as_peer_count"] = admittedAsPeerCount
|
|
report["entry_observer_count"] = entryObserverCount
|
|
report["relay_control_ready_count"] = relayControlReadyCount
|
|
report["stale_relay_count"] = staleRelayCount
|
|
report["withdrawal_needed_count"] = withdrawalNeededCount
|
|
report["reselection_needed_count"] = reselectionNeededCount
|
|
report["refresh_needed"] = posture.RefreshNeeded
|
|
report["refresh_reason"] = posture.Reason
|
|
report["refresh_needed_count"] = posture.RefreshNeededCount
|
|
report["refresh_attempt_count"] = meshState.LeaseRefreshAttempts
|
|
report["refresh_success_count"] = meshState.LeaseRefreshSuccesses
|
|
report["refresh_failure_count"] = meshState.LeaseRefreshFailures
|
|
if meshState.LastLeaseRefresh != nil {
|
|
report["last_refresh_status"] = meshState.LastLeaseRefresh.Status
|
|
report["last_refresh_reason"] = meshState.LastLeaseRefresh.Reason
|
|
report["last_refresh_error"] = meshState.LastLeaseRefresh.Error
|
|
report["last_refresh_attempted_at"] = formatOptionalTime(meshState.LastLeaseRefresh.AttemptedAt)
|
|
report["last_refresh_completed_at"] = formatOptionalTime(meshState.LastLeaseRefresh.CompletedAt)
|
|
report["last_refresh_previous_lease_count"] = meshState.LastLeaseRefresh.PreviousLeaseCount
|
|
report["last_refresh_refreshed_lease_count"] = meshState.LastLeaseRefresh.RefreshedLeaseCount
|
|
report["last_refresh_config_version"] = meshState.LastLeaseRefresh.ConfigVersion
|
|
}
|
|
report["truncated"] = len(meshState.RendezvousLeases) > maxMeshRendezvousLeaseReportEntries
|
|
report["leases"] = leaseDetails
|
|
return report
|
|
}
|
|
|
|
// meshRouteGenerationDecisionState is the node-side record of one route path
// decision as tracked by meshRouteGenerationTracker, either currently active
// or withdrawn.
type meshRouteGenerationDecisionState struct {
	// Identity and provenance of the decision.
	DecisionID, RouteID, Generation, DecisionSource, LocalRole string

	// Hop, relay and lease references carried by the decision.
	PreviousHopID, NextHopID, SelectedRelayID, StaleRelayNodeID, RendezvousLeaseID string

	// Hop lists; stored as copies of the decision's slices.
	EffectiveHops, OriginalHops []string

	PathScore int

	// Lifecycle markers: Status is "active" or "withdrawn"; ApplyStatus is
	// "applied", "unchanged" or "not_active"; WithdrawStatus is
	// "not_withdrawn", "withdrawn" or "withdrawn_by_replacement".
	Status, ApplyStatus, WithdrawStatus string

	// AppliedAt is when the decision was first applied in its current form;
	// WithdrawnAt stays zero until the decision is withdrawn.
	AppliedAt, WithdrawnAt time.Time

	ControlPlaneOnly, ProductionForwarding bool
}
|
|
|
|
type meshRouteGenerationTracker struct {
|
|
Generation string
|
|
PreviousGeneration string
|
|
LastAppliedAt time.Time
|
|
LastChangedAt time.Time
|
|
LastAppliedCount int
|
|
LastWithdrawnCount int
|
|
LastUnchangedCount int
|
|
TotalAppliedCount int
|
|
TotalWithdrawnCount int
|
|
Active map[string]meshRouteGenerationDecisionState
|
|
Withdrawn []meshRouteGenerationDecisionState
|
|
}
|
|
|
|
func newMeshRouteGenerationTracker(report *client.RoutePathDecisionReport, observedAt time.Time) *meshRouteGenerationTracker {
|
|
tracker := &meshRouteGenerationTracker{
|
|
Active: map[string]meshRouteGenerationDecisionState{},
|
|
}
|
|
tracker.Apply(report, observedAt)
|
|
return tracker
|
|
}
|
|
|
|
func (t *meshRouteGenerationTracker) Apply(report *client.RoutePathDecisionReport, observedAt time.Time) {
|
|
if t == nil {
|
|
return
|
|
}
|
|
if observedAt.IsZero() {
|
|
observedAt = time.Now().UTC()
|
|
} else {
|
|
observedAt = observedAt.UTC()
|
|
}
|
|
nextGeneration := ""
|
|
decisions := []client.RoutePathDecision{}
|
|
if report != nil {
|
|
nextGeneration = strings.TrimSpace(report.Generation)
|
|
decisions = append(decisions, report.Decisions...)
|
|
}
|
|
t.PreviousGeneration = t.Generation
|
|
t.Generation = nextGeneration
|
|
t.LastAppliedAt = observedAt
|
|
t.LastAppliedCount = 0
|
|
t.LastWithdrawnCount = 0
|
|
t.LastUnchangedCount = 0
|
|
nextActive := map[string]meshRouteGenerationDecisionState{}
|
|
seen := map[string]struct{}{}
|
|
withdrawnRelayKeys := map[string]struct{}{}
|
|
for _, previous := range t.Withdrawn {
|
|
if key := routeGenerationWithdrawnRelayKey(previous); key != "" {
|
|
withdrawnRelayKeys[key] = struct{}{}
|
|
}
|
|
}
|
|
appliedReplacementDecisions := []client.RoutePathDecision{}
|
|
for _, decision := range decisions {
|
|
state := routeGenerationDecisionState(decision, observedAt)
|
|
key := routeGenerationDecisionKey(decision)
|
|
if key == "" {
|
|
continue
|
|
}
|
|
seen[key] = struct{}{}
|
|
if previous, ok := t.Active[key]; ok && routeGenerationDecisionSame(previous, decision) {
|
|
state.Status = "active"
|
|
state.ApplyStatus = "unchanged"
|
|
state.AppliedAt = previous.AppliedAt
|
|
t.LastUnchangedCount++
|
|
} else {
|
|
state.Status = "active"
|
|
state.ApplyStatus = "applied"
|
|
state.AppliedAt = observedAt
|
|
t.LastAppliedCount++
|
|
t.TotalAppliedCount++
|
|
if decision.DecisionSource == "stale_relay_replacement" && strings.TrimSpace(decision.StaleRelayNodeID) != "" {
|
|
appliedReplacementDecisions = append(appliedReplacementDecisions, decision)
|
|
}
|
|
}
|
|
nextActive[key] = state
|
|
}
|
|
for key, previous := range t.Active {
|
|
if _, ok := seen[key]; ok {
|
|
continue
|
|
}
|
|
previous.Status = "withdrawn"
|
|
previous.ApplyStatus = "not_active"
|
|
previous.WithdrawStatus = "withdrawn"
|
|
previous.WithdrawnAt = observedAt
|
|
t.Withdrawn = append([]meshRouteGenerationDecisionState{previous}, t.Withdrawn...)
|
|
if relayKey := routeGenerationWithdrawnRelayKey(previous); relayKey != "" {
|
|
withdrawnRelayKeys[relayKey] = struct{}{}
|
|
}
|
|
t.LastWithdrawnCount++
|
|
t.TotalWithdrawnCount++
|
|
}
|
|
for _, decision := range appliedReplacementDecisions {
|
|
relayKey := routeGenerationRouteRelayKey(decision.RouteID, decision.StaleRelayNodeID)
|
|
if relayKey == "" {
|
|
continue
|
|
}
|
|
if _, alreadyWithdrawn := withdrawnRelayKeys[relayKey]; alreadyWithdrawn {
|
|
continue
|
|
}
|
|
withdrawn := routeGenerationReplacementWithdrawnDecisionState(decision, observedAt)
|
|
t.Withdrawn = append([]meshRouteGenerationDecisionState{withdrawn}, t.Withdrawn...)
|
|
withdrawnRelayKeys[relayKey] = struct{}{}
|
|
t.LastWithdrawnCount++
|
|
t.TotalWithdrawnCount++
|
|
}
|
|
if len(t.Withdrawn) > maxMeshRendezvousLeaseReportEntries {
|
|
t.Withdrawn = t.Withdrawn[:maxMeshRendezvousLeaseReportEntries]
|
|
}
|
|
if t.LastAppliedCount > 0 || t.LastWithdrawnCount > 0 || t.PreviousGeneration != t.Generation {
|
|
t.LastChangedAt = observedAt
|
|
}
|
|
t.Active = nextActive
|
|
}
|
|
|
|
func routeGenerationDecisionState(decision client.RoutePathDecision, observedAt time.Time) meshRouteGenerationDecisionState {
|
|
return meshRouteGenerationDecisionState{
|
|
DecisionID: decision.DecisionID,
|
|
RouteID: decision.RouteID,
|
|
Generation: decision.Generation,
|
|
DecisionSource: decision.DecisionSource,
|
|
LocalRole: decision.LocalRole,
|
|
PreviousHopID: decision.PreviousHopID,
|
|
NextHopID: decision.NextHopID,
|
|
SelectedRelayID: decision.SelectedRelayID,
|
|
StaleRelayNodeID: decision.StaleRelayNodeID,
|
|
RendezvousLeaseID: decision.RendezvousLeaseID,
|
|
EffectiveHops: append([]string{}, decision.EffectiveHops...),
|
|
OriginalHops: append([]string{}, decision.OriginalHops...),
|
|
PathScore: decision.PathScore,
|
|
Status: "active",
|
|
ApplyStatus: "applied",
|
|
WithdrawStatus: "not_withdrawn",
|
|
AppliedAt: observedAt,
|
|
ControlPlaneOnly: decision.ControlPlaneOnly,
|
|
ProductionForwarding: decision.ProductionForwarding,
|
|
}
|
|
}
|
|
|
|
func routeGenerationReplacementWithdrawnDecisionState(decision client.RoutePathDecision, observedAt time.Time) meshRouteGenerationDecisionState {
|
|
withdrawnDecisionID := strings.TrimSpace(decision.RouteID) + "-path-withdrawn-stale-relay-" + strings.TrimSpace(decision.StaleRelayNodeID)
|
|
effectiveHops := append([]string{}, decision.OriginalHops...)
|
|
if len(effectiveHops) == 0 {
|
|
effectiveHops = append([]string{}, decision.EffectiveHops...)
|
|
}
|
|
return meshRouteGenerationDecisionState{
|
|
DecisionID: withdrawnDecisionID,
|
|
RouteID: decision.RouteID,
|
|
Generation: decision.Generation,
|
|
DecisionSource: "stale_relay_withdrawn",
|
|
LocalRole: decision.LocalRole,
|
|
PreviousHopID: decision.PreviousHopID,
|
|
NextHopID: decision.StaleRelayNodeID,
|
|
SelectedRelayID: decision.SelectedRelayID,
|
|
StaleRelayNodeID: decision.StaleRelayNodeID,
|
|
RendezvousLeaseID: decision.RendezvousLeaseID,
|
|
EffectiveHops: effectiveHops,
|
|
OriginalHops: append([]string{}, decision.OriginalHops...),
|
|
PathScore: decision.PathScore,
|
|
Status: "withdrawn",
|
|
ApplyStatus: "not_active",
|
|
WithdrawStatus: "withdrawn_by_replacement",
|
|
WithdrawnAt: observedAt,
|
|
ControlPlaneOnly: decision.ControlPlaneOnly,
|
|
ProductionForwarding: decision.ProductionForwarding,
|
|
}
|
|
}
|
|
|
|
func routeGenerationDecisionKey(decision client.RoutePathDecision) string {
|
|
if strings.TrimSpace(decision.DecisionID) != "" {
|
|
return strings.TrimSpace(decision.DecisionID)
|
|
}
|
|
if strings.TrimSpace(decision.RouteID) == "" {
|
|
return ""
|
|
}
|
|
return strings.TrimSpace(decision.RouteID) + "\x00" + strings.TrimSpace(decision.LocalNodeID)
|
|
}
|
|
|
|
// routeGenerationRouteRelayKey combines a route ID and a relay node ID into
// a single lookup key: both values trimmed and joined by a NUL byte. It
// returns "" when either value is blank after trimming.
func routeGenerationRouteRelayKey(routeID string, relayNodeID string) string {
	route, relay := strings.TrimSpace(routeID), strings.TrimSpace(relayNodeID)
	if route != "" && relay != "" {
		return strings.Join([]string{route, relay}, "\x00")
	}
	return ""
}
|
|
|
|
func routeGenerationWithdrawnRelayKey(state meshRouteGenerationDecisionState) string {
|
|
relayNodeID := strings.TrimSpace(state.StaleRelayNodeID)
|
|
if relayNodeID == "" {
|
|
relayNodeID = strings.TrimSpace(state.NextHopID)
|
|
}
|
|
return routeGenerationRouteRelayKey(state.RouteID, relayNodeID)
|
|
}
|
|
|
|
func routeGenerationDecisionSame(previous meshRouteGenerationDecisionState, decision client.RoutePathDecision) bool {
|
|
return previous.Generation == decision.Generation &&
|
|
previous.RouteID == decision.RouteID &&
|
|
previous.DecisionSource == decision.DecisionSource &&
|
|
previous.NextHopID == decision.NextHopID &&
|
|
previous.SelectedRelayID == decision.SelectedRelayID &&
|
|
strings.Join(previous.EffectiveHops, "\x00") == strings.Join(decision.EffectiveHops, "\x00")
|
|
}
|
|
|
|
func meshRouteGenerationReport(meshState *syntheticMeshState, identity state.Identity, observedAt time.Time) map[string]any {
|
|
observedAt = observedAt.UTC()
|
|
report := map[string]any{
|
|
"schema_version": meshRouteGenerationReportSchema,
|
|
"cluster_id": identity.ClusterID,
|
|
"node_id": identity.NodeID,
|
|
"config_source": "",
|
|
"config_version": "",
|
|
"generation": "",
|
|
"previous_generation": "",
|
|
"tracker_contract": "node_side_route_generation_apply_withdraw",
|
|
"control_plane_only": true,
|
|
"production_payload_forwarding": false,
|
|
"service_workload_traffic": false,
|
|
"route_path_forwarding_runtime": false,
|
|
"observed_at": observedAt.Format(time.RFC3339Nano),
|
|
"active_decision_count": 0,
|
|
"applied_decision_count": 0,
|
|
"unchanged_decision_count": 0,
|
|
"withdrawn_decision_count": 0,
|
|
"total_applied_decision_count": 0,
|
|
"total_withdrawn_decision_count": 0,
|
|
"generation_changed": false,
|
|
"active_decisions": []map[string]any{},
|
|
"withdrawn_decisions": []map[string]any{},
|
|
}
|
|
if meshState == nil || meshState.RouteGenerationTracker == nil {
|
|
return report
|
|
}
|
|
tracker := meshState.RouteGenerationTracker
|
|
report["config_source"] = meshState.Source
|
|
report["config_version"] = meshState.ConfigVersion
|
|
report["generation"] = tracker.Generation
|
|
report["previous_generation"] = tracker.PreviousGeneration
|
|
report["last_applied_at"] = formatOptionalTime(tracker.LastAppliedAt)
|
|
report["last_changed_at"] = formatOptionalTime(tracker.LastChangedAt)
|
|
report["active_decision_count"] = len(tracker.Active)
|
|
report["applied_decision_count"] = tracker.LastAppliedCount
|
|
report["unchanged_decision_count"] = tracker.LastUnchangedCount
|
|
report["withdrawn_decision_count"] = tracker.LastWithdrawnCount
|
|
report["total_applied_decision_count"] = tracker.TotalAppliedCount
|
|
report["total_withdrawn_decision_count"] = tracker.TotalWithdrawnCount
|
|
report["generation_changed"] = tracker.PreviousGeneration != tracker.Generation
|
|
report["active_decisions"] = routeGenerationDecisionDetails(tracker.activeList(), maxMeshRendezvousLeaseReportEntries)
|
|
report["withdrawn_decisions"] = routeGenerationDecisionDetails(tracker.Withdrawn, maxMeshRendezvousLeaseReportEntries)
|
|
report["truncated"] = len(tracker.Active) > maxMeshRendezvousLeaseReportEntries || len(tracker.Withdrawn) > maxMeshRendezvousLeaseReportEntries
|
|
return report
|
|
}
|
|
|
|
func (t *meshRouteGenerationTracker) activeList() []meshRouteGenerationDecisionState {
|
|
if t == nil {
|
|
return nil
|
|
}
|
|
out := make([]meshRouteGenerationDecisionState, 0, len(t.Active))
|
|
for _, state := range t.Active {
|
|
out = append(out, state)
|
|
}
|
|
sort.SliceStable(out, func(i, j int) bool {
|
|
if out[i].RouteID != out[j].RouteID {
|
|
return out[i].RouteID < out[j].RouteID
|
|
}
|
|
return out[i].DecisionID < out[j].DecisionID
|
|
})
|
|
return out
|
|
}
|
|
|
|
func routeGenerationDecisionDetails(states []meshRouteGenerationDecisionState, limit int) []map[string]any {
|
|
out := make([]map[string]any, 0, minInt(len(states), limit))
|
|
for index, state := range states {
|
|
if index >= limit {
|
|
break
|
|
}
|
|
out = append(out, map[string]any{
|
|
"decision_id": state.DecisionID,
|
|
"route_id": state.RouteID,
|
|
"generation": state.Generation,
|
|
"decision_source": state.DecisionSource,
|
|
"local_role": state.LocalRole,
|
|
"previous_hop_id": state.PreviousHopID,
|
|
"next_hop_id": state.NextHopID,
|
|
"selected_relay_id": state.SelectedRelayID,
|
|
"stale_relay_node_id": state.StaleRelayNodeID,
|
|
"rendezvous_lease_id": state.RendezvousLeaseID,
|
|
"effective_hops": append([]string{}, state.EffectiveHops...),
|
|
"original_hops": append([]string{}, state.OriginalHops...),
|
|
"path_score": state.PathScore,
|
|
"status": state.Status,
|
|
"apply_status": state.ApplyStatus,
|
|
"withdraw_status": state.WithdrawStatus,
|
|
"applied_at": formatOptionalTime(state.AppliedAt),
|
|
"withdrawn_at": formatOptionalTime(state.WithdrawnAt),
|
|
"control_plane_only": state.ControlPlaneOnly,
|
|
"production_forwarding": state.ProductionForwarding,
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func meshRouteHealthConfigReport(meshState *syntheticMeshState, identity state.Identity, observedAt time.Time) map[string]any {
|
|
observedAt = observedAt.UTC()
|
|
report := map[string]any{
|
|
"schema_version": meshRouteHealthConfigReportSchema,
|
|
"cluster_id": identity.ClusterID,
|
|
"node_id": identity.NodeID,
|
|
"config_source": "",
|
|
"config_version": "",
|
|
"route_health_config_contract": "control_plane_route_path_decisions_to_synthetic_route_health",
|
|
"control_plane_only": true,
|
|
"route_health_only": true,
|
|
"synthetic_route_health_route_path_runtime": true,
|
|
"production_route_path_forwarding_runtime": false,
|
|
"production_payload_forwarding": false,
|
|
"service_workload_traffic": false,
|
|
"test_service_route_config_changed": false,
|
|
"observed_at": observedAt.Format(time.RFC3339Nano),
|
|
"config_refresh_interval_ms": int64(meshSyntheticConfigRefreshInterval / time.Millisecond),
|
|
"feedback_refresh_backoff_ms": int64(meshRouteHealthFeedbackRefreshBackoff / time.Millisecond),
|
|
"base_route_count": 0,
|
|
"route_health_route_count": 0,
|
|
"route_path_decision_applied_count": 0,
|
|
"replacement_route_health_route_count": 0,
|
|
"route_health_decision_drift_candidate_count": 0,
|
|
"routes": []map[string]any{},
|
|
}
|
|
if meshState == nil {
|
|
return report
|
|
}
|
|
report["config_source"] = meshState.Source
|
|
report["config_version"] = meshState.ConfigVersion
|
|
if !meshState.LastConfigRefreshAt.IsZero() {
|
|
report["last_config_refresh_at"] = meshState.LastConfigRefreshAt.UTC().Format(time.RFC3339Nano)
|
|
}
|
|
report["base_route_count"] = len(meshState.Routes)
|
|
routes := meshState.RouteHealthRoutes
|
|
if len(routes) == 0 {
|
|
routes = meshState.Routes
|
|
}
|
|
report["route_health_route_count"] = len(routes)
|
|
decisionsByRoute := routePathDecisionsByRoute(meshState.RoutePathDecisions)
|
|
applied := 0
|
|
replacements := 0
|
|
driftCandidates := 0
|
|
details := make([]map[string]any, 0, minInt(len(routes), maxMeshRendezvousLeaseReportEntries))
|
|
for index, route := range routes {
|
|
decision, ok := decisionsByRoute[route.RouteID]
|
|
if ok {
|
|
applied++
|
|
if decision.DecisionSource == "stale_relay_replacement" {
|
|
replacements++
|
|
}
|
|
if !sameStringSlice(route.Hops, decision.EffectiveHops) {
|
|
driftCandidates++
|
|
}
|
|
}
|
|
if index >= maxMeshRendezvousLeaseReportEntries {
|
|
continue
|
|
}
|
|
item := map[string]any{
|
|
"route_id": route.RouteID,
|
|
"source_node_id": route.SourceNodeID,
|
|
"destination_node_id": route.DestinationNodeID,
|
|
"effective_hops": append([]string{}, route.Hops...),
|
|
"route_version": route.RouteVersion,
|
|
"policy_version": route.PolicyVersion,
|
|
"peer_directory_version": route.PeerDirectoryVersion,
|
|
"route_path_decision_applied": ok,
|
|
}
|
|
if ok {
|
|
item["route_path_decision_id"] = decision.DecisionID
|
|
item["route_path_decision_generation"] = decision.Generation
|
|
item["route_path_decision_source"] = decision.DecisionSource
|
|
item["selected_relay_id"] = decision.SelectedRelayID
|
|
item["stale_relay_node_id"] = decision.StaleRelayNodeID
|
|
item["next_hop_id"] = decision.NextHopID
|
|
item["original_hops"] = append([]string{}, decision.OriginalHops...)
|
|
}
|
|
details = append(details, item)
|
|
}
|
|
report["route_path_decision_applied_count"] = applied
|
|
report["replacement_route_health_route_count"] = replacements
|
|
report["route_health_decision_drift_candidate_count"] = driftCandidates
|
|
report["feedback_refresh_attempt_count"] = meshState.RouteHealthRefreshAttempts
|
|
report["feedback_refresh_success_count"] = meshState.RouteHealthRefreshSuccesses
|
|
report["feedback_refresh_failure_count"] = meshState.RouteHealthRefreshFailures
|
|
report["feedback_refresh_suppressed_count"] = meshState.RouteHealthRefreshSuppressed
|
|
report["routes"] = details
|
|
report["truncated"] = len(routes) > maxMeshRendezvousLeaseReportEntries
|
|
return report
|
|
}
|
|
|
|
func meshRouteHealthFeedbackRefreshReport(meshState *syntheticMeshState, identity state.Identity, observedAt time.Time) map[string]any {
|
|
observedAt = observedAt.UTC()
|
|
report := map[string]any{
|
|
"schema_version": meshRouteHealthFeedbackRefreshSchema,
|
|
"cluster_id": identity.ClusterID,
|
|
"node_id": identity.NodeID,
|
|
"refresh_contract": "route_health_feedback_to_node_scoped_synthetic_config_get",
|
|
"control_plane_only": true,
|
|
"route_health_only": true,
|
|
"production_payload_forwarding": false,
|
|
"service_workload_traffic": false,
|
|
"feedback_refresh_backoff_ms": int64(meshRouteHealthFeedbackRefreshBackoff / time.Millisecond),
|
|
"feedback_refresh_supported": false,
|
|
"feedback_refresh_attempt_count": 0,
|
|
"feedback_refresh_success_count": 0,
|
|
"feedback_refresh_failure_count": 0,
|
|
"feedback_refresh_suppressed_count": 0,
|
|
"last_feedback_refresh_status": "",
|
|
"last_feedback_refresh_reason": "",
|
|
"last_feedback_refresh_error": "",
|
|
"last_feedback_refresh_route_id": "",
|
|
"last_feedback_refresh_peer_node_id": "",
|
|
"last_feedback_refresh_selected_relay": "",
|
|
"observed_at": observedAt.Format(time.RFC3339Nano),
|
|
}
|
|
if meshState == nil {
|
|
return report
|
|
}
|
|
report["config_source"] = meshState.Source
|
|
report["config_version"] = meshState.ConfigVersion
|
|
report["feedback_refresh_supported"] = meshState.Source == "control_plane"
|
|
report["feedback_refresh_attempt_count"] = meshState.RouteHealthRefreshAttempts
|
|
report["feedback_refresh_success_count"] = meshState.RouteHealthRefreshSuccesses
|
|
report["feedback_refresh_failure_count"] = meshState.RouteHealthRefreshFailures
|
|
report["feedback_refresh_suppressed_count"] = meshState.RouteHealthRefreshSuppressed
|
|
if meshState.LastRouteHealthRefresh == nil {
|
|
return report
|
|
}
|
|
last := meshState.LastRouteHealthRefresh
|
|
report["last_feedback_refresh_status"] = last.Status
|
|
report["last_feedback_refresh_reason"] = last.Reason
|
|
report["last_feedback_refresh_error"] = last.Error
|
|
report["last_feedback_refresh_route_id"] = last.RouteID
|
|
report["last_feedback_refresh_peer_node_id"] = last.PeerNodeID
|
|
report["last_feedback_refresh_selected_relay"] = last.SelectedRelayID
|
|
report["last_feedback_refresh_link_status"] = last.LinkStatus
|
|
report["last_feedback_refresh_failure_reason"] = last.FailureReason
|
|
report["last_feedback_refresh_drift_detected"] = last.DriftDetected
|
|
report["last_feedback_refresh_attempted_at"] = formatOptionalTime(last.AttemptedAt)
|
|
report["last_feedback_refresh_completed_at"] = formatOptionalTime(last.CompletedAt)
|
|
report["last_feedback_refresh_previous_config_version"] = last.PreviousConfigVersion
|
|
report["last_feedback_refresh_refreshed_config_version"] = last.RefreshedConfigVersion
|
|
report["last_feedback_refresh_previous_route_health_route_count"] = last.PreviousRouteHealthRouteCount
|
|
report["last_feedback_refresh_refreshed_route_health_route_count"] = last.RefreshedRouteHealthRouteCount
|
|
return report
|
|
}
|
|
|
|
func meshRoutePathDecisionReport(meshState *syntheticMeshState, identity state.Identity, observedAt time.Time) map[string]any {
|
|
observedAt = observedAt.UTC()
|
|
report := map[string]any{
|
|
"schema_version": meshRoutePathDecisionReportSchema,
|
|
"cluster_id": identity.ClusterID,
|
|
"node_id": identity.NodeID,
|
|
"config_source": "",
|
|
"config_version": "",
|
|
"decision_contract": "control_plane_route_path_decisions",
|
|
"control_plane_only": true,
|
|
"production_payload_forwarding": false,
|
|
"service_workload_traffic": false,
|
|
"route_path_forwarding_runtime": false,
|
|
"observed_at": observedAt.Format(time.RFC3339Nano),
|
|
"decision_count": 0,
|
|
"replacement_decision_count": 0,
|
|
"local_effective_path_count": 0,
|
|
"withdrawn_local_relay_count": 0,
|
|
"selected_local_relay_count": 0,
|
|
"next_hop_available_count": 0,
|
|
"decisions": []map[string]any{},
|
|
}
|
|
if meshState == nil {
|
|
return report
|
|
}
|
|
report["config_source"] = meshState.Source
|
|
report["config_version"] = meshState.ConfigVersion
|
|
decisionReport := meshState.RoutePathDecisions
|
|
if decisionReport == nil {
|
|
return report
|
|
}
|
|
report["control_plane_schema_version"] = decisionReport.SchemaVersion
|
|
report["decision_mode"] = decisionReport.DecisionMode
|
|
report["generation"] = decisionReport.Generation
|
|
report["decision_count"] = decisionReport.DecisionCount
|
|
report["replacement_decision_count"] = decisionReport.ReplacementDecisionCount
|
|
report["degraded_decision_count"] = decisionReport.DegradedDecisionCount
|
|
report["rebuild_request_count"] = decisionReport.RebuildRequestCount
|
|
report["rebuild_applied_count"] = decisionReport.RebuildAppliedCount
|
|
report["control_plane_report_only"] = decisionReport.ControlPlaneOnly
|
|
report["control_plane_report_production_forwarding"] = decisionReport.ProductionForwarding
|
|
decisions := make([]map[string]any, 0, minInt(len(decisionReport.Decisions), maxMeshRendezvousLeaseReportEntries))
|
|
localEffective := 0
|
|
withdrawnLocalRelay := 0
|
|
selectedLocalRelay := 0
|
|
nextHopAvailable := 0
|
|
for index, decision := range decisionReport.Decisions {
|
|
if containsString(decision.EffectiveHops, identity.NodeID) {
|
|
localEffective++
|
|
}
|
|
if decision.LocalRole == "withdrawn_relay" {
|
|
withdrawnLocalRelay++
|
|
}
|
|
if decision.LocalRole == "selected_relay" {
|
|
selectedLocalRelay++
|
|
}
|
|
if strings.TrimSpace(decision.NextHopID) != "" {
|
|
nextHopAvailable++
|
|
}
|
|
if index >= maxMeshRendezvousLeaseReportEntries {
|
|
continue
|
|
}
|
|
decisions = append(decisions, map[string]any{
|
|
"decision_id": decision.DecisionID,
|
|
"route_id": decision.RouteID,
|
|
"replacement_route_id": decision.ReplacementRouteID,
|
|
"rebuild_request_id": decision.RebuildRequestID,
|
|
"rebuild_status": decision.RebuildStatus,
|
|
"rebuild_reason": decision.RebuildReason,
|
|
"rebuild_attempt": decision.RebuildAttempt,
|
|
"source_node_id": decision.SourceNodeID,
|
|
"destination_node_id": decision.DestinationNodeID,
|
|
"original_hops": append([]string{}, decision.OriginalHops...),
|
|
"effective_hops": append([]string{}, decision.EffectiveHops...),
|
|
"previous_hop_id": decision.PreviousHopID,
|
|
"next_hop_id": decision.NextHopID,
|
|
"local_role": decision.LocalRole,
|
|
"selected_relay_id": decision.SelectedRelayID,
|
|
"selected_relay_endpoint": decision.SelectedRelayEndpoint,
|
|
"stale_relay_node_id": decision.StaleRelayNodeID,
|
|
"rendezvous_peer_node_id": decision.RendezvousPeerNodeID,
|
|
"rendezvous_lease_id": decision.RendezvousLeaseID,
|
|
"rendezvous_lease_reason": decision.RendezvousLeaseReason,
|
|
"decision_source": decision.DecisionSource,
|
|
"generation": decision.Generation,
|
|
"path_score": decision.PathScore,
|
|
"score_reasons": append([]string{}, decision.ScoreReasons...),
|
|
"control_plane_only": decision.ControlPlaneOnly,
|
|
"production_forwarding": decision.ProductionForwarding,
|
|
"expires_at": formatOptionalTime(decision.ExpiresAt),
|
|
})
|
|
}
|
|
report["local_effective_path_count"] = localEffective
|
|
report["withdrawn_local_relay_count"] = withdrawnLocalRelay
|
|
report["selected_local_relay_count"] = selectedLocalRelay
|
|
report["next_hop_available_count"] = nextHopAvailable
|
|
report["truncated"] = len(decisionReport.Decisions) > maxMeshRendezvousLeaseReportEntries
|
|
report["decisions"] = decisions
|
|
return report
|
|
}
|
|
|
|
func meshRendezvousLeaseBaseValid(lease mesh.PeerRendezvousLease) bool {
|
|
return strings.TrimSpace(lease.LeaseID) != "" &&
|
|
strings.TrimSpace(lease.PeerNodeID) != "" &&
|
|
strings.TrimSpace(lease.RelayNodeID) != "" &&
|
|
strings.TrimSpace(lease.RelayEndpoint) != "" &&
|
|
!lease.ExpiresAt.IsZero() &&
|
|
lease.ControlPlaneOnly
|
|
}
|
|
|
|
func meshRendezvousConnectionsByPeer(meshState *syntheticMeshState) map[string]mesh.PeerConnectionState {
|
|
out := map[string]mesh.PeerConnectionState{}
|
|
if meshState == nil || meshState.PeerConnections == nil {
|
|
return out
|
|
}
|
|
for _, entry := range meshState.PeerConnections.Snapshot().Entries {
|
|
if strings.TrimSpace(entry.NodeID) != "" {
|
|
out[entry.NodeID] = entry
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func meshRendezvousLeaseRenewalNeeded(lease mesh.PeerRendezvousLease, observedAt time.Time, usable bool) bool {
|
|
if !usable {
|
|
return false
|
|
}
|
|
ttlRemaining := lease.ExpiresAt.Sub(observedAt)
|
|
if ttlRemaining <= meshRendezvousLeaseRenewalWindow {
|
|
return true
|
|
}
|
|
renewalAfter := meshRendezvousLeaseRenewalAfter(lease)
|
|
return !renewalAfter.IsZero() && !renewalAfter.After(observedAt)
|
|
}
|
|
|
|
func meshRendezvousLeaseStaleRelay(lease mesh.PeerRendezvousLease, connection mesh.PeerConnectionState) bool {
|
|
if strings.TrimSpace(lease.LeaseID) == "" || strings.TrimSpace(connection.NodeID) == "" {
|
|
return false
|
|
}
|
|
if !meshRendezvousLeaseMatchesConnection(lease, connection) {
|
|
return false
|
|
}
|
|
switch connection.State {
|
|
case mesh.PeerConnectionBackoff:
|
|
return true
|
|
case mesh.PeerConnectionDegraded:
|
|
return connection.ConsecutiveFailures > 0
|
|
case mesh.PeerConnectionWaiting:
|
|
return connection.RendezvousLeaseID == lease.LeaseID && connection.LastFailureReason != ""
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
func meshRendezvousLeaseMatchesConnection(lease mesh.PeerRendezvousLease, connection mesh.PeerConnectionState) bool {
|
|
if connection.RendezvousLeaseID != "" && connection.RendezvousLeaseID != lease.LeaseID {
|
|
return false
|
|
}
|
|
if connection.RelayNodeID != "" && connection.RelayNodeID != lease.RelayNodeID {
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
func meshRendezvousLeaseRole(lease mesh.PeerRendezvousLease, localNodeID string) string {
|
|
localNodeID = strings.TrimSpace(localNodeID)
|
|
switch {
|
|
case localNodeID == "" || (lease.PeerNodeID != localNodeID && lease.RelayNodeID != localNodeID):
|
|
return "entry_or_observer"
|
|
case lease.PeerNodeID == localNodeID && lease.RelayNodeID == localNodeID:
|
|
return "self"
|
|
case lease.RelayNodeID == localNodeID:
|
|
return "relay"
|
|
case lease.PeerNodeID == localNodeID:
|
|
return "peer"
|
|
default:
|
|
return "entry_or_observer"
|
|
}
|
|
}
|
|
|
|
// meshRendezvousLeaseStatus maps lease flags to a status string. The checks
// are ordered by precedence: "invalid", then "expired", then
// "renewal_needed"; otherwise the relay role yields "admitted" and any other
// role yields "active".
func meshRendezvousLeaseStatus(valid bool, expired bool, renewalNeeded bool, role string) string {
	if !valid {
		return "invalid"
	}
	if expired {
		return "expired"
	}
	if renewalNeeded {
		return "renewal_needed"
	}
	if role == "relay" {
		return "admitted"
	}
	return "active"
}
|
|
|
|
func meshRendezvousLeaseRenewalAfter(lease mesh.PeerRendezvousLease) time.Time {
|
|
if lease.ExpiresAt.IsZero() {
|
|
return time.Time{}
|
|
}
|
|
if lease.IssuedAt.IsZero() || !lease.ExpiresAt.After(lease.IssuedAt) {
|
|
return lease.ExpiresAt.Add(-meshRendezvousLeaseRenewalWindow).UTC()
|
|
}
|
|
ttl := lease.ExpiresAt.Sub(lease.IssuedAt)
|
|
return lease.IssuedAt.Add(ttl * 2 / 3).UTC()
|
|
}
|
|
|
|
// formatOptionalTime renders value as an RFC 3339 timestamp with nanosecond
// precision in UTC, or as the empty string when value is the zero time.
func formatOptionalTime(value time.Time) string {
	var formatted string
	if !value.IsZero() {
		formatted = value.UTC().Format(time.RFC3339Nano)
	}
	return formatted
}
|
|
|
|
func advertisedEndpointCandidates(cfg config.Config, identity state.Identity, meshState *syntheticMeshState, observedAt time.Time) ([]mesh.PeerEndpointCandidate, error) {
|
|
var candidates []mesh.PeerEndpointCandidate
|
|
if cfg.MeshAdvertiseEndpointsJSON != "" {
|
|
if err := json.Unmarshal([]byte(cfg.MeshAdvertiseEndpointsJSON), &candidates); err != nil {
|
|
return nil, fmt.Errorf("parse RAP_MESH_ADVERTISE_ENDPOINTS_JSON: %w", err)
|
|
}
|
|
}
|
|
if cfg.MeshAdvertiseEndpoint != "" {
|
|
candidates = append(candidates, mesh.PeerEndpointCandidate{
|
|
EndpointID: identity.NodeID + "-advertised",
|
|
NodeID: identity.NodeID,
|
|
Transport: cfg.MeshAdvertiseTransport,
|
|
Address: cfg.MeshAdvertiseEndpoint,
|
|
Reachability: reachabilityFromConnectivityMode(cfg.MeshConnectivityMode),
|
|
NATType: cfg.MeshNATType,
|
|
ConnectivityMode: cfg.MeshConnectivityMode,
|
|
Region: cfg.MeshRegion,
|
|
Priority: 10,
|
|
})
|
|
}
|
|
if cfg.MeshQUICFabricEnabled && meshState != nil && strings.TrimSpace(meshState.QUICFabricListenAddr) != "" {
|
|
candidates = append(candidates, mesh.PeerEndpointCandidate{
|
|
EndpointID: identity.NodeID + "-quic-fabric",
|
|
NodeID: identity.NodeID,
|
|
Transport: "direct_quic",
|
|
Address: "quic://" + meshState.QUICFabricListenAddr,
|
|
Reachability: reachabilityFromConnectivityMode(cfg.MeshConnectivityMode),
|
|
NATType: cfg.MeshNATType,
|
|
ConnectivityMode: cfg.MeshConnectivityMode,
|
|
Region: cfg.MeshRegion,
|
|
Priority: 5,
|
|
PolicyTags: []string{"fast-path"},
|
|
Metadata: quicFabricEndpointMetadata(meshState.QUICFabricCertSHA256),
|
|
})
|
|
}
|
|
candidates = append(candidates, interfaceEndpointCandidates(cfg, identity, meshState, observedAt)...)
|
|
for i := range candidates {
|
|
if candidates[i].EndpointID == "" {
|
|
candidates[i].EndpointID = fmt.Sprintf("%s-advertised-%d", identity.NodeID, i+1)
|
|
}
|
|
if candidates[i].NodeID == "" {
|
|
candidates[i].NodeID = identity.NodeID
|
|
}
|
|
if candidates[i].NodeID != identity.NodeID || strings.TrimSpace(candidates[i].Address) == "" {
|
|
return nil, fmt.Errorf("invalid advertised mesh endpoint candidate")
|
|
}
|
|
candidates[i].Address = strings.TrimRight(strings.TrimSpace(candidates[i].Address), "/")
|
|
if candidates[i].Transport == "" {
|
|
candidates[i].Transport = defaultString(cfg.MeshAdvertiseTransport, "direct_tcp_tls")
|
|
}
|
|
if candidates[i].ConnectivityMode == "" {
|
|
candidates[i].ConnectivityMode = defaultString(cfg.MeshConnectivityMode, "direct")
|
|
}
|
|
if candidates[i].Reachability == "" {
|
|
candidates[i].Reachability = reachabilityFromConnectivityMode(candidates[i].ConnectivityMode)
|
|
}
|
|
if candidates[i].NATType == "" {
|
|
candidates[i].NATType = defaultString(cfg.MeshNATType, "unknown")
|
|
}
|
|
if candidates[i].Region == "" {
|
|
candidates[i].Region = cfg.MeshRegion
|
|
}
|
|
if candidates[i].Priority <= 0 {
|
|
candidates[i].Priority = 10 + i
|
|
}
|
|
candidates[i].LastVerifiedAt = &observedAt
|
|
if candidates[i].Metadata == nil {
|
|
metadata, err := json.Marshal(map[string]any{
|
|
"source": "node-agent-heartbeat",
|
|
"runtime": "c17z7",
|
|
"synthetic_runtime": cfg.MeshSyntheticRuntimeEnabled,
|
|
"production_forwarding": cfg.MeshProductionForwardingEnabled,
|
|
"vpn_fabric_session": cfg.VPNFabricSessionTransportEnabled,
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
candidates[i].Metadata = metadata
|
|
}
|
|
}
|
|
sort.SliceStable(candidates, func(i, j int) bool {
|
|
if candidates[i].Priority == candidates[j].Priority {
|
|
return candidates[i].EndpointID < candidates[j].EndpointID
|
|
}
|
|
return candidates[i].Priority < candidates[j].Priority
|
|
})
|
|
return candidates, nil
|
|
}
|
|
|
|
// quicFabricEndpointMetadata returns a JSON object carrying the trimmed
// certificate fingerprint under "tls_cert_sha256". It returns nil when the
// fingerprint is blank or cannot be encoded.
func quicFabricEndpointMetadata(certSHA256 string) json.RawMessage {
	fingerprint := strings.TrimSpace(certSHA256)
	if fingerprint == "" {
		return nil
	}
	encoded, err := json.Marshal(struct {
		TLSCertSHA256 string `json:"tls_cert_sha256"`
	}{TLSCertSHA256: fingerprint})
	if err != nil {
		return nil
	}
	return encoded
}
|
|
|
|
func interfaceEndpointCandidates(cfg config.Config, identity state.Identity, meshState *syntheticMeshState, observedAt time.Time) []mesh.PeerEndpointCandidate {
|
|
if meshState == nil {
|
|
return nil
|
|
}
|
|
report := meshState.ListenerReport
|
|
if report.Status != "listening" && report.Status != "auto_rebound" {
|
|
return nil
|
|
}
|
|
if cfg.MeshConnectivityMode == "outbound_only" {
|
|
return nil
|
|
}
|
|
port := listenerPort(report.EffectiveListenAddr, report.ConfiguredListenAddr, cfg.MeshListenAddr)
|
|
if port == "" {
|
|
return nil
|
|
}
|
|
interfaces, err := net.Interfaces()
|
|
if err != nil {
|
|
log.Printf("mesh interface discovery skipped: %v", err)
|
|
return nil
|
|
}
|
|
var candidates []mesh.PeerEndpointCandidate
|
|
for _, iface := range interfaces {
|
|
if iface.Flags&net.FlagUp == 0 || iface.Flags&net.FlagLoopback != 0 {
|
|
continue
|
|
}
|
|
interfaceType := classifyNetworkInterface(iface.Name)
|
|
if interfaceType == "container" {
|
|
continue
|
|
}
|
|
addrs, err := iface.Addrs()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
for _, addr := range addrs {
|
|
ip := ipFromAddr(addr)
|
|
if ip == nil || ip.IsLoopback() || ip.IsUnspecified() || ip.IsMulticast() || ip.IsLinkLocalMulticast() || ip.IsLinkLocalUnicast() {
|
|
continue
|
|
}
|
|
addressFamily := "ipv6"
|
|
if ip.To4() != nil {
|
|
addressFamily = "ipv4"
|
|
}
|
|
reachability := "public"
|
|
connectivityMode := defaultString(cfg.MeshConnectivityMode, "direct")
|
|
if ip.IsPrivate() || ip.IsLinkLocalUnicast() {
|
|
reachability = "private"
|
|
if connectivityMode == "direct" {
|
|
connectivityMode = "private_lan"
|
|
}
|
|
}
|
|
metadata, _ := json.Marshal(map[string]any{
|
|
"source": "node-agent-interface-discovery",
|
|
"runtime": "c17z24",
|
|
"interface_name": iface.Name,
|
|
"interface_index": iface.Index,
|
|
"interface_type": interfaceType,
|
|
"listen_effective_addr": report.EffectiveListenAddr,
|
|
"listen_configured_addr": report.ConfiguredListenAddr,
|
|
"loopback_filtered": true,
|
|
"link_local_filtered": true,
|
|
"container_iface_filtered": true,
|
|
"operator_override_allowed": true,
|
|
"observed_at": observedAt.UTC().Format(time.RFC3339Nano),
|
|
})
|
|
candidates = append(candidates, mesh.PeerEndpointCandidate{
|
|
EndpointID: fmt.Sprintf("%s-if-%s-%s-%s", identity.NodeID, safeEndpointIDPart(iface.Name), safeEndpointIDPart(ip.String()), addressFamily),
|
|
NodeID: identity.NodeID,
|
|
Transport: defaultString(cfg.MeshAdvertiseTransport, "direct_http"),
|
|
Address: endpointAddress(defaultString(cfg.MeshAdvertiseTransport, "direct_http"), ip, port),
|
|
AddressFamily: addressFamily,
|
|
Reachability: reachability,
|
|
NATType: defaultString(cfg.MeshNATType, "unknown"),
|
|
ConnectivityMode: connectivityMode,
|
|
Region: cfg.MeshRegion,
|
|
Priority: endpointPriority(reachability, addressFamily, interfaceType, len(candidates)),
|
|
PolicyTags: []string{"auto_discovered", "non_loopback", interfaceType},
|
|
LastVerifiedAt: &observedAt,
|
|
Metadata: metadata,
|
|
})
|
|
}
|
|
}
|
|
return candidates
|
|
}
|
|
|
|
// classifyNetworkInterface buckets an interface name (case-insensitive,
// surrounding whitespace ignored) into "container", "vpn", "physical" or
// "unknown" using name prefixes, plus substring matches for tailscale and
// zerotier. Categories are tested in that order, so the first match wins.
func classifyNetworkInterface(name string) string {
	lowered := strings.ToLower(strings.TrimSpace(name))
	startsWithAny := func(prefixes ...string) bool {
		for _, prefix := range prefixes {
			if strings.HasPrefix(lowered, prefix) {
				return true
			}
		}
		return false
	}

	if startsWithAny("docker", "br-", "veth", "virbr", "cni", "flannel", "calico", "kube") {
		return "container"
	}
	if startsWithAny("tun", "tap", "wg", "zt") ||
		strings.Contains(lowered, "tailscale") ||
		strings.Contains(lowered, "zerotier") {
		return "vpn"
	}
	if startsWithAny("eth", "ens", "eno", "enp", "wlan", "wl", "bond") {
		return "physical"
	}
	return "unknown"
}
|
|
|
|
// listenerPort returns the port of the first address in addrs that parses
// as "host:port" with a non-empty port, or "" when none does. Blank entries
// are skipped and surrounding whitespace is ignored.
//
// net.SplitHostPort already accepts the host-less ":port" form, so no
// separate prefix-stripping fallback is needed. The previous fallback only
// fired for input SplitHostPort had rejected and returned garbage for it,
// e.g. ":1" for the bare IPv6 address "::1". Such entries are now skipped so
// a later, well-formed address can supply the port.
func listenerPort(addrs ...string) string {
	for _, addr := range addrs {
		addr = strings.TrimSpace(addr)
		if addr == "" {
			continue
		}
		if _, port, err := net.SplitHostPort(addr); err == nil && port != "" {
			return port
		}
	}
	return ""
}
|
|
|
|
// ipFromAddr extracts the IP from a net.Addr. *net.IPNet and *net.IPAddr
// yield their IP field directly. Any other address type is parsed from its
// string form, with a "host:port" suffix stripped when present. The result
// is nil when that string is not an IP.
func ipFromAddr(addr net.Addr) net.IP {
	if network, ok := addr.(*net.IPNet); ok {
		return network.IP
	}
	if plain, ok := addr.(*net.IPAddr); ok {
		return plain.IP
	}
	text := addr.String()
	if host, _, err := net.SplitHostPort(text); err == nil {
		text = host
	}
	return net.ParseIP(text)
}
|
|
|
|
// endpointAddress formats "scheme://host:port" for an advertised endpoint.
// The scheme is "wss" for the "wss" transport, "https" for "https" and
// "direct_https", and "http" for everything else (transport is compared
// case-insensitively with surrounding whitespace ignored). Addresses without
// an IPv4 form are wrapped in square brackets.
func endpointAddress(transport string, ip net.IP, port string) string {
	var scheme string
	switch strings.ToLower(strings.TrimSpace(transport)) {
	case "wss":
		scheme = "wss"
	case "https", "direct_https":
		scheme = "https"
	default:
		scheme = "http"
	}

	var address strings.Builder
	address.WriteString(scheme)
	address.WriteString("://")
	if ip.To4() == nil {
		address.WriteByte('[')
		address.WriteString(ip.String())
		address.WriteByte(']')
	} else {
		address.WriteString(ip.String())
	}
	address.WriteByte(':')
	address.WriteString(port)
	return address.String()
}
|
|
|
|
// endpointPriority computes a candidate's priority value as a sum of:
//   - reachability: 20 for "public", 30 for "private", 40 otherwise;
//   - interface type: +0 for "vpn", +5 for "physical", +10 otherwise;
//   - address family: +20 for "ipv6";
//   - the caller-supplied offset.
func endpointPriority(reachability string, addressFamily string, interfaceType string, offset int) int {
	var priority int
	switch reachability {
	case "public":
		priority = 20
	case "private":
		priority = 30
	default:
		priority = 40
	}

	switch interfaceType {
	case "vpn":
		// No penalty.
	case "physical":
		priority += 5
	default:
		priority += 10
	}

	if addressFamily == "ipv6" {
		priority += 20
	}
	return priority + offset
}
|
|
|
|
// safeEndpointIDPart turns value into an identifier fragment made only of
// lowercase ASCII letters, digits and single dashes. Every run of other
// characters collapses to one dash, and dashes never appear at either end.
// A value with no usable characters becomes "iface".
func safeEndpointIDPart(value string) string {
	var id strings.Builder
	separatorPending := false
	for _, r := range strings.ToLower(strings.TrimSpace(value)) {
		isAlnum := (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9')
		if !isAlnum {
			separatorPending = true
			continue
		}
		// Emit the dash lazily so none is written at the start or the end.
		if separatorPending && id.Len() > 0 {
			id.WriteByte('-')
		}
		separatorPending = false
		id.WriteRune(r)
	}
	if id.Len() == 0 {
		return "iface"
	}
	return id.String()
}
|
|
|
|
// defaultString returns value unchanged unless it is empty or whitespace
// only, in which case fallback is returned.
func defaultString(value string, fallback string) string {
	if strings.TrimSpace(value) != "" {
		return value
	}
	return fallback
}
|
|
|
|
// containsString reports whether value occurs in items, comparing both
// sides with surrounding whitespace removed.
func containsString(items []string, value string) bool {
	wanted := strings.TrimSpace(value)
	for index := 0; index < len(items); index++ {
		if strings.TrimSpace(items[index]) != wanted {
			continue
		}
		return true
	}
	return false
}
|
|
|
|
// sameStringSlice reports whether both slices have the same length and hold
// equal elements at every position, ignoring surrounding whitespace.
func sameStringSlice(left []string, right []string) bool {
	if len(left) != len(right) {
		return false
	}
	for position, leftItem := range left {
		if strings.TrimSpace(leftItem) != strings.TrimSpace(right[position]) {
			return false
		}
	}
	return true
}
|
|
|
|
// sameStringMap reports whether both maps hold exactly the same keys with
// the same values. A nil map and an empty map compare equal.
//
// The lookup uses the comma-ok form: a plain right[key] returns "" for a
// missing key, which made maps such as {"a": ""} and {"b": ""} compare
// equal even though their key sets differ.
func sameStringMap(left map[string]string, right map[string]string) bool {
	if len(left) != len(right) {
		return false
	}
	for key, leftValue := range left {
		rightValue, present := right[key]
		if !present || rightValue != leftValue {
			return false
		}
	}
	return true
}
|
|
|
|
// copyStringMap returns an independent copy of values. The result is never
// nil: a nil or empty input yields an empty map.
func copyStringMap(values map[string]string) map[string]string {
	clone := make(map[string]string, len(values))
	for key := range values {
		clone[key] = values[key]
	}
	return clone
}
|
|
|
|
func samePeerEndpointCandidatesMap(left map[string][]mesh.PeerEndpointCandidate, right map[string][]mesh.PeerEndpointCandidate) bool {
|
|
if len(left) != len(right) {
|
|
return false
|
|
}
|
|
for key, leftValues := range left {
|
|
rightValues, ok := right[key]
|
|
if !ok || len(leftValues) != len(rightValues) {
|
|
return false
|
|
}
|
|
for index := range leftValues {
|
|
if leftValues[index].EndpointID != rightValues[index].EndpointID ||
|
|
leftValues[index].Transport != rightValues[index].Transport ||
|
|
leftValues[index].Address != rightValues[index].Address ||
|
|
leftValues[index].Priority != rightValues[index].Priority {
|
|
return false
|
|
}
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
func copyPeerEndpointCandidatesMap(values map[string][]mesh.PeerEndpointCandidate) map[string][]mesh.PeerEndpointCandidate {
|
|
if len(values) == 0 {
|
|
return map[string][]mesh.PeerEndpointCandidate{}
|
|
}
|
|
out := make(map[string][]mesh.PeerEndpointCandidate, len(values))
|
|
for nodeID, candidates := range values {
|
|
if len(candidates) == 0 {
|
|
out[nodeID] = nil
|
|
continue
|
|
}
|
|
out[nodeID] = append([]mesh.PeerEndpointCandidate(nil), candidates...)
|
|
}
|
|
return out
|
|
}
|
|
|
|
func copyEndpointCandidateObservations(values map[string]mesh.EndpointCandidateHealthObservation) map[string]mesh.EndpointCandidateHealthObservation {
|
|
if len(values) == 0 {
|
|
return map[string]mesh.EndpointCandidateHealthObservation{}
|
|
}
|
|
out := make(map[string]mesh.EndpointCandidateHealthObservation, len(values))
|
|
for endpointID, observation := range values {
|
|
out[endpointID] = observation
|
|
}
|
|
return out
|
|
}
|
|
|
|
// minInt returns the smaller of its two arguments.
func minInt(left, right int) int {
	smaller := right
	if left < right {
		smaller = left
	}
	return smaller
}
|
|
|
|
// reachabilityFromConnectivityMode maps a connectivity mode to the
// reachability label advertised for it. Modes not listed here, including
// the empty string, map to "unknown". The match is exact and case-sensitive.
func reachabilityFromConnectivityMode(connectivityMode string) string {
	if connectivityMode == "outbound_only" {
		return "outbound_only"
	}
	if connectivityMode == "relay_required" {
		return "relay"
	}
	if connectivityMode == "private_lan" {
		return "private"
	}
	if connectivityMode == "direct" {
		return "public"
	}
	return "unknown"
}
|
|
|
|
func reportWorkloadStatus(ctx context.Context, api *client.Client, supervisor supervisor.Supervisor, identity state.Identity, meshState *syntheticMeshState) error {
|
|
desired, err := api.DesiredWorkloads(ctx, identity.ClusterID, identity.NodeID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
statuses, err := supervisor.Apply(ctx, desired)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
enrichWorkloadStatuses(statuses, desired, meshState)
|
|
for i, status := range statuses {
|
|
if i >= len(desired) {
|
|
break
|
|
}
|
|
if err := api.ReportWorkloadStatus(ctx, identity.ClusterID, identity.NodeID, desired[i].ServiceType, status); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
if len(statuses) > 0 {
|
|
log.Printf("workload status reported: count=%d", len(statuses))
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func enrichWorkloadStatuses(statuses []client.WorkloadStatusRequest, desired []client.DesiredWorkload, meshState *syntheticMeshState) {
|
|
if meshState == nil || meshState.RemoteWorkspaceFrameSink == nil {
|
|
return
|
|
}
|
|
sinkReport := meshState.RemoteWorkspaceFrameSink.Report(time.Now().UTC())
|
|
for i := range statuses {
|
|
if i >= len(desired) {
|
|
return
|
|
}
|
|
if strings.TrimSpace(desired[i].ServiceType) != "rdp-worker" {
|
|
continue
|
|
}
|
|
if statuses[i].StatusPayload == nil {
|
|
statuses[i].StatusPayload = map[string]any{}
|
|
}
|
|
statuses[i].StatusPayload["remote_workspace_adapter_sink"] = sinkReport
|
|
}
|
|
}
|
|
|
|
func reportVPNAssignmentStatus(ctx context.Context, api *client.Client, identity state.Identity, gateway *vpnruntime.Gateway) error {
|
|
assignments, err := api.NodeVPNAssignments(ctx, identity.ClusterID, identity.NodeID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
for _, assignment := range assignments {
|
|
status := "lease_required"
|
|
reason := "eligible_candidate_waiting_for_active_lease"
|
|
runtimeAvailable := false
|
|
packetForwarding := false
|
|
runtimeError := ""
|
|
if assignment.ActiveLease != nil && assignment.ActiveLease.OwnerNodeID == identity.NodeID {
|
|
running, lastErr := gateway.Status()
|
|
runtimeAvailable = running
|
|
packetForwarding = running
|
|
runtimeError = lastErr
|
|
if running {
|
|
status = "assigned"
|
|
reason = "active_lease_owned_by_local_node"
|
|
} else {
|
|
status = "blocked"
|
|
reason = "vpn_gateway_runtime_unavailable"
|
|
if runtimeError == "" {
|
|
runtimeError = "vpn gateway runtime is not running"
|
|
}
|
|
}
|
|
}
|
|
if assignment.DesiredState != "enabled" {
|
|
status = "blocked"
|
|
reason = "vpn_connection_disabled"
|
|
}
|
|
payload := map[string]any{
|
|
"schema_version": "rap.node_vpn_assignment_status.v1",
|
|
"assignment_reason": assignment.AssignmentReason,
|
|
"protocol_family": assignment.ProtocolFamily,
|
|
"runtime_available": runtimeAvailable,
|
|
"packet_forwarding": packetForwarding,
|
|
"reason": reason,
|
|
"native_vpn_runtime_note": "experimental packet tunnel runtime is enabled for active linux gateway leases",
|
|
"gateway_interface": "rapvpn0",
|
|
"gateway_vpn_cidr": "10.77.0.0/24",
|
|
"relay_transport": "not_active_owner",
|
|
}
|
|
if dnsServers := vpnAssignmentDNSServers(assignment); len(dnsServers) > 0 {
|
|
payload["exit_dns_servers"] = dnsServers
|
|
}
|
|
if runtimeError != "" {
|
|
payload["runtime_error"] = runtimeError
|
|
}
|
|
if assignment.ActiveLease != nil && assignment.ActiveLease.OwnerNodeID == identity.NodeID {
|
|
gatewayRuntime := gateway.Snapshot()
|
|
payload["gateway_runtime"] = gatewayRuntime
|
|
if transport, ok := gatewayRuntime["transport"].(string); ok && strings.TrimSpace(transport) != "" {
|
|
payload["relay_transport"] = transport
|
|
}
|
|
}
|
|
if assignment.ActiveLease != nil {
|
|
payload["active_lease_id"] = assignment.ActiveLease.LeaseID
|
|
payload["lease_generation"] = assignment.ActiveLease.LeaseGeneration
|
|
payload["lease_expires_at"] = assignment.ActiveLease.ExpiresAt
|
|
}
|
|
if err := api.ReportNodeVPNAssignmentStatus(ctx, identity.ClusterID, identity.NodeID, assignment.VPNConnectionID, client.NodeVPNAssignmentStatusRequest{
|
|
ObservedStatus: status,
|
|
StatusPayload: payload,
|
|
ObservedAt: time.Now().UTC(),
|
|
}); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
if len(assignments) > 0 {
|
|
log.Printf("vpn assignment status reported: count=%d", len(assignments))
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func exitDNSServers() []string {
|
|
if configured := parseDNSServerList(os.Getenv("RAP_VPN_EXIT_DNS_SERVERS")); len(configured) > 0 {
|
|
return configured
|
|
}
|
|
if configured := parseDNSServerList(os.Getenv("RAP_EXIT_DNS_SERVERS")); len(configured) > 0 {
|
|
return configured
|
|
}
|
|
if runtime.GOOS == "windows" {
|
|
return windowsExitDNSServers()
|
|
}
|
|
seen := map[string]bool{}
|
|
var out []string
|
|
for _, path := range []string{
|
|
"/run/systemd/resolve/resolv.conf",
|
|
"/etc/resolv.conf",
|
|
"/run/systemd/resolve/stub-resolv.conf",
|
|
} {
|
|
data, err := os.ReadFile(path)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
for _, line := range strings.Split(string(data), "\n") {
|
|
fields := strings.Fields(line)
|
|
if len(fields) < 2 || fields[0] != "nameserver" {
|
|
continue
|
|
}
|
|
server := strings.TrimSpace(fields[1])
|
|
ip := net.ParseIP(server)
|
|
if ip == nil || ip.IsLoopback() || ip.IsUnspecified() || ip.IsLinkLocalUnicast() {
|
|
continue
|
|
}
|
|
if seen[server] {
|
|
continue
|
|
}
|
|
seen[server] = true
|
|
out = append(out, server)
|
|
}
|
|
if len(out) > 0 {
|
|
break
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func vpnAssignmentDNSServers(assignment client.NodeVPNAssignment) []string {
|
|
if servers := exitDNSServers(); len(servers) > 0 {
|
|
return servers
|
|
}
|
|
for _, raw := range []json.RawMessage{assignment.RoutePolicy, assignment.TargetEndpoint} {
|
|
if servers := dnsServersFromRawPolicy(raw); len(servers) > 0 {
|
|
return servers
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func windowsExitDNSServers() []string {
|
|
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
|
|
defer cancel()
|
|
output, err := exec.CommandContext(ctx, "netsh", "interface", "ip", "show", "dnsservers").CombinedOutput()
|
|
if err != nil || len(output) == 0 {
|
|
return nil
|
|
}
|
|
return parseDNSServerList(string(output))
|
|
}
|
|
|
|
func dnsServersFromRawPolicy(raw json.RawMessage) []string {
|
|
var payload map[string]json.RawMessage
|
|
if len(raw) == 0 || json.Unmarshal(raw, &payload) != nil {
|
|
return nil
|
|
}
|
|
for _, key := range []string{"dns_servers", "exit_dns_servers"} {
|
|
var values []string
|
|
if item, ok := payload[key]; ok && json.Unmarshal(item, &values) == nil {
|
|
if servers := normalizeDNSServers(values); len(servers) > 0 {
|
|
return servers
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// normalizeDNSServers filters values down to unique, usable DNS server
// addresses, preserving input order.
//
// Each value is whitespace-trimmed and must parse as an IP address. Loopback,
// unspecified, and link-local unicast addresses are dropped, as are repeats
// of an already accepted (trimmed) string. The returned slice is never nil.
func normalizeDNSServers(values []string) []string {
	accepted := make(map[string]struct{}, len(values))
	servers := make([]string, 0, len(values))
	for i := range values {
		address := strings.TrimSpace(values[i])
		if _, duplicate := accepted[address]; duplicate {
			continue
		}
		parsed := net.ParseIP(address)
		if parsed == nil {
			continue
		}
		if parsed.IsLoopback() || parsed.IsUnspecified() || parsed.IsLinkLocalUnicast() {
			continue
		}
		accepted[address] = struct{}{}
		servers = append(servers, address)
	}
	return servers
}
|
|
|
|
// parseDNSServerList splits value on commas, semicolons, spaces, tabs, and
// line breaks, and returns the unique tokens that parse as usable DNS server
// addresses, in order of first appearance.
//
// Tokens that are not IP addresses are ignored, as are loopback, unspecified,
// and link-local unicast addresses. The result is nil when nothing qualifies.
func parseDNSServerList(value string) []string {
	isSeparator := func(r rune) bool {
		return strings.ContainsRune(",; \t\n\r", r)
	}

	accepted := make(map[string]struct{})
	var servers []string
	for _, token := range strings.FieldsFunc(value, isSeparator) {
		address := strings.TrimSpace(token)
		if _, duplicate := accepted[address]; duplicate {
			continue
		}
		parsed := net.ParseIP(address)
		if parsed == nil {
			continue
		}
		if parsed.IsLoopback() || parsed.IsUnspecified() || parsed.IsLinkLocalUnicast() {
			continue
		}
		accepted[address] = struct{}{}
		servers = append(servers, address)
	}
	return servers
}
|
|
|
|
func ensureVPNGatewayRuntime(ctx context.Context, api *client.Client, cfg config.Config, identity state.Identity, gateway *vpnruntime.Gateway, meshState *syntheticMeshState) error {
|
|
assignments, err := api.NodeVPNAssignments(ctx, identity.ClusterID, identity.NodeID)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
activeOwner := false
|
|
for _, assignment := range assignments {
|
|
if assignment.AssignmentReason == "eligible_candidate" && assignment.DesiredState == "enabled" {
|
|
if !vpnAssignmentLeaseAutoAcquireAllowed(identity.NodeID, assignment) {
|
|
log.Printf("vpn assignment lease auto-acquire skipped: vpn_connection_id=%s reason=local_node_is_not_selected_exit", assignment.VPNConnectionID)
|
|
continue
|
|
}
|
|
lease, err := api.AcquireNodeVPNAssignmentLease(ctx, identity.ClusterID, identity.NodeID, assignment.VPNConnectionID, client.NodeVPNAssignmentLeaseAcquireRequest{
|
|
TTLSeconds: 300,
|
|
Metadata: map[string]any{
|
|
"reason": "node_agent_auto_acquire",
|
|
"node_id": identity.NodeID,
|
|
"agent": "rap-node-agent",
|
|
"acquired_at": time.Now().UTC().Format(time.RFC3339Nano),
|
|
},
|
|
})
|
|
if err != nil {
|
|
log.Printf("vpn assignment lease auto-acquire skipped: vpn_connection_id=%s error=%v", assignment.VPNConnectionID, err)
|
|
} else if lease != nil {
|
|
assignment.AssignmentReason = "active_owner"
|
|
assignment.ActiveLease = lease
|
|
log.Printf("vpn assignment lease auto-acquired: vpn_connection_id=%s lease_id=%s", assignment.VPNConnectionID, lease.LeaseID)
|
|
}
|
|
}
|
|
if assignment.AssignmentReason != "active_owner" {
|
|
continue
|
|
}
|
|
if assignment.ActiveLease == nil || assignment.ActiveLease.OwnerNodeID != identity.NodeID {
|
|
continue
|
|
}
|
|
activeOwner = true
|
|
gateway.ClusterID = identity.ClusterID
|
|
gateway.VPNConnectionID = assignment.VPNConnectionID
|
|
gateway.InterfaceName = "rapvpn0"
|
|
gateway.AddressCIDR = "10.77.0.1/24"
|
|
gateway.RouteCIDR = "10.77.0.0/24"
|
|
gateway.PollTimeout = 25 * time.Second
|
|
if transport := fabricGatewayTransportForAssignment(ctx, cfg, identity, assignment, meshState, api); transport != nil {
|
|
if _, ok := gateway.Transport.(vpnruntime.BackendPacketTransport); ok {
|
|
gateway.Stop()
|
|
}
|
|
gateway.Transport = transport
|
|
} else if transport := localGatewayTransportForAssignment(identity, assignment, meshState, api); transport != nil {
|
|
if _, ok := gateway.Transport.(vpnruntime.BackendPacketTransport); ok {
|
|
gateway.Stop()
|
|
}
|
|
gateway.Transport = transport
|
|
} else if _, ok := gateway.Transport.(*vpnruntime.FabricPacketTransport); ok {
|
|
gateway.Stop()
|
|
gateway.Transport = nil
|
|
} else if _, ok := gateway.Transport.(*vpnruntime.AdaptivePacketTransport); ok {
|
|
gateway.Stop()
|
|
gateway.Transport = nil
|
|
} else {
|
|
gateway.Stop()
|
|
gateway.Transport = nil
|
|
log.Printf("vpn gateway runtime skipped: vpn_connection_id=%s reason=fabric_packet_transport_unavailable", assignment.VPNConnectionID)
|
|
return nil
|
|
}
|
|
if err := gateway.EnsureStarted(ctx); err != nil {
|
|
return err
|
|
}
|
|
if err := renewOwnedVPNLease(ctx, api, identity, assignment); err != nil {
|
|
return err
|
|
}
|
|
log.Printf("vpn gateway runtime ensured: vpn_connection_id=%s interface=%s", assignment.VPNConnectionID, gateway.InterfaceName)
|
|
return nil
|
|
}
|
|
if !activeOwner {
|
|
gateway.Stop()
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func vpnAssignmentLeaseAutoAcquireAllowed(localNodeID string, assignment client.NodeVPNAssignment) bool {
|
|
localNodeID = strings.TrimSpace(localNodeID)
|
|
if localNodeID == "" {
|
|
return false
|
|
}
|
|
var policy struct {
|
|
ExitNodeID string `json:"exit_node_id"`
|
|
ExitNodeIDs []string `json:"exit_node_ids"`
|
|
}
|
|
if len(assignment.PlacementPolicy) == 0 || json.Unmarshal(assignment.PlacementPolicy, &policy) != nil {
|
|
return true
|
|
}
|
|
if exitNodeID := strings.TrimSpace(policy.ExitNodeID); exitNodeID != "" {
|
|
return exitNodeID == localNodeID
|
|
}
|
|
for _, exitNodeID := range policy.ExitNodeIDs {
|
|
if strings.TrimSpace(exitNodeID) == localNodeID {
|
|
return true
|
|
}
|
|
}
|
|
return len(policy.ExitNodeIDs) == 0
|
|
}
|
|
|
|
func localGatewayTransportForAssignment(identity state.Identity, assignment client.NodeVPNAssignment, meshState *syntheticMeshState, _ *client.Client) vpnruntime.PacketTransport {
|
|
if meshState == nil || meshState.VPNFabricInbox == nil || assignment.VPNConnectionID == "" {
|
|
return nil
|
|
}
|
|
return &vpnruntime.LocalPacketTransport{
|
|
Inbox: meshState.VPNFabricInbox,
|
|
VPNConnectionID: assignment.VPNConnectionID,
|
|
}
|
|
}
|
|
|
|
func fabricGatewayTransportForAssignment(ctx context.Context, cfg config.Config, identity state.Identity, assignment client.NodeVPNAssignment, meshState *syntheticMeshState, _ *client.Client) vpnruntime.PacketTransport {
|
|
if meshState == nil || meshState.ProductionForwardTransport == nil || meshState.VPNFabricInbox == nil {
|
|
return nil
|
|
}
|
|
route, nextHop, ok := selectVPNPacketRoute(meshState.Routes, identity.ClusterID, identity.NodeID)
|
|
if !ok {
|
|
return nil
|
|
}
|
|
if cfg.VPNFabricSessionTransportEnabled {
|
|
if transport := fabricSessionGatewayTransportForAssignment(ctx, identity, assignment, meshState, nextHop); transport != nil {
|
|
return transport
|
|
}
|
|
}
|
|
return &vpnruntime.FabricPacketTransport{
|
|
ForwardTransport: meshState.ProductionForwardTransport,
|
|
Inbox: meshState.VPNFabricInbox,
|
|
ClusterID: identity.ClusterID,
|
|
VPNConnectionID: assignment.VPNConnectionID,
|
|
RouteID: route.RouteID,
|
|
LocalNodeID: identity.NodeID,
|
|
RemoteNodeID: route.DestinationNodeID,
|
|
NextHopNodeID: nextHop,
|
|
RoutePath: route.Hops,
|
|
SendDirection: vpnruntime.FabricDirectionGatewayToClient,
|
|
ReceiveDirection: vpnruntime.FabricDirectionClientToGateway,
|
|
}
|
|
}
|
|
|
|
func fabricSessionGatewayTransportForAssignment(ctx context.Context, identity state.Identity, assignment client.NodeVPNAssignment, meshState *syntheticMeshState, nextHop string) vpnruntime.PacketTransport {
|
|
if meshState == nil || meshState.VPNFabricInbox == nil || assignment.VPNConnectionID == "" || nextHop == "" {
|
|
return nil
|
|
}
|
|
targets := vpnFabricSessionTargets(meshState, nextHop)
|
|
if len(targets) == 0 {
|
|
log.Printf("vpn fabric session transport skipped: vpn_connection_id=%s next_hop=%s reason=peer_endpoint_missing", assignment.VPNConnectionID, nextHop)
|
|
return nil
|
|
}
|
|
if meshState.VPNFabricSessionDialStats == nil {
|
|
meshState.VPNFabricSessionDialStats = newVPNFabricSessionDialStats()
|
|
}
|
|
if meshState.VPNFabricEndpointObservations == nil {
|
|
meshState.VPNFabricEndpointObservations = newVPNFabricEndpointObservationStore(identity.NodeID)
|
|
}
|
|
meshState.VPNFabricSessionDialStats.Attempts.Add(1)
|
|
if meshState.VPNFabricSessionPeers == nil {
|
|
meshState.VPNFabricSessionPeers = mesh.NewFabricSessionPeerManager()
|
|
}
|
|
if meshState.VPNFabricTransport == nil {
|
|
meshState.VPNFabricTransport = mesh.NewWebSocketFabricTransport(meshState.VPNFabricSessionPeers)
|
|
}
|
|
token := fabricSessionGatewayToken(identity, assignment, nextHop)
|
|
for index, target := range targets {
|
|
startedAt := time.Now()
|
|
dialCtx, cancel := context.WithTimeout(ctx, 3*time.Second)
|
|
target.PeerID = nextHop
|
|
target.Token = token
|
|
target.Timeout = 3 * time.Second
|
|
target.OutboundBuffer = 256
|
|
target.InboundBuffer = 256
|
|
target.ErrorBuffer = 16
|
|
carrier, selectedTarget, err := mesh.FabricTransportForTarget(target, meshState.VPNFabricTransport, meshState.VPNFabricQUICTransport)
|
|
if err != nil {
|
|
cancel()
|
|
meshState.VPNFabricSessionDialStats.ObserveCandidateFailure("transport_select_failed")
|
|
meshState.VPNFabricEndpointObservations.ObserveFailure(target.EndpointID, "transport_select_failed")
|
|
log.Printf("vpn fabric session candidate skipped: vpn_connection_id=%s next_hop=%s candidate=%d endpoint=%s transport=%s reason=transport_select_failed error=%v", assignment.VPNConnectionID, nextHop, index, target.Endpoint, target.Transport, err)
|
|
continue
|
|
}
|
|
session, err := carrier.Connect(dialCtx, selectedTarget)
|
|
if err != nil {
|
|
cancel()
|
|
reason := fabricSessionOpenFailureReason(err)
|
|
if reason == "capacity_limited" {
|
|
meshState.VPNFabricSessionDialStats.ObserveCapacityLimited(selectedTarget)
|
|
meshState.VPNFabricEndpointObservations.ObserveCapacity(selectedTarget.EndpointID)
|
|
} else {
|
|
meshState.VPNFabricSessionDialStats.ObserveCandidateFailure(reason)
|
|
meshState.VPNFabricEndpointObservations.ObserveFailure(selectedTarget.EndpointID, reason)
|
|
}
|
|
log.Printf("vpn fabric session candidate skipped: vpn_connection_id=%s next_hop=%s candidate=%d endpoint=%s transport=%s reason=%s error=%v", assignment.VPNConnectionID, nextHop, index, selectedTarget.Endpoint, selectedTarget.Transport, reason, err)
|
|
continue
|
|
}
|
|
streamID := uint64(time.Now().UnixNano())
|
|
if streamID == 0 {
|
|
streamID = 1
|
|
}
|
|
if err := session.Send(dialCtx, fabricproto.Frame{
|
|
Type: fabricproto.FrameOpenStream,
|
|
StreamID: streamID,
|
|
TrafficClass: fabricproto.TrafficClassInteractive,
|
|
}); err != nil {
|
|
cancel()
|
|
_ = session.Close()
|
|
meshState.VPNFabricSessionDialStats.ObserveCandidateFailure("stream_open_failed")
|
|
meshState.VPNFabricEndpointObservations.ObserveFailure(selectedTarget.EndpointID, "stream_open_failed")
|
|
log.Printf("vpn fabric session candidate skipped: vpn_connection_id=%s next_hop=%s candidate=%d endpoint=%s transport=%s reason=stream_open_failed error=%v", assignment.VPNConnectionID, nextHop, index, selectedTarget.Endpoint, selectedTarget.Transport, err)
|
|
continue
|
|
}
|
|
cancel()
|
|
meshState.VPNFabricSessionDialStats.ObserveSelected(selectedTarget)
|
|
meshState.VPNFabricEndpointObservations.ObserveSuccess(selectedTarget.EndpointID, time.Since(startedAt))
|
|
log.Printf("vpn fabric session transport selected: vpn_connection_id=%s next_hop=%s candidate=%d endpoint=%s transport=%s pinned_cert=%t fallback_candidates=%d", assignment.VPNConnectionID, nextHop, index, selectedTarget.Endpoint, selectedTarget.Transport, selectedTarget.PeerCertSHA256 != "", len(targets)-index-1)
|
|
return &vpnruntime.FabricSessionPacketTransport{
|
|
Sender: session,
|
|
Receiver: session,
|
|
Inbox: meshState.VPNFabricInbox,
|
|
StreamID: streamID,
|
|
VPNConnectionID: assignment.VPNConnectionID,
|
|
SendDirection: vpnruntime.FabricDirectionGatewayToClient,
|
|
ReceiveDirection: vpnruntime.FabricDirectionClientToGateway,
|
|
TrafficClass: vpnruntime.FabricTrafficClassInteractive,
|
|
}
|
|
}
|
|
meshState.VPNFabricSessionDialStats.ObserveAllCandidatesFailed()
|
|
log.Printf("vpn fabric session transport skipped: vpn_connection_id=%s next_hop=%s reason=all_candidates_failed candidates=%d", assignment.VPNConnectionID, nextHop, len(targets))
|
|
return nil
|
|
}
|
|
|
|
func fabricSessionOpenFailureReason(err error) string {
|
|
if err == nil {
|
|
return ""
|
|
}
|
|
if errors.Is(err, mesh.ErrQUICFabricStreamLimitReached) {
|
|
return "capacity_limited"
|
|
}
|
|
return "session_open_failed"
|
|
}
|
|
|
|
func vpnFabricSessionTarget(meshState *syntheticMeshState, nextHop string) (mesh.FabricTransportTarget, bool) {
|
|
targets := vpnFabricSessionTargets(meshState, nextHop)
|
|
if len(targets) == 0 {
|
|
return mesh.FabricTransportTarget{}, false
|
|
}
|
|
return targets[0], true
|
|
}
|
|
|
|
func vpnFabricSessionTargets(meshState *syntheticMeshState, nextHop string) []mesh.FabricTransportTarget {
|
|
if meshState == nil {
|
|
return nil
|
|
}
|
|
out := make([]mesh.FabricTransportTarget, 0, len(meshState.PeerEndpointCandidates[nextHop])+1)
|
|
seen := map[string]struct{}{}
|
|
if candidates := meshState.PeerEndpointCandidates[nextHop]; len(candidates) > 0 {
|
|
ranked := mesh.RankPeerEndpointCandidates(candidates, mesh.EndpointCandidateScoreOptions{
|
|
ChannelClass: mesh.SyntheticChannelFabricControl,
|
|
Now: time.Now().UTC(),
|
|
MaxVerificationAge: 5 * time.Minute,
|
|
Observations: mergedEndpointCandidateObservations(meshState.PeerEndpointObservations, meshState.VPNFabricEndpointObservations.Snapshot()),
|
|
MaxObservationAge: 5 * time.Minute,
|
|
})
|
|
for _, item := range ranked {
|
|
endpoint := strings.TrimRight(strings.TrimSpace(item.Candidate.Address), "/")
|
|
if endpoint == "" {
|
|
continue
|
|
}
|
|
key := item.Candidate.Transport + "\x00" + endpoint
|
|
if _, ok := seen[key]; ok {
|
|
continue
|
|
}
|
|
seen[key] = struct{}{}
|
|
out = append(out, mesh.FabricTransportTarget{
|
|
EndpointID: item.Candidate.EndpointID,
|
|
Endpoint: endpoint,
|
|
Transport: item.Candidate.Transport,
|
|
PeerCertSHA256: endpointCandidateTLSCertSHA256(item.Candidate),
|
|
})
|
|
}
|
|
}
|
|
endpoint := strings.TrimRight(strings.TrimSpace(meshState.PeerEndpoints[nextHop]), "/")
|
|
if endpoint != "" {
|
|
key := "\x00" + endpoint
|
|
if _, ok := seen[key]; !ok {
|
|
out = append(out, mesh.FabricTransportTarget{Endpoint: endpoint})
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func mergedEndpointCandidateObservations(remote map[string]mesh.EndpointCandidateHealthObservation, local map[string]mesh.EndpointCandidateHealthObservation) map[string]mesh.EndpointCandidateHealthObservation {
|
|
if len(remote) == 0 && len(local) == 0 {
|
|
return nil
|
|
}
|
|
out := make(map[string]mesh.EndpointCandidateHealthObservation, len(remote)+len(local))
|
|
for endpointID, observation := range remote {
|
|
out[endpointID] = observation
|
|
}
|
|
for endpointID, observation := range local {
|
|
if existing, ok := out[endpointID]; ok && !observation.ObservedAt.IsZero() && !existing.ObservedAt.IsZero() && existing.ObservedAt.After(observation.ObservedAt) {
|
|
continue
|
|
}
|
|
out[endpointID] = observation
|
|
}
|
|
return out
|
|
}
|
|
|
|
func endpointCandidateTLSCertSHA256(candidate mesh.PeerEndpointCandidate) string {
|
|
if len(candidate.Metadata) == 0 {
|
|
return ""
|
|
}
|
|
var metadata struct {
|
|
TLSCertSHA256 string `json:"tls_cert_sha256"`
|
|
}
|
|
if err := json.Unmarshal(candidate.Metadata, &metadata); err != nil {
|
|
return ""
|
|
}
|
|
return strings.TrimSpace(metadata.TLSCertSHA256)
|
|
}
|
|
|
|
func fabricSessionGatewayToken(identity state.Identity, assignment client.NodeVPNAssignment, nextHop string) string {
|
|
tokenParts := []string{
|
|
"rap_fsn_vpn",
|
|
strings.ReplaceAll(identity.NodeID, "-", "_"),
|
|
strings.ReplaceAll(nextHop, "-", "_"),
|
|
strings.ReplaceAll(assignment.VPNConnectionID, "-", "_"),
|
|
}
|
|
return strings.Join(tokenParts, "_")
|
|
}
|
|
|
|
func selectVPNPacketRoute(routes []mesh.SyntheticRoute, clusterID string, localNodeID string) (mesh.SyntheticRoute, string, bool) {
|
|
now := time.Now().UTC()
|
|
for _, route := range routes {
|
|
if route.ClusterID != clusterID || route.SourceNodeID != localNodeID || !containsString(route.AllowedChannels, mesh.ProductionChannelVPNPacket) {
|
|
continue
|
|
}
|
|
if !route.ExpiresAt.IsZero() && !route.ExpiresAt.After(now) {
|
|
continue
|
|
}
|
|
nextHop := nextRouteHop(route.Hops, localNodeID, route.DestinationNodeID)
|
|
if nextHop == "" || nextHop == localNodeID {
|
|
continue
|
|
}
|
|
return route, nextHop, true
|
|
}
|
|
return mesh.SyntheticRoute{}, "", false
|
|
}
|
|
|
|
// nextRouteHop returns the node that follows localNodeID on path.
//
// The first occurrence of localNodeID decides the result: the following
// element, or localNodeID itself when it is the last element. When path is
// empty or does not contain localNodeID, destinationNodeID is returned.
func nextRouteHop(path []string, localNodeID string, destinationNodeID string) string {
	for position := range path {
		if path[position] != localNodeID {
			continue
		}
		if following := position + 1; following < len(path) {
			return path[following]
		}
		// The local node terminates the path; there is nowhere to forward to.
		return localNodeID
	}
	return destinationNodeID
}
|
|
|
|
func renewOwnedVPNLease(ctx context.Context, api *client.Client, identity state.Identity, assignment client.NodeVPNAssignment) error {
|
|
if assignment.ActiveLease == nil || assignment.ActiveLease.OwnerNodeID != identity.NodeID {
|
|
return nil
|
|
}
|
|
if err := api.RenewNodeVPNAssignmentLease(ctx, identity.ClusterID, identity.NodeID, assignment.VPNConnectionID, assignment.ActiveLease.LeaseID, client.NodeVPNAssignmentLeaseRenewRequest{
|
|
TTLSeconds: 300,
|
|
}); err != nil {
|
|
return err
|
|
}
|
|
log.Printf("vpn lease renewed: vpn_connection_id=%s lease_id=%s ttl_seconds=300", assignment.VPNConnectionID, assignment.ActiveLease.LeaseID)
|
|
return nil
|
|
}
|