рабочий вариант, но скорость 10 Мбит/с
build / backend (push) Has been cancelled
build / node-agent (push) Has been cancelled
build / worker (push) Has been cancelled

This commit is contained in:
2026-05-22 21:46:49 +03:00
parent 469fa0e860
commit 20d361a886
280 changed files with 954890 additions and 18524 deletions
@@ -7,7 +7,7 @@ import (
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
const Version = "0.2.321-directreadytarget"
const Version = "0.2.372-vpn-opaque-channel"
func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) client.EnrollRequest {
return client.EnrollRequest{
+29 -367
View File
@@ -1,22 +1,11 @@
package client
import (
"bytes"
"context"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
)
// Client is an HTTP client bound to a single backend base URL.
type Client struct {
// baseURL is prepended verbatim to every request path.
baseURL string
// httpClient performs all outgoing requests.
httpClient *http.Client
}
// Client is a field-less client type; it holds no base URL or HTTP client.
type Client struct{}
type RawControlRequest struct {
Method string `json:"method"`
@@ -45,19 +34,19 @@ type EnrollResponse struct {
JoinRequest json.RawMessage `json:"join_request"`
}
type EnrollmentBootstrapRequest struct {
type EnrollmentJoinRequest struct {
ClusterID string `json:"cluster_id"`
NodeFingerprint string `json:"node_fingerprint"`
PublicKey string `json:"public_key"`
}
type EnrollmentBootstrapResponse struct {
Status string `json:"status"`
JoinRequest json.RawMessage `json:"join_request"`
Bootstrap *NodeBootstrap `json:"node_bootstrap,omitempty"`
type EnrollmentJoinResponse struct {
Status string `json:"status"`
JoinRequest json.RawMessage `json:"join_request"`
JoinContract *NodeJoinContract `json:"node_join,omitempty"`
}
type NodeBootstrap struct {
type NodeJoinContract struct {
NodeID string `json:"node_id"`
ClusterID string `json:"cluster_id"`
IdentityStatus string `json:"identity_status"`
@@ -84,15 +73,19 @@ type HeartbeatResponse struct {
}
type NodeUpdateHint struct {
SchemaVersion string `json:"schema_version"`
Generation string `json:"generation,omitempty"`
CheckNow bool `json:"check_now"`
Products []string `json:"products,omitempty"`
Reason string `json:"reason,omitempty"`
DeliveryMode string `json:"delivery_mode,omitempty"`
SubscriptionStatus string `json:"subscription_status,omitempty"`
UpdateService *NodeUpdateServiceAssignment `json:"update_service,omitempty"`
FallbackPollSeconds int `json:"fallback_poll_seconds,omitempty"`
SchemaVersion string `json:"schema_version"`
Generation string `json:"generation,omitempty"`
CheckNow bool `json:"check_now"`
Products []string `json:"products,omitempty"`
TargetVersions map[string]string `json:"target_versions,omitempty"`
Reason string `json:"reason,omitempty"`
DeliveryMode string `json:"delivery_mode,omitempty"`
SubscriptionStatus string `json:"subscription_status,omitempty"`
UpdateService *NodeUpdateServiceAssignment `json:"update_service,omitempty"`
UpdateServiceCandidates []NodeUpdateServiceAssignment `json:"update_service_candidates,omitempty"`
RescuePollSeconds int `json:"rescue_poll_seconds,omitempty"`
AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"`
AuthoritySignature *ClusterSignature `json:"authority_signature,omitempty"`
}
type NodeUpdateServiceAssignment struct {
@@ -207,6 +200,13 @@ type NodeVPNAssignmentLease struct {
}
type NodeVPNAssignment struct {
TunnelID string `json:"tunnel_id,omitempty"`
PoolID string `json:"pool_id,omitempty"`
ServiceID string `json:"service_id,omitempty"`
LocalServiceID string `json:"local_service_id,omitempty"`
RemoteServiceID string `json:"remote_service_id,omitempty"`
ServiceKind string `json:"service_kind,omitempty"`
ServiceClass string `json:"service_class,omitempty"`
VPNConnectionID string `json:"vpn_connection_id"`
ClusterID string `json:"cluster_id"`
OrganizationID string `json:"organization_id"`
@@ -624,6 +624,7 @@ type EndpointCandidateHealthObservation struct {
EndpointID string `json:"endpoint_id"`
Source string `json:"source,omitempty"`
ReporterNodeID string `json:"reporter_node_id,omitempty"`
ReporterRegion string `json:"reporter_region,omitempty"`
LastLatencyMs int64 `json:"last_latency_ms,omitempty"`
SuccessCount uint64 `json:"success_count,omitempty"`
FailureCount uint64 `json:"failure_count,omitempty"`
@@ -632,343 +633,4 @@ type EndpointCandidateHealthObservation struct {
ObservedAt time.Time `json:"observed_at,omitempty"`
}
// New builds a Client that sends requests to baseURL through a dedicated
// http.Client with a 15 second overall request timeout.
func New(baseURL string) *Client {
	transport := &http.Client{Timeout: 15 * time.Second}
	return &Client{baseURL: baseURL, httpClient: transport}
}
// Enroll posts request as JSON to /node-agents/enroll and returns the decoded
// response. On any error the zero EnrollResponse is returned.
func (c *Client) Enroll(ctx context.Context, request EnrollRequest) (EnrollResponse, error) {
	var enrolled EnrollResponse
	err := c.postJSON(ctx, "/node-agents/enroll", request, &enrolled)
	if err != nil {
		return EnrollResponse{}, err
	}
	return enrolled, nil
}
// BootstrapEnrollment posts request as JSON to the bootstrap endpoint of the
// enrollment identified by joinRequestID and returns the decoded response.
// On any error the zero EnrollmentBootstrapResponse is returned.
func (c *Client) BootstrapEnrollment(ctx context.Context, joinRequestID string, request EnrollmentBootstrapRequest) (EnrollmentBootstrapResponse, error) {
	endpoint := "/node-agents/enrollments/" + joinRequestID + "/bootstrap"
	var result EnrollmentBootstrapResponse
	err := c.postJSON(ctx, endpoint, request, &result)
	if err != nil {
		return EnrollmentBootstrapResponse{}, err
	}
	return result, nil
}
// Heartbeat posts request as JSON to the heartbeats endpoint of node nodeID in
// cluster clusterID and returns the decoded response. On any error the zero
// HeartbeatResponse is returned.
func (c *Client) Heartbeat(ctx context.Context, clusterID, nodeID string, request HeartbeatRequest) (HeartbeatResponse, error) {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/heartbeats"
	var reply HeartbeatResponse
	err := c.postJSON(ctx, endpoint, request, &reply)
	if err != nil {
		return HeartbeatResponse{}, err
	}
	return reply, nil
}
// NodeUpdatePlan fetches the update plan for node nodeID in cluster clusterID.
// The request fields are sent as URL query parameters; "channel" is included
// only when request.Channel is non-empty. The Plan field of the decoded
// response is returned.
func (c *Client) NodeUpdatePlan(ctx context.Context, clusterID, nodeID string, request NodeUpdatePlanRequest) (NodeUpdatePlan, error) {
	query := url.Values{
		"product":         {request.Product},
		"current_version": {request.CurrentVersion},
		"os":              {request.OS},
		"arch":            {request.Arch},
		"install_type":    {request.InstallType},
	}
	if request.Channel != "" {
		query.Set("channel", request.Channel)
	}
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/updates/plan?" + query.Encode()

	var envelope NodeUpdatePlanResponse
	err := c.getJSON(ctx, endpoint, &envelope)
	if err != nil {
		return NodeUpdatePlan{}, err
	}
	return envelope.Plan, nil
}
// ReportNodeUpdateStatus posts request as JSON to the update status endpoint of
// node nodeID in cluster clusterID. The response body is not decoded.
func (c *Client) ReportNodeUpdateStatus(ctx context.Context, clusterID, nodeID string, request NodeUpdateStatusRequest) error {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/updates/status"
	return c.postJSON(ctx, endpoint, request, nil)
}
// DesiredWorkloads fetches the desired workloads of node nodeID in cluster
// clusterID and returns the "desired_workloads" array of the JSON response.
func (c *Client) DesiredWorkloads(ctx context.Context, clusterID, nodeID string) ([]DesiredWorkload, error) {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/workloads/desired"
	var envelope struct {
		DesiredWorkloads []DesiredWorkload `json:"desired_workloads"`
	}
	err := c.getJSON(ctx, endpoint, &envelope)
	if err != nil {
		return nil, err
	}
	return envelope.DesiredWorkloads, nil
}
// ReportWorkloadStatus posts request as JSON to the status endpoint of the
// workload serviceType on node nodeID in cluster clusterID. The response body
// is not decoded.
func (c *Client) ReportWorkloadStatus(ctx context.Context, clusterID, nodeID, serviceType string, request WorkloadStatusRequest) error {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/workloads/" + serviceType + "/status"
	return c.postJSON(ctx, endpoint, request, nil)
}
// NodeVPNAssignments fetches the VPN assignments of node nodeID in cluster
// clusterID and returns the "vpn_assignments" array of the JSON response.
func (c *Client) NodeVPNAssignments(ctx context.Context, clusterID, nodeID string) ([]NodeVPNAssignment, error) {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/vpn/assignments"
	var envelope struct {
		Assignments []NodeVPNAssignment `json:"vpn_assignments"`
	}
	err := c.getJSON(ctx, endpoint, &envelope)
	if err != nil {
		return nil, err
	}
	return envelope.Assignments, nil
}
// ReportNodeVPNAssignmentStatus posts request as JSON to the status endpoint of
// VPN assignment vpnConnectionID on node nodeID in cluster clusterID. The
// response body is not decoded.
func (c *Client) ReportNodeVPNAssignmentStatus(ctx context.Context, clusterID, nodeID, vpnConnectionID string, request NodeVPNAssignmentStatusRequest) error {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/vpn/assignments/" + vpnConnectionID + "/status"
	return c.postJSON(ctx, endpoint, request, nil)
}
// AcquireNodeVPNAssignmentLease posts request as JSON to the lease acquire
// endpoint of VPN assignment vpnConnectionID and returns a pointer to the
// "lease" object of the JSON response. It returns nil and the error on failure.
func (c *Client) AcquireNodeVPNAssignmentLease(ctx context.Context, clusterID, nodeID, vpnConnectionID string, request NodeVPNAssignmentLeaseAcquireRequest) (*NodeVPNAssignmentLease, error) {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/vpn/assignments/" + vpnConnectionID + "/lease/acquire"
	var envelope struct {
		Lease NodeVPNAssignmentLease `json:"lease"`
	}
	err := c.postJSON(ctx, endpoint, request, &envelope)
	if err != nil {
		return nil, err
	}
	return &envelope.Lease, nil
}
// RenewNodeVPNAssignmentLease posts request as JSON to the renew endpoint of
// lease leaseID on VPN assignment vpnConnectionID. The response body is not
// decoded.
func (c *Client) RenewNodeVPNAssignmentLease(ctx context.Context, clusterID, nodeID, vpnConnectionID, leaseID string, request NodeVPNAssignmentLeaseRenewRequest) error {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/vpn/assignments/" + vpnConnectionID + "/lease/" + leaseID + "/renew"
	return c.postJSON(ctx, endpoint, request, nil)
}
// SendVPNGatewayPacket posts a single raw packet to the gateway packets
// endpoint of VPN connection vpnConnectionID. An empty packet is a no-op that
// returns nil without contacting the backend.
func (c *Client) SendVPNGatewayPacket(ctx context.Context, clusterID, vpnConnectionID string, packet []byte) error {
	if len(packet) > 0 {
		endpoint := "/clusters/" + clusterID + "/vpn-connections/" + vpnConnectionID + "/tunnel/gateway/packets"
		return c.postBytes(ctx, endpoint, packet)
	}
	return nil
}
// SendVPNGatewayPacketBatch drops empty packets, encodes the remainder with
// encodeVPNPacketBatch and posts the result to the gateway packets endpoint
// with batch=true. If no non-empty packets remain it returns nil without
// contacting the backend.
func (c *Client) SendVPNGatewayPacketBatch(ctx context.Context, clusterID, vpnConnectionID string, packets [][]byte) error {
	nonEmpty := cleanVPNPacketBatch(packets)
	if len(nonEmpty) == 0 {
		return nil
	}
	endpoint := "/clusters/" + clusterID + "/vpn-connections/" + vpnConnectionID + "/tunnel/gateway/packets?batch=true"
	body := encodeVPNPacketBatch(nonEmpty)
	return c.postBytes(ctx, endpoint, body)
}
// ReceiveVPNGatewayPacket issues a GET on the gateway packets endpoint of VPN
// connection vpnConnectionID, passing timeout to the backend as the
// timeout_ms query parameter. The results are those of getBytes: the payload,
// whether a non-empty payload was received, and any error.
func (c *Client) ReceiveVPNGatewayPacket(ctx context.Context, clusterID, vpnConnectionID string, timeout time.Duration) ([]byte, bool, error) {
	waitMs := timeout.Milliseconds()
	endpoint := fmt.Sprintf("/clusters/%s/vpn-connections/%s/tunnel/gateway/packets?timeout_ms=%d", clusterID, vpnConnectionID, waitMs)
	return c.getBytes(ctx, endpoint)
}
// ReceiveVPNGatewayPacketBatch issues a GET on the gateway packets endpoint
// with batch=true and timeout passed as timeout_ms, then decodes the payload
// with decodeVPNPacketBatch. When the backend delivers no payload it returns
// (nil, nil).
func (c *Client) ReceiveVPNGatewayPacketBatch(ctx context.Context, clusterID, vpnConnectionID string, timeout time.Duration) ([][]byte, error) {
	waitMs := timeout.Milliseconds()
	endpoint := fmt.Sprintf("/clusters/%s/vpn-connections/%s/tunnel/gateway/packets?batch=true&timeout_ms=%d", clusterID, vpnConnectionID, waitMs)
	raw, received, err := c.getBytes(ctx, endpoint)
	if err != nil {
		return nil, err
	}
	if !received {
		return nil, nil
	}
	return decodeVPNPacketBatch(raw)
}
// ReportMeshLink posts request as JSON to the mesh links endpoint of cluster
// clusterID. The response body is not decoded.
func (c *Client) ReportMeshLink(ctx context.Context, clusterID string, request MeshLinkObservationRequest) error {
	endpoint := "/clusters/" + clusterID + "/mesh/links"
	return c.postJSON(ctx, endpoint, request, nil)
}
// ReportTelemetry posts request as JSON to the telemetry endpoint of node
// nodeID in cluster clusterID. The response body is not decoded.
func (c *Client) ReportTelemetry(ctx context.Context, clusterID, nodeID string, request TelemetryRequest) error {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/telemetry"
	return c.postJSON(ctx, endpoint, request, nil)
}
// SyntheticMeshConfig fetches the synthetic mesh configuration of node nodeID
// in cluster clusterID and returns the "synthetic_mesh_config" object of the
// JSON response. On any error the zero SyntheticMeshConfig is returned.
func (c *Client) SyntheticMeshConfig(ctx context.Context, clusterID, nodeID string) (SyntheticMeshConfig, error) {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/mesh/synthetic-config"
	var envelope struct {
		Config SyntheticMeshConfig `json:"synthetic_mesh_config"`
	}
	err := c.getJSON(ctx, endpoint, &envelope)
	if err != nil {
		return SyntheticMeshConfig{}, err
	}
	return envelope.Config, nil
}
// AdminRuntimeProjection posts request as JSON to the admin-runtime projection
// endpoint of node nodeID in cluster clusterID and returns the decoded
// response. On any error the zero AdminRuntimeProjectionResponse is returned.
func (c *Client) AdminRuntimeProjection(ctx context.Context, clusterID, nodeID string, request AdminRuntimeProjectionRequest) (AdminRuntimeProjectionResponse, error) {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/admin-runtime/projection"
	var projection AdminRuntimeProjectionResponse
	err := c.postJSON(ctx, endpoint, request, &projection)
	if err != nil {
		return AdminRuntimeProjectionResponse{}, err
	}
	return projection, nil
}
// RawControl forwards an arbitrary control request to the backend.
//
// The method is trimmed and upper-cased and defaults to GET when empty. The
// path is trimmed and must start with "/"; it is appended to the client's base
// URL. A request body is sent, with a JSON content type, only when
// request.Body is non-empty and is not the literal "null".
//
// The backend's status code is returned as-is (non-2xx is not an error) along
// with at most 2 MiB of the response body.
func (c *Client) RawControl(ctx context.Context, request RawControlRequest) (RawControlResponse, error) {
	verb := strings.ToUpper(strings.TrimSpace(request.Method))
	if verb == "" {
		verb = http.MethodGet
	}

	target := strings.TrimSpace(request.Path)
	if !strings.HasPrefix(target, "/") {
		return RawControlResponse{}, fmt.Errorf("control path must be relative")
	}

	// Keep the reader a nil interface when there is no body so the request is
	// built without one.
	hasBody := len(request.Body) > 0 && string(request.Body) != "null"
	var reqBody io.Reader
	if hasBody {
		reqBody = bytes.NewReader(request.Body)
	}

	req, err := http.NewRequestWithContext(ctx, verb, c.baseURL+target, reqBody)
	if err != nil {
		return RawControlResponse{}, err
	}
	if hasBody {
		req.Header.Set("Content-Type", "application/json")
	}

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return RawControlResponse{}, err
	}
	defer resp.Body.Close()

	const maxResponseBytes = 2 * 1024 * 1024
	raw, err := io.ReadAll(io.LimitReader(resp.Body, maxResponseBytes))
	if err != nil {
		return RawControlResponse{}, err
	}
	return RawControlResponse{StatusCode: resp.StatusCode, Body: json.RawMessage(raw)}, nil
}
// getJSON issues a GET for baseURL+path. Any status outside 2xx is returned as
// an error. When response is non-nil the response body is JSON-decoded into
// it; otherwise the body is ignored.
func (c *Client) getJSON(ctx context.Context, path string, response any) error {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+path, nil)
	if err != nil {
		return err
	}
	resp, err := c.httpClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code < http.StatusOK || code >= http.StatusMultipleChoices {
		return fmt.Errorf("backend returned status %d", code)
	}
	if response != nil {
		return json.NewDecoder(resp.Body).Decode(response)
	}
	return nil
}
// getBytes issues a GET for baseURL+path and returns the raw response body.
//
// The boolean result reports whether a non-empty payload was received: it is
// false (with a nil error) for a 204 No Content response or an empty body.
// Any other status outside 2xx is returned as an error. At most
// vpnPacketBatchMaxBytes bytes of the body are read.
func (c *Client) getBytes(ctx context.Context, path string) ([]byte, bool, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+path, nil)
	if err != nil {
		return nil, false, err
	}
	resp, err := c.httpClient.Do(req)
	if err != nil {
		return nil, false, err
	}
	defer resp.Body.Close()

	switch code := resp.StatusCode; {
	case code == http.StatusNoContent:
		return nil, false, nil
	case code < http.StatusOK || code >= http.StatusMultipleChoices:
		return nil, false, fmt.Errorf("backend returned status %d", code)
	}

	data, err := io.ReadAll(io.LimitReader(resp.Body, vpnPacketBatchMaxBytes))
	if err != nil {
		return nil, false, err
	}
	if len(data) == 0 {
		return nil, false, nil
	}
	return data, true, nil
}
// postBytes POSTs payload to baseURL+path with an application/octet-stream
// content type. Any status outside 2xx is returned as an error; the response
// body is not read.
func (c *Client) postBytes(ctx context.Context, path string, payload []byte) error {
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+path, bytes.NewReader(payload))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/octet-stream")

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code < http.StatusOK || code >= http.StatusMultipleChoices {
		return fmt.Errorf("backend returned status %d", code)
	}
	return nil
}
// postJSON marshals request to JSON and POSTs it to baseURL+path with an
// application/json content type. Any status outside 2xx is returned as an
// error. When response is non-nil the response body is JSON-decoded into it;
// otherwise the body is ignored.
func (c *Client) postJSON(ctx context.Context, path string, request any, response any) error {
	encoded, err := json.Marshal(request)
	if err != nil {
		return err
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+path, bytes.NewReader(encoded))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code < http.StatusOK || code >= http.StatusMultipleChoices {
		return fmt.Errorf("backend returned status %d", code)
	}
	if response != nil {
		return json.NewDecoder(resp.Body).Decode(response)
	}
	return nil
}
const (
	// vpnPacketMaxBytes is the largest item size accepted when decoding a
	// packet batch.
	vpnPacketMaxBytes = 65535
	// vpnPacketBatchMaxBytes is the size limit for one batch payload.
	vpnPacketBatchMaxBytes = 4 * 1024 * 1024
)

// encodeVPNPacketBatch serializes packets into a single buffer in which each
// non-empty packet is written as a 4-byte big-endian length followed by the
// packet bytes. Empty packets are skipped. The result is always non-nil.
//
// Packets are written straight into the output buffer; no intermediate
// per-packet copies are made.
func encodeVPNPacketBatch(packets [][]byte) []byte {
	total := 0
	for _, packet := range packets {
		if len(packet) > 0 {
			total += 4 + len(packet)
		}
	}
	out := make([]byte, total)
	offset := 0
	for _, packet := range packets {
		if len(packet) == 0 {
			continue
		}
		binary.BigEndian.PutUint32(out[offset:offset+4], uint32(len(packet)))
		offset += 4
		offset += copy(out[offset:], packet)
	}
	return out
}

// decodeVPNPacketBatch parses a buffer produced by encodeVPNPacketBatch and
// returns the packets in order. Each returned packet is a fresh copy that does
// not alias payload. An empty payload yields (nil, nil).
//
// It returns an error when a length header or an item is truncated, or when an
// item size is zero or larger than vpnPacketMaxBytes.
func decodeVPNPacketBatch(payload []byte) ([][]byte, error) {
	var packets [][]byte
	for offset := 0; offset < len(payload); {
		if len(payload)-offset < 4 {
			return nil, fmt.Errorf("truncated vpn packet batch header")
		}
		size := int(binary.BigEndian.Uint32(payload[offset : offset+4]))
		offset += 4
		if size <= 0 || size > vpnPacketMaxBytes {
			return nil, fmt.Errorf("invalid vpn packet batch item size")
		}
		if size > len(payload)-offset {
			return nil, fmt.Errorf("truncated vpn packet batch item")
		}
		// Every item is already non-empty and copied here, so no further
		// cleaning pass is needed.
		packets = append(packets, append([]byte(nil), payload[offset:offset+size]...))
		offset += size
	}
	return packets, nil
}

// cleanVPNPacketBatch returns copies of the non-empty packets in order. It
// returns nil for an empty input and a non-nil (possibly empty) slice
// otherwise.
func cleanVPNPacketBatch(packets [][]byte) [][]byte {
	if len(packets) == 0 {
		return nil
	}
	cleaned := make([][]byte, 0, len(packets))
	for _, packet := range packets {
		if len(packet) == 0 {
			continue
		}
		cleaned = append(cleaned, append([]byte(nil), packet...))
	}
	return cleaned
}
// New returns a pointer to a zero-value Client. The string argument is
// accepted but ignored.
func New(_ string) *Client {
	client := new(Client)
	return client
}
+42 -68
View File
@@ -14,7 +14,6 @@ import (
const MaxMeshProductionObservationSinkCapacity = 10000
type Config struct {
BackendURL string
ClusterID string
ClusterAuthorityPublicKey string
ClusterAuthorityFingerprint string
@@ -30,7 +29,7 @@ type Config struct {
HeartbeatInterval time.Duration
EnrollmentPollInterval time.Duration
EnrollmentPollTimeout time.Duration
MeshSyntheticRuntimeEnabled bool
FabricRuntimeEnabled bool
MeshProductionForwardingEnabled bool
VPNFabricSessionTransportEnabled bool
MeshQUICFabricEnabled bool
@@ -39,17 +38,18 @@ type Config struct {
VPNFabricQUICMaxStreamsPerConn int
VPNFabricQUICIdleTTL time.Duration
MeshProductionObservationSinkCapacity int
MeshListenAddr string
MeshListenPortMode string
MeshListenAutoPortStart int
MeshListenAutoPortEnd int
FabricListenAddr string
FabricListenPortMode string
FabricListenAutoPortStart int
FabricListenAutoPortEnd int
MeshAdvertiseEndpoint string
MeshAdvertiseEndpointsJSON string
FabricRegistryRecordsJSON string
MeshAdvertiseTransport string
MeshConnectivityMode string
MeshNATType string
MeshLocalSegmentID string
MeshSiteID string
MeshLocalityGroupID string
MeshNATGroupID string
MeshSTUNReflexiveEndpoint string
MeshSTUNServer string
@@ -72,7 +72,6 @@ func Load(args []string, env map[string]string) (Config, error) {
defaultStateDir := filepath.Join(".", ".rap-node-agent")
fs := flag.NewFlagSet("rap-node-agent", flag.ContinueOnError)
cfg := Config{}
fs.StringVar(&cfg.BackendURL, "backend-url", getEnv(env, "RAP_BACKEND_URL", "http://127.0.0.1:8080/api/v1"), "Backend API base URL.")
fs.StringVar(&cfg.ClusterID, "cluster-id", getEnv(env, "RAP_CLUSTER_ID", ""), "Cluster ID.")
fs.StringVar(&cfg.ClusterAuthorityPublicKey, "cluster-authority-public-key", getEnv(env, "RAP_CLUSTER_AUTHORITY_PUBLIC_KEY", ""), "Pinned cluster authority Ed25519 public key.")
fs.StringVar(&cfg.ClusterAuthorityFingerprint, "cluster-authority-fingerprint", getEnv(env, "RAP_CLUSTER_AUTHORITY_FINGERPRINT", ""), "Pinned cluster authority key fingerprint.")
@@ -85,26 +84,27 @@ func Load(args []string, env map[string]string) (Config, error) {
fs.StringVar(&cfg.WebIngressSigningKeyID, "web-ingress-signing-key-id", getEnv(env, "RAP_WEB_INGRESS_SIGNING_KEY_ID", ""), "Optional key id for web ingress envelope signatures.")
fs.StringVar(&cfg.WebIngressTrustedKeysJSON, "web-ingress-trusted-keys-json", getEnv(env, "RAP_WEB_INGRESS_TRUSTED_KEYS_JSON", ""), "JSON map or array of trusted Ed25519 public keys for web ingress runtime receiver.")
fs.StringVar(&cfg.WebIngressRuntimeServiceClasses, "web-ingress-runtime-service-classes", getEnv(env, "RAP_WEB_INGRESS_RUNTIME_SERVICE_CLASSES", ""), "Optional comma-separated allow-list of web ingress runtime service classes accepted by this node.")
fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getEnvBool(env, "RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable C17A synthetic fabric probe runtime. Disabled by default.")
fs.BoolVar(&cfg.FabricRuntimeEnabled, "fabric-runtime-enabled", getEnvBool(env, "RAP_FABRIC_RUNTIME_ENABLED", false), "Enable C17A synthetic fabric probe runtime. Disabled by default.")
fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getEnvBool(env, "RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production fabric-control direct next-hop forwarding gate. Disabled by default.")
fs.BoolVar(&cfg.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getEnvBool(env, "RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric session when explicitly enabled. Disabled by default.")
fs.BoolVar(&cfg.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getEnvBool(env, "RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener. Disabled by default.")
fs.StringVar(&cfg.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getEnv(env, "RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "Listen address for QUIC/UDP fabric endpoint, for example :19443.")
fs.IntVar(&cfg.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getEnvInt(env, "RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 4), "VPN fabric-session stream shards per traffic class.")
fs.IntVar(&cfg.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getEnvInt(env, "RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 8), "VPN fabric-session stream shards per traffic class.")
fs.IntVar(&cfg.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getEnvInt(env, "RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.")
fs.DurationVar(&cfg.VPNFabricQUICIdleTTL, "vpn-fabric-quic-idle-ttl", time.Duration(getEnvInt(env, "RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300))*time.Second, "Idle TTL for cached VPN QUIC carrier connections.")
fs.IntVar(&cfg.MeshProductionObservationSinkCapacity, "mesh-production-observation-sink-capacity", getEnvSignedInt(env, "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Bounded local metadata-only production envelope observation sink capacity. Disabled when 0.")
fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getEnv(env, "RAP_MESH_LISTEN_ADDR", ""), "Listen address for disabled-by-default historical synthetic mesh HTTP endpoint.")
fs.StringVar(&cfg.MeshListenPortMode, "mesh-listen-port-mode", getEnv(env, "RAP_MESH_LISTEN_PORT_MODE", "manual"), "Mesh listen port behavior: manual, auto, or disabled.")
fs.IntVar(&cfg.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
fs.StringVar(&cfg.FabricListenAddr, "fabric-listen-addr", getEnv(env, "RAP_FABRIC_LISTEN_ADDR", ""), "Optional node listener address used by the QUIC fabric runtime contract.")
fs.StringVar(&cfg.FabricListenPortMode, "fabric-listen-port-mode", getEnv(env, "RAP_FABRIC_LISTEN_PORT_MODE", "manual"), "Fabric listen port behavior: manual, auto, or disabled.")
fs.IntVar(&cfg.FabricListenAutoPortStart, "fabric-listen-auto-port-start", getEnvInt(env, "RAP_FABRIC_LISTEN_AUTO_PORT_START", 19131), "First port used when fabric listen port mode is auto.")
fs.IntVar(&cfg.FabricListenAutoPortEnd, "fabric-listen-auto-port-end", getEnvInt(env, "RAP_FABRIC_LISTEN_AUTO_PORT_END", 19231), "Last port used when fabric listen port mode is auto.")
fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint reported to the Control Plane. Empty disables endpoint reporting.")
fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "JSON array of advertised mesh endpoint candidates, including private/corporate endpoints.")
fs.StringVar(&cfg.FabricRegistryRecordsJSON, "fabric-registry-records-json", getEnv(env, "RAP_FABRIC_REGISTRY_RECORDS_JSON", ""), "JSON array of signed QUIC-only fabric registry gossip records used as bootstrap discovery seeds.")
fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getEnv(env, "RAP_MESH_ADVERTISE_TRANSPORT", "quic"), "Transport label for the advertised mesh endpoint.")
fs.StringVar(&cfg.MeshConnectivityMode, "mesh-connectivity-mode", getEnv(env, "RAP_MESH_CONNECTIVITY_MODE", "direct"), "Connectivity mode reported with the advertised mesh endpoint.")
fs.StringVar(&cfg.MeshNATType, "mesh-nat-type", getEnv(env, "RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint reported with the advertised mesh endpoint.")
fs.StringVar(&cfg.MeshLocalSegmentID, "mesh-local-segment-id", getEnv(env, "RAP_MESH_LOCAL_SEGMENT_ID", ""), "Optional local LAN/site segment ID advertised with QUIC endpoint candidates.")
fs.StringVar(&cfg.MeshSiteID, "mesh-site-id", getEnv(env, "RAP_MESH_SITE_ID", ""), "Optional physical or logical site identifier advertised with QUIC endpoint candidates.")
fs.StringVar(&cfg.MeshLocalityGroupID, "mesh-locality-group-id", getEnv(env, "RAP_MESH_LOCALITY_GROUP_ID", ""), "Optional locality group identifier used to decide whether private QUIC endpoints are actually local.")
fs.StringVar(&cfg.MeshNATGroupID, "mesh-nat-group-id", getEnv(env, "RAP_MESH_NAT_GROUP_ID", ""), "Optional NAT group ID advertised with QUIC endpoint candidates.")
fs.StringVar(&cfg.MeshSTUNReflexiveEndpoint, "mesh-stun-reflexive-endpoint", getEnv(env, "RAP_MESH_STUN_REFLEXIVE_ENDPOINT", ""), "Optional STUN-discovered reflexive QUIC endpoint, for example quic://203.0.113.10:19443.")
fs.StringVar(&cfg.MeshSTUNServer, "mesh-stun-server", getEnv(env, "RAP_MESH_STUN_SERVER", ""), "Optional STUN server name used to discover the reflexive endpoint.")
@@ -127,21 +127,20 @@ func Load(args []string, env map[string]string) (Config, error) {
if err := fs.Parse(args); err != nil {
return Config{}, err
}
cfg.BackendURL = strings.TrimRight(strings.TrimSpace(cfg.BackendURL), "/")
cfg.ClusterID = strings.TrimSpace(cfg.ClusterID)
cfg.ClusterAuthorityPublicKey = strings.TrimSpace(cfg.ClusterAuthorityPublicKey)
cfg.ClusterAuthorityFingerprint = strings.TrimSpace(cfg.ClusterAuthorityFingerprint)
cfg.JoinToken = strings.TrimSpace(cfg.JoinToken)
cfg.NodeName = strings.TrimSpace(cfg.NodeName)
cfg.StateDir = strings.TrimSpace(cfg.StateDir)
cfg.MeshListenAddr = strings.TrimSpace(cfg.MeshListenAddr)
cfg.FabricListenAddr = strings.TrimSpace(cfg.FabricListenAddr)
cfg.MeshQUICFabricListenAddr = strings.TrimSpace(cfg.MeshQUICFabricListenAddr)
cfg.MeshListenPortMode = strings.ToLower(strings.TrimSpace(cfg.MeshListenPortMode))
cfg.FabricListenPortMode = strings.ToLower(strings.TrimSpace(cfg.FabricListenPortMode))
if cfg.VPNFabricSessionStreamShards <= 0 {
cfg.VPNFabricSessionStreamShards = 4
cfg.VPNFabricSessionStreamShards = 8
}
if cfg.VPNFabricSessionStreamShards > 64 {
cfg.VPNFabricSessionStreamShards = 64
if cfg.VPNFabricSessionStreamShards > 128 {
cfg.VPNFabricSessionStreamShards = 128
}
if cfg.VPNFabricQUICMaxStreamsPerConn <= 0 {
cfg.VPNFabricQUICMaxStreamsPerConn = 64
@@ -156,16 +155,15 @@ func Load(args []string, env map[string]string) (Config, error) {
if cfg.MeshAdvertiseTransport == "" {
cfg.MeshAdvertiseTransport = "quic"
}
cfg.MeshAdvertiseTransport = normalizeLegacyAdvertiseTransport(cfg.MeshAdvertiseTransport)
cfg.MeshAdvertiseEndpoint = normalizeLegacyEndpointSchemeToQUIC(cfg.MeshAdvertiseEndpoint)
cfg.MeshConnectivityMode = strings.TrimSpace(cfg.MeshConnectivityMode)
cfg.MeshNATType = strings.TrimSpace(cfg.MeshNATType)
cfg.MeshLocalSegmentID = strings.TrimSpace(cfg.MeshLocalSegmentID)
cfg.MeshSiteID = strings.TrimSpace(cfg.MeshSiteID)
cfg.MeshLocalityGroupID = strings.TrimSpace(cfg.MeshLocalityGroupID)
cfg.MeshNATGroupID = strings.TrimSpace(cfg.MeshNATGroupID)
cfg.MeshSTUNReflexiveEndpoint = normalizeLegacyEndpointSchemeToQUIC(strings.TrimRight(strings.TrimSpace(cfg.MeshSTUNReflexiveEndpoint), "/"))
cfg.MeshSTUNReflexiveEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshSTUNReflexiveEndpoint), "/")
cfg.MeshSTUNServer = strings.TrimSpace(cfg.MeshSTUNServer)
cfg.MeshRelayNodeID = strings.TrimSpace(cfg.MeshRelayNodeID)
cfg.MeshRelayEndpoint = normalizeLegacyEndpointSchemeToQUIC(strings.TrimRight(strings.TrimSpace(cfg.MeshRelayEndpoint), "/"))
cfg.MeshRelayEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshRelayEndpoint), "/")
cfg.MeshRegion = strings.TrimSpace(cfg.MeshRegion)
cfg.MeshSyntheticConfigPath = strings.TrimSpace(cfg.MeshSyntheticConfigPath)
cfg.MeshPeerEndpointsJSON = strings.TrimSpace(cfg.MeshPeerEndpointsJSON)
@@ -177,8 +175,8 @@ func Load(args []string, env map[string]string) (Config, error) {
cfg.RemoteWorkspaceRealAdapterCommand = strings.TrimSpace(cfg.RemoteWorkspaceRealAdapterCommand)
cfg.RemoteWorkspaceRealAdapterArgsJSON = strings.TrimSpace(cfg.RemoteWorkspaceRealAdapterArgsJSON)
cfg.RemoteWorkspaceRealAdapterWorkDir = strings.TrimSpace(cfg.RemoteWorkspaceRealAdapterWorkDir)
if cfg.BackendURL == "" {
return Config{}, errors.New("backend URL is required")
if cfg.FabricRegistryRecordsJSON == "" {
return Config{}, errors.New("fabric registry records are required")
}
if cfg.NodeName == "" {
return Config{}, errors.New("node name is required")
@@ -204,30 +202,30 @@ func Load(args []string, env map[string]string) (Config, error) {
if cfg.FabricRegistryRecordsJSON != "" && !isJSONArray(cfg.FabricRegistryRecordsJSON) {
return Config{}, errors.New("fabric registry records must be a JSON array")
}
switch cfg.MeshListenPortMode {
switch cfg.FabricListenPortMode {
case "", "manual", "auto", "disabled":
if cfg.MeshListenPortMode == "" {
cfg.MeshListenPortMode = "manual"
if cfg.FabricListenPortMode == "" {
cfg.FabricListenPortMode = "manual"
}
default:
return Config{}, errors.New("mesh listen port mode must be manual, auto, or disabled")
return Config{}, errors.New("fabric listen port mode must be manual, auto, or disabled")
}
if cfg.MeshListenAutoPortStart <= 0 || cfg.MeshListenAutoPortEnd <= 0 {
return Config{}, errors.New("mesh listen auto port range must be positive")
if cfg.FabricListenAutoPortStart <= 0 || cfg.FabricListenAutoPortEnd <= 0 {
return Config{}, errors.New("fabric listen auto port range must be positive")
}
if cfg.MeshListenAutoPortStart > cfg.MeshListenAutoPortEnd {
return Config{}, errors.New("mesh listen auto port start must be less than or equal to end")
if cfg.FabricListenAutoPortStart > cfg.FabricListenAutoPortEnd {
return Config{}, errors.New("fabric listen auto port start must be less than or equal to end")
}
if !isQUICAdvertiseTransport(cfg.MeshAdvertiseTransport) {
return Config{}, errors.New("mesh advertise transport must be a QUIC transport label")
}
if hasLegacyEndpointScheme(cfg.MeshAdvertiseEndpoint) {
if hasUnsupportedEndpointScheme(cfg.MeshAdvertiseEndpoint) {
return Config{}, errors.New("mesh advertise endpoint must be a QUIC endpoint")
}
if cfg.MeshSTUNReflexiveEndpoint != "" && hasLegacyEndpointScheme(cfg.MeshSTUNReflexiveEndpoint) {
if cfg.MeshSTUNReflexiveEndpoint != "" && hasUnsupportedEndpointScheme(cfg.MeshSTUNReflexiveEndpoint) {
return Config{}, errors.New("mesh STUN reflexive endpoint must be a QUIC endpoint")
}
if cfg.MeshRelayEndpoint != "" && hasLegacyEndpointScheme(cfg.MeshRelayEndpoint) {
if cfg.MeshRelayEndpoint != "" && hasUnsupportedEndpointScheme(cfg.MeshRelayEndpoint) {
return Config{}, errors.New("mesh relay endpoint must be a QUIC endpoint")
}
return cfg, nil
@@ -242,36 +240,12 @@ func isQUICAdvertiseTransport(label string) bool {
}
}
// normalizeLegacyAdvertiseTransport maps legacy transport labels onto their
// QUIC equivalents. Matching ignores case and surrounding whitespace:
//
//   - HTTP/HTTPS/WebSocket/TCP-TLS style labels become "direct_quic"
//   - reverse/outbound-reverse labels become "reverse_quic"
//   - relay labels become "relay_quic"
//
// Any other label is returned with surrounding whitespace trimmed and its
// case preserved.
func normalizeLegacyAdvertiseTransport(label string) string {
	trimmed := strings.TrimSpace(label)
	switch key := strings.ToLower(trimmed); key {
	case "relay", "relay_control":
		return "relay_quic"
	case "outbound_reverse", "reverse", "reverse_outbound":
		return "reverse_quic"
	case "direct_http", "direct_https", "direct_tcp_tls", "http", "https", "ws", "wss", "websocket":
		return "direct_quic"
	}
	return trimmed
}
// normalizeLegacyEndpointSchemeToQUIC trims surrounding whitespace and
// trailing slashes from endpoint and, when the result starts with an http,
// https, ws or wss scheme (case-insensitive), replaces that scheme with
// "quic://". Endpoints with any other scheme, or none, are returned trimmed
// but otherwise unchanged.
func normalizeLegacyEndpointSchemeToQUIC(endpoint string) string {
	trimmed := strings.TrimRight(strings.TrimSpace(endpoint), "/")
	lowered := strings.ToLower(trimmed)
	sep := strings.Index(lowered, "://")
	if sep < 0 {
		return trimmed
	}
	switch lowered[:sep] {
	case "http", "https", "ws", "wss":
		// The matched scheme is pure ASCII, so the offset is valid in the
		// original-case string as well.
		return "quic://" + trimmed[sep+len("://"):]
	}
	return trimmed
}
func hasLegacyEndpointScheme(endpoint string) bool {
func hasUnsupportedEndpointScheme(endpoint string) bool {
endpoint = strings.ToLower(strings.TrimSpace(endpoint))
return strings.HasPrefix(endpoint, "http://") ||
strings.HasPrefix(endpoint, "https://") ||
strings.HasPrefix(endpoint, "ws://") ||
strings.HasPrefix(endpoint, "wss://")
if endpoint == "" || !strings.Contains(endpoint, "://") {
return false
}
return !strings.HasPrefix(endpoint, "quic://")
}
func isJSONArray(value string) bool {
@@ -1,13 +1,13 @@
package config
import (
"strings"
"testing"
"time"
)
func TestLoadConfigFromEnvAndArgs(t *testing.T) {
cfg, err := Load([]string{"-node-name", "node-b"}, map[string]string{
"RAP_BACKEND_URL": "http://backend/api/v1/",
"RAP_CLUSTER_ID": "cluster-1",
"RAP_CLUSTER_AUTHORITY_PUBLIC_KEY": "public-key-b64",
"RAP_CLUSTER_AUTHORITY_FINGERPRINT": "rap-ca-ed25519-test",
@@ -23,7 +23,7 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
"RAP_HEARTBEAT_INTERVAL_SECONDS": "7",
"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS": "3",
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS": "30",
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED": "true",
"RAP_FABRIC_RUNTIME_ENABLED": "true",
"RAP_MESH_PRODUCTION_FORWARDING_ENABLED": "true",
"RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED": "true",
"RAP_MESH_QUIC_FABRIC_ENABLED": "true",
@@ -32,17 +32,18 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
"RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN": "24",
"RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS": "120",
"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "5",
"RAP_MESH_LISTEN_ADDR": "127.0.0.1:19001",
"RAP_MESH_LISTEN_PORT_MODE": "auto",
"RAP_MESH_LISTEN_AUTO_PORT_START": "19010",
"RAP_MESH_LISTEN_AUTO_PORT_END": "19020",
"RAP_FABRIC_LISTEN_ADDR": "127.0.0.1:19001",
"RAP_FABRIC_LISTEN_PORT_MODE": "auto",
"RAP_FABRIC_LISTEN_AUTO_PORT_START": "19010",
"RAP_FABRIC_LISTEN_AUTO_PORT_END": "19020",
"RAP_MESH_ADVERTISE_ENDPOINT": "quic://node-a.example.test:19443/",
"RAP_MESH_ADVERTISE_ENDPOINTS_JSON": `[{"endpoint_id":"node-a-lan","address":"10.10.0.20:19001"}]`,
"RAP_FABRIC_REGISTRY_RECORDS_JSON": ` [{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}] `,
"RAP_MESH_ADVERTISE_TRANSPORT": "direct_quic",
"RAP_MESH_CONNECTIVITY_MODE": "outbound_only",
"RAP_MESH_NAT_TYPE": "symmetric",
"RAP_MESH_LOCAL_SEGMENT_ID": "site-a",
"RAP_MESH_SITE_ID": "home",
"RAP_MESH_LOCALITY_GROUP_ID": "home-lan",
"RAP_MESH_NAT_GROUP_ID": "nat-a",
"RAP_MESH_STUN_REFLEXIVE_ENDPOINT": "quic://203.0.113.20:19443/",
"RAP_MESH_STUN_SERVER": "stun.example.test:3478",
@@ -50,7 +51,7 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
"RAP_MESH_RELAY_ENDPOINT": "quic://node-r.example.test:19443/",
"RAP_MESH_REGION": "eu",
"RAP_MESH_SYNTHETIC_CONFIG": "/tmp/rap-node/mesh-synthetic.json",
"RAP_MESH_PEER_ENDPOINTS_JSON": `{"node-b":"http://127.0.0.1:19002"}`,
"RAP_MESH_PEER_ENDPOINTS_JSON": `{"node-b":"quic://127.0.0.1:19002"}`,
"RAP_MESH_SYNTHETIC_ROUTES_JSON": `[{"route_id":"route-1"}]`,
"RAP_REMOTE_WORKSPACE_REAL_ADAPTER_ENABLED": "true",
"RAP_REMOTE_WORKSPACE_REAL_ADAPTER_COMMAND": " /opt/rap/bin/rdp-worker ",
@@ -60,9 +61,6 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
if err != nil {
t.Fatalf("load config: %v", err)
}
if cfg.BackendURL != "http://backend/api/v1" {
t.Fatalf("BackendURL = %q", cfg.BackendURL)
}
if cfg.NodeName != "node-b" {
t.Fatalf("NodeName = %q", cfg.NodeName)
}
@@ -87,8 +85,8 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
cfg.WebIngressRuntimeServiceClasses != "platform_admin, cluster_admin" {
t.Fatalf("unexpected web ingress key config: %+v", cfg)
}
if !cfg.MeshSyntheticRuntimeEnabled {
t.Fatal("MeshSyntheticRuntimeEnabled = false, want true")
if !cfg.FabricRuntimeEnabled {
t.Fatal("FabricRuntimeEnabled = false, want true")
}
if !cfg.MeshProductionForwardingEnabled {
t.Fatal("MeshProductionForwardingEnabled = false, want true")
@@ -111,11 +109,11 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
if cfg.MeshProductionObservationSinkCapacity != 5 {
t.Fatalf("MeshProductionObservationSinkCapacity = %d, want 5", cfg.MeshProductionObservationSinkCapacity)
}
if cfg.MeshListenAddr != "127.0.0.1:19001" {
t.Fatalf("MeshListenAddr = %q", cfg.MeshListenAddr)
if cfg.FabricListenAddr != "127.0.0.1:19001" {
t.Fatalf("FabricListenAddr = %q", cfg.FabricListenAddr)
}
if cfg.MeshListenPortMode != "auto" || cfg.MeshListenAutoPortStart != 19010 || cfg.MeshListenAutoPortEnd != 19020 {
t.Fatalf("unexpected mesh listen port config: %+v", cfg)
if cfg.FabricListenPortMode != "auto" || cfg.FabricListenAutoPortStart != 19010 || cfg.FabricListenAutoPortEnd != 19020 {
t.Fatalf("unexpected fabric listen port config: %+v", cfg)
}
if cfg.MeshAdvertiseEndpoint != "quic://node-a.example.test:19443" ||
cfg.MeshAdvertiseEndpointsJSON == "" ||
@@ -123,7 +121,8 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
cfg.MeshAdvertiseTransport != "direct_quic" ||
cfg.MeshConnectivityMode != "outbound_only" ||
cfg.MeshNATType != "symmetric" ||
cfg.MeshLocalSegmentID != "site-a" ||
cfg.MeshSiteID != "home" ||
cfg.MeshLocalityGroupID != "home-lan" ||
cfg.MeshNATGroupID != "nat-a" ||
cfg.MeshSTUNReflexiveEndpoint != "quic://203.0.113.20:19443" ||
cfg.MeshSTUNServer != "stun.example.test:3478" ||
@@ -146,10 +145,24 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
}
}
// TestLoadConfigLoadsLocalityGroup checks that the RAP_MESH_LOCALITY_GROUP_ID
// environment value is surfaced as cfg.MeshLocalityGroupID by Load.
func TestLoadConfigLoadsLocalityGroup(t *testing.T) {
	const wantGroup = "home-lan"

	env := map[string]string{
		"RAP_NODE_NAME":                    "node-a",
		"RAP_FABRIC_REGISTRY_RECORDS_JSON": `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
		"RAP_MESH_LOCALITY_GROUP_ID":       wantGroup,
	}

	cfg, err := Load(nil, env)
	if err != nil {
		t.Fatalf("load config: %v", err)
	}
	if got := cfg.MeshLocalityGroupID; got != wantGroup {
		t.Fatalf("unexpected locality group: %+v", cfg)
	}
}
func TestLoadConfigDefaultsEnrollmentPollingToNoTimeout(t *testing.T) {
cfg, err := Load(nil, map[string]string{
"RAP_BACKEND_URL": "http://backend/api/v1",
"RAP_NODE_NAME": "node-a",
"RAP_NODE_NAME": "node-a",
"RAP_FABRIC_REGISTRY_RECORDS_JSON": `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
})
if err != nil {
t.Fatalf("load config: %v", err)
@@ -168,10 +181,31 @@ func TestLoadConfigDefaultsEnrollmentPollingToNoTimeout(t *testing.T) {
}
}
// TestLoadConfigRequiresFabricBootstrap checks the positive path: Load
// succeeds when the fabric registry records are supplied on the command line
// and the environment is empty.
func TestLoadConfigRequiresFabricBootstrap(t *testing.T) {
	args := []string{
		"--node-name", "node-a",
		"--state-dir", t.TempDir(),
		"--fabric-registry-records-json", `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
	}
	noEnv := map[string]string{}

	if _, err := Load(args, noEnv); err != nil {
		t.Fatalf("load config: %v", err)
	}
}
// TestLoadConfigRejectsMissingFabricBootstrap checks that Load fails with an
// error mentioning the missing fabric registry records when neither the
// arguments nor the environment provide them.
func TestLoadConfigRejectsMissingFabricBootstrap(t *testing.T) {
	args := []string{
		"--node-name", "node-a",
		"--state-dir", t.TempDir(),
	}

	_, err := Load(args, map[string]string{})
	if err != nil && strings.Contains(err.Error(), "fabric registry records are required") {
		return
	}
	t.Fatalf("expected fabric validation error, got %v", err)
}
func TestLoadConfigRejectsNegativeProductionObservationSinkCapacity(t *testing.T) {
_, err := Load(nil, map[string]string{
"RAP_BACKEND_URL": "http://backend/api/v1",
"RAP_NODE_NAME": "node-a",
"RAP_NODE_NAME": "node-a",
"RAP_FABRIC_REGISTRY_RECORDS_JSON": `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "-1",
})
if err == nil {
@@ -181,8 +215,8 @@ func TestLoadConfigRejectsNegativeProductionObservationSinkCapacity(t *testing.T
func TestLoadConfigRejectsTooLargeProductionObservationSinkCapacity(t *testing.T) {
_, err := Load(nil, map[string]string{
"RAP_BACKEND_URL": "http://backend/api/v1",
"RAP_NODE_NAME": "node-a",
"RAP_NODE_NAME": "node-a",
"RAP_FABRIC_REGISTRY_RECORDS_JSON": `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "10001",
})
if err == nil {
@@ -190,32 +224,26 @@ func TestLoadConfigRejectsTooLargeProductionObservationSinkCapacity(t *testing.T
}
}
func TestLoadConfigNormalizesLegacyMeshAdvertiseTransport(t *testing.T) {
cfg, err := Load(nil, map[string]string{
"RAP_BACKEND_URL": "http://backend/api/v1",
"RAP_NODE_NAME": "node-a",
"RAP_MESH_ADVERTISE_ENDPOINT": "quic://node-a.example.test:19443",
"RAP_MESH_ADVERTISE_TRANSPORT": "wss",
func TestLoadConfigRejectsDisallowedMeshAdvertiseTransport(t *testing.T) {
_, err := Load(nil, map[string]string{
"RAP_NODE_NAME": "node-a",
"RAP_FABRIC_REGISTRY_RECORDS_JSON": `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
"RAP_MESH_ADVERTISE_ENDPOINT": "quic://node-a.example.test:19443",
"RAP_MESH_ADVERTISE_TRANSPORT": "wss",
})
if err != nil {
t.Fatalf("Load returned error for legacy mesh advertise transport migration: %v", err)
}
if cfg.MeshAdvertiseTransport != "direct_quic" {
t.Fatalf("transport = %q, want direct_quic", cfg.MeshAdvertiseTransport)
if err == nil || !strings.Contains(err.Error(), "QUIC transport label") {
t.Fatalf("expected QUIC transport rejection, got %v", err)
}
}
func TestLoadConfigNormalizesLegacyMeshAdvertiseEndpointScheme(t *testing.T) {
cfg, err := Load(nil, map[string]string{
"RAP_BACKEND_URL": "http://backend/api/v1",
"RAP_NODE_NAME": "node-a",
"RAP_MESH_ADVERTISE_ENDPOINT": "https://node-a.example.test:443",
"RAP_MESH_ADVERTISE_TRANSPORT": "direct_quic",
func TestLoadConfigRejectsDisallowedMeshAdvertiseEndpointScheme(t *testing.T) {
_, err := Load(nil, map[string]string{
"RAP_NODE_NAME": "node-a",
"RAP_FABRIC_REGISTRY_RECORDS_JSON": `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
"RAP_MESH_ADVERTISE_ENDPOINT": "https://node-a.example.test:443",
"RAP_MESH_ADVERTISE_TRANSPORT": "direct_quic",
})
if err != nil {
t.Fatalf("Load returned error for legacy mesh advertise endpoint migration: %v", err)
}
if cfg.MeshAdvertiseEndpoint != "quic://node-a.example.test:443" {
t.Fatalf("endpoint = %q, want quic scheme", cfg.MeshAdvertiseEndpoint)
if err == nil || !strings.Contains(err.Error(), "QUIC endpoint") {
t.Fatalf("expected QUIC endpoint rejection, got %v", err)
}
}
@@ -11,8 +11,13 @@ const (
Magic uint32 = 0x52415046 // RAPF
Version uint8 = 1
HeaderSize = 32
DefaultMaxPayload = 1024 * 1024
HeaderSize = 32
// DefaultMaxPayload is a per-frame guardrail, not a throughput limit.
// Fabric services must scale by many QUIC streams and many frames; keeping
// this above common VPN/RDP/VNC burst batches avoids a hidden 1 MiB choke
// while still bounding memory for a single decoded frame.
DefaultMaxPayload = 8 * 1024 * 1024
)
type FrameType uint8
@@ -102,6 +102,26 @@ func TestRejectsOversizedPayload(t *testing.T) {
}
}
func TestDefaultPayloadAllowsMultiMegabyteServiceBatches(t *testing.T) {
payload := bytes.Repeat([]byte("x"), 2*1024*1024)
frame := Frame{
Type: FrameData,
StreamID: 1,
Payload: payload,
}
encoded, err := MarshalFrame(frame)
if err != nil {
t.Fatalf("marshal multi-megabyte frame: %v", err)
}
decoded, err := UnmarshalFrame(encoded, DefaultMaxPayload)
if err != nil {
t.Fatalf("unmarshal multi-megabyte frame: %v", err)
}
if len(decoded.Payload) != len(payload) {
t.Fatalf("payload length = %d, want %d", len(decoded.Payload), len(payload))
}
}
func TestRejectsUnknownTrafficClass(t *testing.T) {
frame := Frame{
Type: FrameOpenStream,
@@ -8,6 +8,7 @@ import (
const (
DefaultInitialStreamCredit = 32
DefaultMaxStreamCredit = 4096
)
var (
@@ -29,6 +30,7 @@ const (
type SessionConfig struct {
InitialStreamCredit int
MaxStreamCredit int
ClassQueueCapacity map[TrafficClass]int
}
@@ -188,6 +190,7 @@ func (s *Session) Ack(streamID uint64, sequence uint64) error {
delta := sequence - st.metrics.Acked
st.metrics.Acked = sequence
s.metrics.FramesAcked += delta
st.credit = minInt(st.credit+int(delta), s.cfg.MaxStreamCredit)
}
return nil
}
@@ -205,7 +208,7 @@ func (s *Session) AddCredit(streamID uint64, frames int) error {
if st.state != StreamStateOpen {
return ErrStreamClosed
}
st.credit += frames
st.credit = minInt(st.credit+frames, s.cfg.MaxStreamCredit)
return nil
}
@@ -311,6 +314,12 @@ func normalizeSessionConfig(cfg SessionConfig) SessionConfig {
if cfg.InitialStreamCredit <= 0 {
cfg.InitialStreamCredit = DefaultInitialStreamCredit
}
if cfg.MaxStreamCredit <= 0 {
cfg.MaxStreamCredit = maxInt(DefaultMaxStreamCredit, cfg.InitialStreamCredit)
}
if cfg.InitialStreamCredit > cfg.MaxStreamCredit {
cfg.InitialStreamCredit = cfg.MaxStreamCredit
}
if cfg.ClassQueueCapacity == nil {
cfg.ClassQueueCapacity = map[TrafficClass]int{}
}
@@ -331,14 +340,28 @@ func priorityOrder() []TrafficClass {
func defaultClassQueueCapacity(trafficClass TrafficClass) int {
switch trafficClass {
case TrafficClassControl, TrafficClassDNS, TrafficClassInteractive:
return 128
return 1024
case TrafficClassReliable:
return 64
return 512
case TrafficClassBulk:
return 16
return 256
case TrafficClassDroppable:
return 8
return 128
default:
return 32
return 256
}
}
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
@@ -129,20 +129,36 @@ func TestSessionResetDropsOnlySelectedStream(t *testing.T) {
}
func TestSessionAckUpdatesMetrics(t *testing.T) {
session := NewSession(SessionConfig{})
session := NewSession(SessionConfig{InitialStreamCredit: 2})
mustOpenStream(t, session, 1, TrafficClassReliable)
mustEnqueue(t, session, 1, "one")
mustEnqueue(t, session, 1, "two")
if _, err := session.EnqueueData(1, []byte("blocked")); !errors.Is(err, ErrStreamCreditExhausted) {
t.Fatalf("credit error = %v, want %v", err, ErrStreamCreditExhausted)
}
if err := session.Ack(1, 2); err != nil {
t.Fatalf("ack: %v", err)
}
mustEnqueue(t, session, 1, "three")
snapshot := session.Snapshot()
if snapshot.FramesAcked != 2 || snapshot.Streams[1].Acked != 2 {
if snapshot.FramesAcked != 2 || snapshot.Streams[1].Acked != 2 || snapshot.Streams[1].Credit != 1 {
t.Fatalf("ack metrics = %+v stream=%+v", snapshot, snapshot.Streams[1])
}
}
// TestSessionCreditIsCapped opens a stream on a session configured with
// MaxStreamCredit 2, grants it far more credit than that through AddCredit,
// and checks that the snapshot reports the stream's credit as 2.
func TestSessionCreditIsCapped(t *testing.T) {
	const (
		streamID  = 1
		creditCap = 2
	)

	session := NewSession(SessionConfig{
		InitialStreamCredit: 1,
		MaxStreamCredit:     creditCap,
	})
	mustOpenStream(t, session, streamID, TrafficClassReliable)

	if err := session.AddCredit(streamID, 100); err != nil {
		t.Fatalf("add credit: %v", err)
	}

	if got := session.Snapshot().Streams[streamID].Credit; got != creditCap {
		t.Fatalf("credit = %d, want cap 2", got)
	}
}
func TestSessionCloseRejectsNewData(t *testing.T) {
session := NewSession(SessionConfig{})
mustOpenStream(t, session, 1, TrafficClassReliable)
@@ -15,8 +15,8 @@ const (
)
type RuntimeConfig struct {
BackendURL string
ClusterID string
ClusterAuthorityPublicKey string
JoinToken string
NodeName string
Image string
@@ -28,7 +28,7 @@ type RuntimeConfig struct {
Replace bool
DockerVPNGatewayEnabled bool
WorkloadSupervisionEnabled bool
MeshSyntheticRuntimeEnabled bool
FabricRuntimeEnabled bool
MeshProductionForwardingEnabled bool
VPNFabricSessionTransportEnabled bool
MeshQUICFabricEnabled bool
@@ -36,16 +36,19 @@ type RuntimeConfig struct {
VPNFabricSessionStreamShards int
VPNFabricQUICMaxStreamsPerConn int
VPNFabricQUICIdleTTLSeconds int
MeshListenAddr string
MeshListenPortMode string
MeshListenAutoPortStart int
MeshListenAutoPortEnd int
FabricListenAddr string
FabricListenPortMode string
FabricListenAutoPortStart int
FabricListenAutoPortEnd int
MeshAdvertiseEndpoint string
MeshAdvertiseEndpointsJSON string
FabricRegistryRecordsJSON string
MeshAdvertiseTransport string
MeshConnectivityMode string
MeshNATType string
MeshSiteID string
MeshLocalityGroupID string
MeshNATGroupID string
MeshRegion string
HeartbeatIntervalSeconds int
EnrollmentPollIntervalSeconds int
@@ -59,8 +62,8 @@ type RuntimeConfig struct {
}
func (cfg RuntimeConfig) Normalize() RuntimeConfig {
cfg.BackendURL = strings.TrimRight(strings.TrimSpace(cfg.BackendURL), "/")
cfg.ClusterID = strings.TrimSpace(cfg.ClusterID)
cfg.ClusterAuthorityPublicKey = strings.TrimSpace(cfg.ClusterAuthorityPublicKey)
cfg.JoinToken = strings.TrimSpace(cfg.JoinToken)
cfg.NodeName = strings.TrimSpace(cfg.NodeName)
cfg.Image = firstNonEmpty(cfg.Image, DefaultImage)
@@ -68,13 +71,13 @@ func (cfg RuntimeConfig) Normalize() RuntimeConfig {
cfg.StateDir = firstNonEmpty(cfg.StateDir, DefaultStateDir)
cfg.Network = firstNonEmpty(cfg.Network, DefaultNetwork)
cfg.RestartPolicy = firstNonEmpty(cfg.RestartPolicy, "unless-stopped")
cfg.MeshListenAddr = strings.TrimSpace(cfg.MeshListenAddr)
cfg.FabricListenAddr = strings.TrimSpace(cfg.FabricListenAddr)
cfg.MeshQUICFabricListenAddr = strings.TrimSpace(cfg.MeshQUICFabricListenAddr)
if cfg.VPNFabricSessionStreamShards <= 0 {
cfg.VPNFabricSessionStreamShards = 4
cfg.VPNFabricSessionStreamShards = 8
}
if cfg.VPNFabricSessionStreamShards > 64 {
cfg.VPNFabricSessionStreamShards = 64
if cfg.VPNFabricSessionStreamShards > 128 {
cfg.VPNFabricSessionStreamShards = 128
}
if cfg.VPNFabricQUICMaxStreamsPerConn <= 0 {
cfg.VPNFabricQUICMaxStreamsPerConn = 64
@@ -82,13 +85,16 @@ func (cfg RuntimeConfig) Normalize() RuntimeConfig {
if cfg.VPNFabricQUICIdleTTLSeconds <= 0 {
cfg.VPNFabricQUICIdleTTLSeconds = 300
}
cfg.MeshListenPortMode = strings.ToLower(strings.TrimSpace(cfg.MeshListenPortMode))
cfg.FabricListenPortMode = strings.ToLower(strings.TrimSpace(cfg.FabricListenPortMode))
cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/")
cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON)
cfg.FabricRegistryRecordsJSON = strings.TrimSpace(cfg.FabricRegistryRecordsJSON)
cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport)
cfg.MeshConnectivityMode = strings.TrimSpace(cfg.MeshConnectivityMode)
cfg.MeshNATType = strings.TrimSpace(cfg.MeshNATType)
cfg.MeshSiteID = strings.TrimSpace(cfg.MeshSiteID)
cfg.MeshLocalityGroupID = strings.TrimSpace(cfg.MeshLocalityGroupID)
cfg.MeshNATGroupID = strings.TrimSpace(cfg.MeshNATGroupID)
cfg.MeshRegion = strings.TrimSpace(cfg.MeshRegion)
cfg.ImageArtifactSHA256 = strings.TrimSpace(cfg.ImageArtifactSHA256)
if cfg.HeartbeatIntervalSeconds == 0 {
@@ -103,12 +109,15 @@ func (cfg RuntimeConfig) Normalize() RuntimeConfig {
func (cfg RuntimeConfig) ValidateInstall() error {
cfg = cfg.Normalize()
var missing []string
if cfg.BackendURL == "" {
missing = append(missing, "backend-url")
if cfg.FabricRegistryRecordsJSON == "" {
missing = append(missing, "fabric-registry-records-json")
}
if cfg.ClusterID == "" {
missing = append(missing, "cluster-id")
}
if cfg.ClusterAuthorityPublicKey == "" && !cfg.Replace {
missing = append(missing, "cluster-authority-public-key")
}
if cfg.NodeName == "" {
missing = append(missing, "node-name")
}
@@ -127,21 +136,21 @@ func (cfg RuntimeConfig) ValidateInstall() error {
if cfg.EnrollmentPollTimeoutSeconds < 0 {
return errors.New("enrollment poll timeout must not be negative")
}
switch cfg.MeshListenPortMode {
switch cfg.FabricListenPortMode {
case "", "manual", "auto", "disabled":
default:
return errors.New("mesh listen port mode must be manual, auto, or disabled")
return errors.New("fabric listen port mode must be manual, auto, or disabled")
}
if cfg.MeshListenAutoPortStart < 0 || cfg.MeshListenAutoPortEnd < 0 {
return errors.New("mesh listen auto port range must not be negative")
if cfg.FabricListenAutoPortStart < 0 || cfg.FabricListenAutoPortEnd < 0 {
return errors.New("fabric listen auto port range must not be negative")
}
if cfg.MeshListenAutoPortStart > 0 && cfg.MeshListenAutoPortEnd > 0 && cfg.MeshListenAutoPortStart > cfg.MeshListenAutoPortEnd {
return errors.New("mesh listen auto port start must be less than or equal to end")
if cfg.FabricListenAutoPortStart > 0 && cfg.FabricListenAutoPortEnd > 0 && cfg.FabricListenAutoPortStart > cfg.FabricListenAutoPortEnd {
return errors.New("fabric listen auto port start must be less than or equal to end")
}
if cfg.MeshAdvertiseTransport != "" && !isQUICAdvertiseTransport(cfg.MeshAdvertiseTransport) {
return errors.New("mesh advertise transport must be a QUIC transport label")
}
if hasLegacyEndpointScheme(cfg.MeshAdvertiseEndpoint) {
if hasUnsupportedEndpointScheme(cfg.MeshAdvertiseEndpoint) {
return errors.New("mesh advertise endpoint must be a QUIC endpoint")
}
if cfg.ProductionObservationSinkCap < 0 {
@@ -174,12 +183,12 @@ func isQUICAdvertiseTransport(label string) bool {
}
}
func hasLegacyEndpointScheme(endpoint string) bool {
func hasUnsupportedEndpointScheme(endpoint string) bool {
endpoint = strings.ToLower(strings.TrimSpace(endpoint))
return strings.HasPrefix(endpoint, "http://") ||
strings.HasPrefix(endpoint, "https://") ||
strings.HasPrefix(endpoint, "ws://") ||
strings.HasPrefix(endpoint, "wss://")
if endpoint == "" || !strings.Contains(endpoint, "://") {
return false
}
return !strings.HasPrefix(endpoint, "quic://")
}
func isJSONArray(value string) bool {
@@ -6,7 +6,6 @@ import (
"encoding/hex"
"fmt"
"io"
"net/http"
"os"
"os/exec"
"path/filepath"
@@ -126,15 +125,15 @@ func (m DockerManager) ensureImageFromArtifact(ctx context.Context, runner Comma
return true, nil
}
func downloadFirstArtifact(ctx context.Context, urls []string, expectedSHA256 string, expectedSizeBytes int64) (string, error) {
func downloadFirstArtifact(ctx context.Context, paths []string, expectedSHA256 string, expectedSizeBytes int64) (string, error) {
var lastErr error
for _, rawURL := range urls {
rawURL = strings.TrimSpace(rawURL)
if rawURL == "" {
for _, rawPath := range paths {
rawPath = strings.TrimSpace(rawPath)
if rawPath == "" {
continue
}
for attempt := 1; attempt <= 3; attempt++ {
path, err := downloadArtifact(ctx, rawURL, expectedSHA256, expectedSizeBytes)
path, err := downloadArtifact(ctx, rawPath, expectedSHA256, expectedSizeBytes)
if err == nil {
return path, nil
}
@@ -144,29 +143,34 @@ func downloadFirstArtifact(ctx context.Context, urls []string, expectedSHA256 st
if lastErr != nil {
return "", lastErr
}
return "", fmt.Errorf("no artifact URLs configured")
return "", fmt.Errorf("no artifact paths configured")
}
func downloadArtifact(ctx context.Context, rawURL, expectedSHA256 string, expectedSizeBytes int64) (string, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
func downloadArtifact(ctx context.Context, rawPath, expectedSHA256 string, expectedSizeBytes int64) (string, error) {
select {
case <-ctx.Done():
return "", ctx.Err()
default:
}
source := strings.TrimSpace(rawPath)
if source == "" {
return "", fmt.Errorf("artifact path is empty")
}
if strings.Contains(source, "://") {
return "", fmt.Errorf("network artifact reference %q is disabled; update artifacts must arrive via quic fabric", source)
}
input, err := os.Open(source)
if err != nil {
return "", err
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
return "", fmt.Errorf("download artifact %s: %w", rawURL, err)
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return "", fmt.Errorf("download artifact %s: %s", rawURL, resp.Status)
return "", fmt.Errorf("open artifact %s: %w", source, err)
}
defer input.Close()
file, err := os.CreateTemp("", "rap-docker-image-*.tar")
if err != nil {
return "", err
}
path := file.Name()
hasher := sha256.New()
written, copyErr := io.Copy(io.MultiWriter(file, hasher), resp.Body)
written, copyErr := io.Copy(io.MultiWriter(file, hasher), input)
closeErr := file.Close()
if copyErr != nil {
os.Remove(path)
@@ -176,21 +180,17 @@ func downloadArtifact(ctx context.Context, rawURL, expectedSHA256 string, expect
os.Remove(path)
return "", closeErr
}
if resp.ContentLength >= 0 && written != resp.ContentLength {
os.Remove(path)
return "", fmt.Errorf("artifact download truncated for %s: got %d bytes want content-length %d", rawURL, written, resp.ContentLength)
}
if expectedSizeBytes > 0 && written != expectedSizeBytes {
if strings.TrimSpace(expectedSHA256) != "" {
os.Remove(path)
return "", fmt.Errorf("artifact size mismatch for %s: got %d bytes want %d", rawURL, written, expectedSizeBytes)
return "", fmt.Errorf("artifact size mismatch for %s: got %d bytes want %d", source, written, expectedSizeBytes)
}
fmt.Printf("artifact size mismatch for %s: got %d bytes want %d; proceeding without checksum for backward-compatible installs\n", rawURL, written, expectedSizeBytes)
fmt.Printf("artifact size mismatch for %s: got %d bytes want %d; proceeding because checksum is absent\n", source, written, expectedSizeBytes)
}
actual := hex.EncodeToString(hasher.Sum(nil))
if expected := strings.TrimSpace(expectedSHA256); expected != "" && !strings.EqualFold(actual, expected) {
os.Remove(path)
return "", fmt.Errorf("artifact checksum mismatch for %s: got %s want %s", rawURL, actual, expected)
return "", fmt.Errorf("artifact checksum mismatch for %s: got %s want %s", source, actual, expected)
}
return path, nil
}
@@ -254,7 +254,6 @@ func NodeAgentEnvWithStateDir(cfg RuntimeConfig, stateDir string) []string {
cfg = cfg.Normalize()
stateDir = firstNonEmpty(stateDir, cfg.StateDir)
env := []string{
"RAP_BACKEND_URL=" + cfg.BackendURL,
"RAP_CLUSTER_ID=" + cfg.ClusterID,
"RAP_NODE_NAME=" + cfg.NodeName,
"RAP_NODE_STATE_DIR=" + stateDir,
@@ -262,7 +261,7 @@ func NodeAgentEnvWithStateDir(cfg RuntimeConfig, stateDir string) []string {
"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS=" + strconv.Itoa(cfg.EnrollmentPollIntervalSeconds),
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=" + strconv.Itoa(cfg.EnrollmentPollTimeoutSeconds),
"RAP_WORKLOAD_SUPERVISION_ENABLED=" + boolString(cfg.WorkloadSupervisionEnabled),
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=" + boolString(cfg.MeshSyntheticRuntimeEnabled),
"RAP_FABRIC_RUNTIME_ENABLED=" + boolString(cfg.FabricRuntimeEnabled),
"RAP_MESH_PRODUCTION_FORWARDING_ENABLED=" + boolString(cfg.MeshProductionForwardingEnabled),
"RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED=" + boolString(cfg.VPNFabricSessionTransportEnabled),
"RAP_MESH_QUIC_FABRIC_ENABLED=" + boolString(cfg.MeshQUICFabricEnabled),
@@ -270,23 +269,26 @@ func NodeAgentEnvWithStateDir(cfg RuntimeConfig, stateDir string) []string {
"RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN=" + strconv.Itoa(cfg.VPNFabricQUICMaxStreamsPerConn),
"RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS=" + strconv.Itoa(cfg.VPNFabricQUICIdleTTLSeconds),
}
if cfg.ClusterAuthorityPublicKey != "" {
env = append(env, "RAP_CLUSTER_AUTHORITY_PUBLIC_KEY="+cfg.ClusterAuthorityPublicKey)
}
if cfg.JoinToken != "" {
env = append(env, "RAP_JOIN_TOKEN="+cfg.JoinToken)
}
if cfg.MeshListenAddr != "" {
env = append(env, "RAP_MESH_LISTEN_ADDR="+cfg.MeshListenAddr)
if cfg.FabricListenAddr != "" {
env = append(env, "RAP_FABRIC_LISTEN_ADDR="+cfg.FabricListenAddr)
}
if cfg.MeshQUICFabricListenAddr != "" {
env = append(env, "RAP_MESH_QUIC_FABRIC_LISTEN_ADDR="+cfg.MeshQUICFabricListenAddr)
}
if cfg.MeshListenPortMode != "" {
env = append(env, "RAP_MESH_LISTEN_PORT_MODE="+cfg.MeshListenPortMode)
if cfg.FabricListenPortMode != "" {
env = append(env, "RAP_FABRIC_LISTEN_PORT_MODE="+cfg.FabricListenPortMode)
}
if cfg.MeshListenAutoPortStart > 0 {
env = append(env, "RAP_MESH_LISTEN_AUTO_PORT_START="+strconv.Itoa(cfg.MeshListenAutoPortStart))
if cfg.FabricListenAutoPortStart > 0 {
env = append(env, "RAP_FABRIC_LISTEN_AUTO_PORT_START="+strconv.Itoa(cfg.FabricListenAutoPortStart))
}
if cfg.MeshListenAutoPortEnd > 0 {
env = append(env, "RAP_MESH_LISTEN_AUTO_PORT_END="+strconv.Itoa(cfg.MeshListenAutoPortEnd))
if cfg.FabricListenAutoPortEnd > 0 {
env = append(env, "RAP_FABRIC_LISTEN_AUTO_PORT_END="+strconv.Itoa(cfg.FabricListenAutoPortEnd))
}
if cfg.MeshAdvertiseEndpoint != "" {
env = append(env, "RAP_MESH_ADVERTISE_ENDPOINT="+cfg.MeshAdvertiseEndpoint)
@@ -306,6 +308,15 @@ func NodeAgentEnvWithStateDir(cfg RuntimeConfig, stateDir string) []string {
if cfg.MeshNATType != "" {
env = append(env, "RAP_MESH_NAT_TYPE="+cfg.MeshNATType)
}
if cfg.MeshSiteID != "" {
env = append(env, "RAP_MESH_SITE_ID="+cfg.MeshSiteID)
}
if cfg.MeshLocalityGroupID != "" {
env = append(env, "RAP_MESH_LOCALITY_GROUP_ID="+cfg.MeshLocalityGroupID)
}
if cfg.MeshNATGroupID != "" {
env = append(env, "RAP_MESH_NAT_GROUP_ID="+cfg.MeshNATGroupID)
}
if cfg.MeshRegion != "" {
env = append(env, "RAP_MESH_REGION="+cfg.MeshRegion)
}
@@ -2,14 +2,19 @@ package hostagent
import (
"context"
"crypto/ed25519"
cryptorand "crypto/rand"
"encoding/base64"
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
"time"
clusterauth "github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh"
)
type recordingRunner struct {
@@ -48,6 +53,27 @@ type imagePresentRunner struct {
calls [][]string
}
// inspectRuntimeRunner is a test runner that answers `docker inspect`
// invocations with a fixed, pre-configured output.
type inspectRuntimeRunner struct {
	// output is returned verbatim for every matching docker inspect call.
	output string
}

// Run returns r.output when name is "docker", the first argument is "inspect"
// and at least two arguments are given. Every other invocation yields an empty
// string. The returned error is always nil.
func (r *inspectRuntimeRunner) Run(_ context.Context, name string, args ...string) (string, error) {
	// The length check comes before args[0] so short argument lists are safe.
	isInspect := name == "docker" && len(args) >= 2 && args[0] == "inspect"
	if !isInspect {
		return "", nil
	}
	return r.output, nil
}
// testFabricRuntimeConfig builds a RuntimeConfig fixture with the cluster ID,
// cluster authority public key, fabric registry records, join token and node
// name populated; every other field keeps its zero value.
func testFabricRuntimeConfig() RuntimeConfig {
	var cfg RuntimeConfig
	cfg.ClusterID = "cluster-1"
	cfg.ClusterAuthorityPublicKey = "authority-key-b64"
	cfg.FabricRegistryRecordsJSON = `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`
	cfg.JoinToken = "join-secret"
	cfg.NodeName = "node-a"
	return cfg
}
func (r *imagePresentRunner) Run(_ context.Context, name string, args ...string) (string, error) {
r.calls = append(r.calls, append([]string{name}, args...))
if len(args) > 0 && args[0] == "run" {
@@ -58,21 +84,21 @@ func (r *imagePresentRunner) Run(_ context.Context, name string, args ...string)
func TestDockerRunArgsBuildNodeRuntimePlacement(t *testing.T) {
args := DockerRunArgs(RuntimeConfig{
BackendURL: "http://control/api/v1/",
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "/srv/rap/node-a",
MeshSyntheticRuntimeEnabled: true,
FabricRuntimeEnabled: true,
VPNFabricSessionTransportEnabled: true,
MeshQUICFabricEnabled: true,
MeshQUICFabricListenAddr: ":19443",
VPNFabricSessionStreamShards: 6,
VPNFabricQUICMaxStreamsPerConn: 24,
VPNFabricQUICIdleTTLSeconds: 120,
MeshListenAddr: ":19131",
FabricListenAddr: ":19131",
MeshAdvertiseEndpoint: "quic://10.0.0.11:19443/",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
MeshAdvertiseTransport: "direct_quic",
@@ -83,19 +109,19 @@ func TestDockerRunArgsBuildNodeRuntimePlacement(t *testing.T) {
for _, want := range []string{
"run", "-d", "--name\x00rap-node-agent-node-a", "--network\x00host",
"-v\x00/srv/rap/node-a:/var/lib/rap-node-agent",
"RAP_BACKEND_URL=http://control/api/v1",
"RAP_CLUSTER_ID=cluster-1",
"RAP_CLUSTER_AUTHORITY_PUBLIC_KEY=authority-key-b64",
"RAP_JOIN_TOKEN=join-secret",
"RAP_NODE_STATE_DIR=/var/lib/rap-node-agent",
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=0",
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true",
"RAP_FABRIC_RUNTIME_ENABLED=true",
"RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED=true",
"RAP_MESH_QUIC_FABRIC_ENABLED=true",
"RAP_MESH_QUIC_FABRIC_LISTEN_ADDR=:19443",
"RAP_VPN_FABRIC_SESSION_STREAM_SHARDS=6",
"RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN=24",
"RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS=120",
"RAP_MESH_LISTEN_ADDR=:19131",
"RAP_FABRIC_LISTEN_ADDR=:19131",
"RAP_MESH_ADVERTISE_ENDPOINT=quic://10.0.0.11:19443",
`RAP_FABRIC_REGISTRY_RECORDS_JSON=[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
"RAP_MESH_ADVERTISE_TRANSPORT=direct_quic",
@@ -110,7 +136,6 @@ func TestDockerRunArgsBuildNodeRuntimePlacement(t *testing.T) {
func TestDockerRunArgsEnableVPNGatewayDevice(t *testing.T) {
args := DockerRunArgs(RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
@@ -130,6 +155,40 @@ func TestDockerRunArgsEnableVPNGatewayDevice(t *testing.T) {
}
}
// TestRuntimeConfigFromContainerReadsFabricListenEnv feeds a canned
// `docker inspect` document to runtimeConfigFromContainer and checks that the
// RAP_FABRIC_LISTEN_* environment entries are reflected in the returned
// config: listen address, port mode, and the auto-port range.
func TestRuntimeConfigFromContainerReadsFabricListenEnv(t *testing.T) {
	inspectJSON := fmt.Sprintf(`[{
"Config":{
"Image":"rap-node-agent:test",
"Env":[
"RAP_CLUSTER_ID=cluster-1",
"RAP_NODE_NAME=node-a",
"RAP_FABRIC_LISTEN_ADDR=:19131",
"RAP_FABRIC_LISTEN_PORT_MODE=auto",
"RAP_FABRIC_LISTEN_AUTO_PORT_START=19131",
"RAP_FABRIC_LISTEN_AUTO_PORT_END=19231"
]
},
"HostConfig":{
"NetworkMode":"host",
"RestartPolicy":{"Name":"unless-stopped"},
"CapAdd":[],
"Devices":[],
"Privileged":false
},
"Mounts":[{"Source":"/srv/rap/node-a","Destination":"/var/lib/rap-node-agent"}]
}]`)
	runner := &inspectRuntimeRunner{output: inspectJSON}
	manager := DockerManager{}

	_, cfg, err := manager.runtimeConfigFromContainer(context.Background(), runner, "docker", "rap-node-agent-node-a")
	if err != nil {
		t.Fatalf("runtime config from container: %v", err)
	}

	// The first failing group aborts the test, matching two sequential checks.
	switch {
	case cfg.FabricListenAddr != ":19131", cfg.FabricListenPortMode != "auto":
		t.Fatalf("fabric listen env was not read: %+v", cfg)
	case cfg.FabricListenAutoPortStart != 19131, cfg.FabricListenAutoPortEnd != 19231:
		t.Fatalf("fabric listen auto range was not read: %+v", cfg)
	}
}
func TestPrepareStateDirCreatesWritableHostPath(t *testing.T) {
dir := filepath.Join(t.TempDir(), "node-state")
if err := PrepareStateDir(dir); err != nil {
@@ -153,92 +212,23 @@ func TestPrepareStateDirSkipsNamedVolume(t *testing.T) {
}
}
// TestFetchDockerInstallProfileBuildsRuntimeConfig serves a docker install
// profile from a local httptest server, fetches it with
// FetchDockerInstallProfile, and checks the fields of the normalized
// RuntimeConfig produced by RuntimeConfigFromProfile.
func TestFetchDockerInstallProfileBuildsRuntimeConfig(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		// The handler runs on a server goroutine, where t.Fatalf must not be
		// called. Record the failure with t.Errorf and reply with an error
		// status instead.
		if r.URL.Path != "/api/v1/node-agents/docker-install-profile" {
			t.Errorf("path = %s", r.URL.Path)
			http.Error(w, "unexpected path", http.StatusNotFound)
			return
		}
		payload := map[string]any{
			"docker_install_profile": map[string]any{
				"cluster_id":         "cluster-1",
				"backend_url":        "https://control.example.test/api/v1",
				"join_token":         "rap_join_profile",
				"node_name":          "node-a",
				"image":              "rap-node-agent:test",
				"artifact_endpoints": []string{"https://cache.example.test/artifacts"},
				"fabric_registry_records": []map[string]any{{
					"schema":        "rap.fabric.registry.gossip_record.v1",
					"service_class": "control-api",
					"service_id":    "control-a",
				}},
				"docker_image_artifact": map[string]any{
					"kind":       "docker_image_tar",
					"image":      "rap-node-agent:test",
					"file_name":  "rap-node-agent-test.tar",
					"size_bytes": 21,
				},
				"container_name":                       "rap-node-agent-node-a",
				"state_dir":                            "/var/lib/rap/nodes/node-a",
				"network":                              "host",
				"restart_policy":                       "unless-stopped",
				"replace":                              true,
				"mesh_synthetic_runtime_enabled":       true,
				"vpn_fabric_session_transport_enabled": true,
				"mesh_quic_fabric_enabled":             true,
				"mesh_quic_fabric_listen_addr":         ":19443",
				"vpn_fabric_session_stream_shards":     6,
				"mesh_connectivity_mode":               "outbound_only",
			},
		}
		if err := json.NewEncoder(w).Encode(payload); err != nil {
			t.Errorf("encode profile: %v", err)
		}
	}))
	defer server.Close()
	profile, err := FetchDockerInstallProfile(context.Background(), ProfileRequest{
		URL:          server.URL + "/api/v1",
		ClusterID:    "cluster-1",
		InstallToken: "rap_join_profile",
		NodeName:     "node-a",
	})
	if err != nil {
		t.Fatalf("fetch profile: %v", err)
	}
	cfg := RuntimeConfigFromProfile(profile).Normalize()
	// Any single mismatch fails the test; the full config is printed so the
	// offending field can be spotted.
	if cfg.BackendURL != "https://control.example.test/api/v1" ||
		cfg.ClusterID != "cluster-1" ||
		cfg.JoinToken != "rap_join_profile" ||
		cfg.ContainerName != "rap-node-agent-node-a" ||
		len(cfg.ImageArtifactURLs) != 1 ||
		cfg.ImageArtifactSizeBytes != 21 ||
		!cfg.MeshSyntheticRuntimeEnabled ||
		!cfg.VPNFabricSessionTransportEnabled ||
		!cfg.MeshQUICFabricEnabled ||
		cfg.MeshQUICFabricListenAddr != ":19443" ||
		cfg.VPNFabricSessionStreamShards != 6 ||
		cfg.FabricRegistryRecordsJSON != `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api","service_id":"control-a"}]` ||
		cfg.MeshConnectivityMode != "outbound_only" {
		t.Fatalf("unexpected cfg: %+v", cfg)
	}
}
func TestInstallLoadsImageArtifactWhenImageMissing(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte("fake docker image tar"))
}))
defer server.Close()
artifactPath := writeDockerImageArtifact(t, "fake docker image tar")
runner := &imageMissingRunner{}
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
ImageArtifactSizeBytes: 21,
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{artifactPath},
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
ImageArtifactSizeBytes: 21,
})
if err != nil {
t.Fatalf("install: %v", err)
@@ -255,24 +245,22 @@ func TestInstallLoadsImageArtifactWhenImageMissing(t *testing.T) {
func TestInstallAcceptsSizeMismatchWhenChecksumMissing(t *testing.T) {
const payload = "fake docker image tar"
const wrongSize = 999
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte(payload))
}))
defer server.Close()
artifactPath := writeDockerImageArtifact(t, payload)
runner := &imageMissingRunner{}
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
ImageArtifactSHA256: "", // intentionally absent -> size mismatch should not block install
ImageArtifactSizeBytes: wrongSize,
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{artifactPath},
ImageArtifactSHA256: "", // intentionally absent -> size mismatch should not block install
ImageArtifactSizeBytes: wrongSize,
})
if err != nil {
t.Fatalf("install: %v", err)
@@ -283,24 +271,22 @@ func TestInstallAcceptsSizeMismatchWhenChecksumMissing(t *testing.T) {
}
func TestInstallReloadsImageArtifactWhenReplacingMutableTag(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte("fake docker image tar"))
}))
defer server.Close()
artifactPath := writeDockerImageArtifact(t, "fake docker image tar")
runner := &imagePresentRunner{}
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
ImageArtifactSizeBytes: 21,
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{artifactPath},
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
ImageArtifactSizeBytes: 21,
})
if err != nil {
t.Fatalf("install: %v", err)
@@ -315,27 +301,22 @@ func TestInstallReloadsImageArtifactWhenReplacingMutableTag(t *testing.T) {
}
func TestDockerInstallLoadsExplicitArtifactBeforeReplace(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path != "/rap-node-agent-test.tar" {
t.Fatalf("unexpected path %s", r.URL.Path)
}
_, _ = w.Write([]byte("fake docker image tar"))
}))
defer server.Close()
artifactPath := writeDockerImageArtifact(t, "fake docker image tar")
runner := &imageMissingRunner{}
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
ImageArtifactSizeBytes: 21,
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{artifactPath},
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
ImageArtifactSizeBytes: 21,
})
if err != nil {
t.Fatalf("install: %v", err)
@@ -349,6 +330,15 @@ func TestDockerInstallLoadsExplicitArtifactBeforeReplace(t *testing.T) {
}
}
// writeDockerImageArtifact stores payload in a file named
// rap-node-agent-test.tar inside a fresh per-test temporary directory and
// returns the path of that file. The calling test is aborted when the write
// fails.
func writeDockerImageArtifact(t *testing.T, payload string) string {
	t.Helper()
	artifactDir := t.TempDir()
	artifactPath := filepath.Join(artifactDir, "rap-node-agent-test.tar")
	writeErr := os.WriteFile(artifactPath, []byte(payload), 0o600)
	if writeErr != nil {
		t.Fatalf("write artifact: %v", writeErr)
	}
	return artifactPath
}
func flattenCalls(calls [][]string) []string {
out := []string{}
for _, call := range calls {
@@ -360,14 +350,15 @@ func flattenCalls(calls [][]string) []string {
func TestInstallCanPullReplaceAndRedactsJoinToken(t *testing.T) {
runner := &recordingRunner{}
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
PullImage: true,
Replace: true,
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
JoinToken: "join-secret",
NodeName: "node-a",
PullImage: true,
Replace: true,
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
})
if err != nil {
t.Fatalf("install: %v", err)
@@ -385,44 +376,350 @@ func TestInstallCanPullReplaceAndRedactsJoinToken(t *testing.T) {
}
// TestValidateRequiresJoinTokenUnlessReplacingExistingState checks
// ValidateInstall in two steps: a config without a join token must yield an
// error mentioning "join-token", and the same config with Replace: true must
// validate without error.
// NOTE(review): each step appears twice below (one variant with BackendURL,
// one with ClusterAuthorityPublicKey/FabricRegistryRecordsJSON) — this looks
// like pre- and post-change lines side by side; confirm which pair is live.
func TestValidateRequiresJoinTokenUnlessReplacingExistingState(t *testing.T) {
err := RuntimeConfig{BackendURL: "http://control/api/v1", ClusterID: "cluster-1", NodeName: "node-a"}.ValidateInstall()
err := RuntimeConfig{ClusterID: "cluster-1", ClusterAuthorityPublicKey: "authority-key-b64", FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`, NodeName: "node-a"}.ValidateInstall()
if err == nil || !strings.Contains(err.Error(), "join-token") {
t.Fatalf("expected join token validation error, got %v", err)
}
err = RuntimeConfig{BackendURL: "http://control/api/v1", ClusterID: "cluster-1", NodeName: "node-a", Replace: true}.ValidateInstall()
err = RuntimeConfig{ClusterID: "cluster-1", ClusterAuthorityPublicKey: "authority-key-b64", FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`, NodeName: "node-a", Replace: true}.ValidateInstall()
if err != nil {
t.Fatalf("replace update should allow missing join token: %v", err)
}
}
// NOTE(review): this span carries two func headers and two sets of config
// fields (one with BackendURL and mesh advertise settings, one with
// ClusterAuthorityPublicKey and FabricRegistryRecordsJSON) — it looks like
// pre- and post-change lines side by side; confirm which set is live.
func TestValidateRejectsLegacyMeshAdvertiseTransport(t *testing.T) {
// TestValidateAllowsFabricBootstrapWithoutBackendURL expects ValidateInstall
// to return nil for the RuntimeConfig literal below.
func TestValidateAllowsFabricBootstrapWithoutBackendURL(t *testing.T) {
err := RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
MeshAdvertiseEndpoint: "quic://10.0.0.11:19443",
MeshAdvertiseTransport: "wss",
MeshQUICFabricEnabled: true,
MeshQUICFabricListenAddr: ":19443",
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
JoinToken: "join-secret",
NodeName: "node-a",
}.ValidateInstall()
// Any validation error fails the test.
if err != nil {
t.Fatalf("fabric-native install should validate: %v", err)
}
}
// TestValidateRequiresAuthorityKeyForFabricBootstrap checks that
// ValidateInstall rejects a config that carries fabric registry records but
// no ClusterAuthorityPublicKey, and that the returned error mentions
// "cluster-authority-public-key".
func TestValidateRequiresAuthorityKeyForFabricBootstrap(t *testing.T) {
	cfg := RuntimeConfig{
		ClusterID:                 "cluster-1",
		FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
		JoinToken:                 "join-secret",
		NodeName:                  "node-a",
	}
	validationErr := cfg.ValidateInstall()
	// Pass only when an error is returned and it names the missing setting.
	if validationErr != nil && strings.Contains(validationErr.Error(), "cluster-authority-public-key") {
		return
	}
	t.Fatalf("expected authority key validation error, got %v", validationErr)
}
// TestLoadDockerJoinBundleRejectsUnsignedEnvelope writes a bundle file that
// holds only a docker_install_profile object (no authority record, payload or
// signature) and expects LoadDockerJoinBundle to fail with an error containing
// "join bundle authority envelope is missing".
func TestLoadDockerJoinBundleRejectsUnsignedEnvelope(t *testing.T) {
path := filepath.Join(t.TempDir(), "bundle.json")
if err := os.WriteFile(path, []byte(`{
"docker_install_profile": {
"cluster_id": "cluster-1",
"cluster_authority_public_key": "authority-key-b64",
"join_token": "join-secret",
"node_name": "node-a",
"fabric_registry_records": [{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]
}
}`), 0o600); err != nil {
t.Fatalf("write bundle: %v", err)
}
_, err := LoadDockerJoinBundle(path)
// Both a nil error and an unrelated error count as failure.
if err == nil || !strings.Contains(err.Error(), "join bundle authority envelope is missing") {
t.Fatalf("expected unsigned bundle error, got %v", err)
}
}
// TestLoadDockerJoinBundleVerifiesAuthoritySignature builds a signed join
// bundle with a freshly generated Ed25519 key pair: the canonical JSON of
// {"docker_install_profile": signedProfile} is signed, and the bundle file
// embeds the authority record, that payload, the signature and the same
// profile. LoadDockerJoinBundle is expected to succeed and return a profile
// whose NodeName is "node-a".
func TestLoadDockerJoinBundleVerifiesAuthoritySignature(t *testing.T) {
publicKey, privateKey, err := ed25519.GenerateKey(cryptorand.Reader)
if err != nil {
t.Fatalf("GenerateKey: %v", err)
}
signedProfile := map[string]any{
"cluster_id": "cluster-1",
"cluster_authority_public_key": base64.StdEncoding.EncodeToString(publicKey),
"join_token": "join-secret",
"node_name": "node-a",
}
authorityPayload, err := json.Marshal(map[string]any{
"docker_install_profile": signedProfile,
})
if err != nil {
t.Fatalf("marshal authority payload: %v", err)
}
// The signature is computed over the canonicalized payload, not the raw
// json.Marshal output.
canonical, err := clusterauth.CanonicalJSON(authorityPayload)
if err != nil {
t.Fatalf("CanonicalJSON: %v", err)
}
signed := ed25519.Sign(privateKey, canonical)
path := filepath.Join(t.TempDir(), "bundle.json")
// The template's 13 placeholders are filled positionally: authority schema,
// key algorithm, public key, key fingerprint, created_at, updated_at,
// authority payload, signature schema, signature algorithm, key fingerprint,
// signature, signed_at, top-level profile.
if err := os.WriteFile(path, []byte(fmt.Sprintf(`{
"schema_version": "rap.install_join_bundle.v1",
"bundle_kind": "docker",
"cluster_id": "cluster-1",
"cluster_authority": {
"schema_version": "%s",
"cluster_id": "cluster-1",
"authority_state": "active",
"key_algorithm": "%s",
"public_key": "%s",
"public_key_fingerprint": "%s",
"created_at": "%s",
"updated_at": "%s"
},
"authority_payload": %s,
"authority_signature": {
"schema_version": "%s",
"algorithm": "%s",
"key_fingerprint": "%s",
"signature": "%s",
"signed_at": "%s"
},
"docker_install_profile": %s
}`, clusterauth.AuthoritySchemaVersion, clusterauth.AlgorithmEd25519, base64.StdEncoding.EncodeToString(publicKey), clusterauth.Fingerprint(publicKey), time.Now().UTC().Format(time.RFC3339), time.Now().UTC().Format(time.RFC3339), string(authorityPayload), clusterauth.SignatureSchemaVersion, clusterauth.AlgorithmEd25519, clusterauth.Fingerprint(publicKey), base64.StdEncoding.EncodeToString(signed), time.Now().UTC().Format(time.RFC3339), mustBundleJSON(t, signedProfile))), 0o600); err != nil {
t.Fatalf("write bundle: %v", err)
}
loaded, err := LoadDockerJoinBundle(path)
if err != nil {
t.Fatalf("LoadDockerJoinBundle: %v", err)
}
if loaded.NodeName != "node-a" {
t.Fatalf("unexpected loaded profile: %+v", loaded)
}
}
// TestLoadDockerJoinBundleRejectsTamperedSignedProfile signs a payload whose
// profile has node_name "node-a" but writes a bundle whose top-level
// docker_install_profile has node_name "node-b". LoadDockerJoinBundle is
// expected to fail with an error containing
// "does not match signed authority payload".
func TestLoadDockerJoinBundleRejectsTamperedSignedProfile(t *testing.T) {
publicKey, privateKey, err := ed25519.GenerateKey(cryptorand.Reader)
if err != nil {
t.Fatalf("GenerateKey: %v", err)
}
signedProfile := map[string]any{
"cluster_id": "cluster-1",
"cluster_authority_public_key": base64.StdEncoding.EncodeToString(publicKey),
"join_token": "join-secret",
"node_name": "node-a",
}
// Identical to signedProfile except for node_name.
tamperedProfile := map[string]any{
"cluster_id": "cluster-1",
"cluster_authority_public_key": base64.StdEncoding.EncodeToString(publicKey),
"join_token": "join-secret",
"node_name": "node-b",
}
authorityPayload, err := json.Marshal(map[string]any{
"docker_install_profile": signedProfile,
})
if err != nil {
t.Fatalf("marshal authority payload: %v", err)
}
canonical, err := clusterauth.CanonicalJSON(authorityPayload)
if err != nil {
t.Fatalf("CanonicalJSON: %v", err)
}
signed := ed25519.Sign(privateKey, canonical)
path := filepath.Join(t.TempDir(), "bundle.json")
// The signed authority_payload carries signedProfile, while the last
// placeholder (the top-level profile) is filled with tamperedProfile.
if err := os.WriteFile(path, []byte(fmt.Sprintf(`{
"schema_version": "rap.install_join_bundle.v1",
"bundle_kind": "docker",
"cluster_id": "cluster-1",
"cluster_authority": {
"schema_version": "%s",
"cluster_id": "cluster-1",
"authority_state": "active",
"key_algorithm": "%s",
"public_key": "%s",
"public_key_fingerprint": "%s",
"created_at": "%s",
"updated_at": "%s"
},
"authority_payload": %s,
"authority_signature": {
"schema_version": "%s",
"algorithm": "%s",
"key_fingerprint": "%s",
"signature": "%s",
"signed_at": "%s"
},
"docker_install_profile": %s
}`, clusterauth.AuthoritySchemaVersion, clusterauth.AlgorithmEd25519, base64.StdEncoding.EncodeToString(publicKey), clusterauth.Fingerprint(publicKey), time.Now().UTC().Format(time.RFC3339), time.Now().UTC().Format(time.RFC3339), string(authorityPayload), clusterauth.SignatureSchemaVersion, clusterauth.AlgorithmEd25519, clusterauth.Fingerprint(publicKey), base64.StdEncoding.EncodeToString(signed), time.Now().UTC().Format(time.RFC3339), mustBundleJSON(t, tamperedProfile))), 0o600); err != nil {
t.Fatalf("write bundle: %v", err)
}
_, err = LoadDockerJoinBundle(path)
if err == nil || !strings.Contains(err.Error(), "does not match signed authority payload") {
t.Fatalf("expected signed bundle mismatch error, got %v", err)
}
}
// TestLoadDockerJoinBundleRejectsSignedProfileAuthorityKeyMismatch builds a
// bundle that is signed with one Ed25519 key while the profile's
// cluster_authority_public_key names a second, unrelated key.
// LoadDockerJoinBundle is expected to fail with an error containing
// "profile authority key does not match signed bundle authority key".
func TestLoadDockerJoinBundleRejectsSignedProfileAuthorityKeyMismatch(t *testing.T) {
publicKey, privateKey, err := ed25519.GenerateKey(cryptorand.Reader)
if err != nil {
t.Fatalf("GenerateKey: %v", err)
}
// Second key pair; only its public half is used, inside the profile.
otherPublicKey, _, err := ed25519.GenerateKey(cryptorand.Reader)
if err != nil {
t.Fatalf("GenerateKey(other): %v", err)
}
signedProfile := map[string]any{
"cluster_id": "cluster-1",
"cluster_authority_public_key": base64.StdEncoding.EncodeToString(otherPublicKey),
"join_token": "join-secret",
"node_name": "node-a",
}
authorityPayload, err := json.Marshal(map[string]any{
"docker_install_profile": signedProfile,
})
if err != nil {
t.Fatalf("marshal authority payload: %v", err)
}
canonical, err := clusterauth.CanonicalJSON(authorityPayload)
if err != nil {
t.Fatalf("CanonicalJSON: %v", err)
}
signed := ed25519.Sign(privateKey, canonical)
path := filepath.Join(t.TempDir(), "bundle.json")
// The authority record and signature use publicKey; the profile (both in the
// signed payload and at top level) references otherPublicKey.
if err := os.WriteFile(path, []byte(fmt.Sprintf(`{
"schema_version": "rap.install_join_bundle.v1",
"bundle_kind": "docker",
"cluster_id": "cluster-1",
"cluster_authority": {
"schema_version": "%s",
"cluster_id": "cluster-1",
"authority_state": "active",
"key_algorithm": "%s",
"public_key": "%s",
"public_key_fingerprint": "%s",
"created_at": "%s",
"updated_at": "%s"
},
"authority_payload": %s,
"authority_signature": {
"schema_version": "%s",
"algorithm": "%s",
"key_fingerprint": "%s",
"signature": "%s",
"signed_at": "%s"
},
"docker_install_profile": %s
}`, clusterauth.AuthoritySchemaVersion, clusterauth.AlgorithmEd25519, base64.StdEncoding.EncodeToString(publicKey), clusterauth.Fingerprint(publicKey), time.Now().UTC().Format(time.RFC3339), time.Now().UTC().Format(time.RFC3339), string(authorityPayload), clusterauth.SignatureSchemaVersion, clusterauth.AlgorithmEd25519, clusterauth.Fingerprint(publicKey), base64.StdEncoding.EncodeToString(signed), time.Now().UTC().Format(time.RFC3339), mustBundleJSON(t, signedProfile))), 0o600); err != nil {
t.Fatalf("write bundle: %v", err)
}
_, err = LoadDockerJoinBundle(path)
if err == nil || !strings.Contains(err.Error(), "profile authority key does not match signed bundle authority key") {
t.Fatalf("expected authority key mismatch error, got %v", err)
}
}
// TestLoadDockerJoinBundleRejectsSignedProfileClusterIDMismatch builds a
// validly signed bundle whose profile carries cluster_id "cluster-2" while the
// bundle and its authority record carry "cluster-1". LoadDockerJoinBundle is
// expected to fail with an error containing
// "profile cluster_id does not match signed bundle cluster_id".
func TestLoadDockerJoinBundleRejectsSignedProfileClusterIDMismatch(t *testing.T) {
publicKey, privateKey, err := ed25519.GenerateKey(cryptorand.Reader)
if err != nil {
t.Fatalf("GenerateKey: %v", err)
}
// cluster_id deliberately differs from the "cluster-1" hard-coded in the
// bundle template below.
signedProfile := map[string]any{
"cluster_id": "cluster-2",
"cluster_authority_public_key": base64.StdEncoding.EncodeToString(publicKey),
"join_token": "join-secret",
"node_name": "node-a",
}
authorityPayload, err := json.Marshal(map[string]any{
"docker_install_profile": signedProfile,
})
if err != nil {
t.Fatalf("marshal authority payload: %v", err)
}
canonical, err := clusterauth.CanonicalJSON(authorityPayload)
if err != nil {
t.Fatalf("CanonicalJSON: %v", err)
}
signed := ed25519.Sign(privateKey, canonical)
path := filepath.Join(t.TempDir(), "bundle.json")
if err := os.WriteFile(path, []byte(fmt.Sprintf(`{
"schema_version": "rap.install_join_bundle.v1",
"bundle_kind": "docker",
"cluster_id": "cluster-1",
"cluster_authority": {
"schema_version": "%s",
"cluster_id": "cluster-1",
"authority_state": "active",
"key_algorithm": "%s",
"public_key": "%s",
"public_key_fingerprint": "%s",
"created_at": "%s",
"updated_at": "%s"
},
"authority_payload": %s,
"authority_signature": {
"schema_version": "%s",
"algorithm": "%s",
"key_fingerprint": "%s",
"signature": "%s",
"signed_at": "%s"
},
"docker_install_profile": %s
}`, clusterauth.AuthoritySchemaVersion, clusterauth.AlgorithmEd25519, base64.StdEncoding.EncodeToString(publicKey), clusterauth.Fingerprint(publicKey), time.Now().UTC().Format(time.RFC3339), time.Now().UTC().Format(time.RFC3339), string(authorityPayload), clusterauth.SignatureSchemaVersion, clusterauth.AlgorithmEd25519, clusterauth.Fingerprint(publicKey), base64.StdEncoding.EncodeToString(signed), time.Now().UTC().Format(time.RFC3339), mustBundleJSON(t, signedProfile))), 0o600); err != nil {
t.Fatalf("write bundle: %v", err)
}
_, err = LoadDockerJoinBundle(path)
if err == nil || !strings.Contains(err.Error(), "profile cluster_id does not match signed bundle cluster_id") {
t.Fatalf("expected cluster mismatch error, got %v", err)
}
}
// mustBundleJSON returns the json.Marshal encoding of value as a string and
// aborts the calling test when the value cannot be marshaled.
func mustBundleJSON(t *testing.T, value any) string {
	t.Helper()
	encoded, marshalErr := json.Marshal(value)
	if marshalErr == nil {
		return string(encoded)
	}
	t.Fatalf("marshal bundle json: %v", marshalErr)
	return "" // not reached: Fatalf stops the test
}
// TestValidateRejectsDisallowedMeshAdvertiseTransport checks that
// ValidateInstall rejects a config whose MeshAdvertiseTransport is "wss" while
// the advertised endpoint is a quic:// address, and that the returned error
// mentions "QUIC transport".
func TestValidateRejectsDisallowedMeshAdvertiseTransport(t *testing.T) {
	cfg := RuntimeConfig{
		ClusterID:                 "cluster-1",
		ClusterAuthorityPublicKey: "authority-key-b64",
		FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
		JoinToken:                 "join-secret",
		NodeName:                  "node-a",
		MeshAdvertiseEndpoint:     "quic://10.0.0.11:19443",
		MeshAdvertiseTransport:    "wss",
		MeshQUICFabricEnabled:     true,
		MeshQUICFabricListenAddr:  ":19443",
	}
	validationErr := cfg.ValidateInstall()
	// Pass only when an error is returned and it names the transport problem.
	if validationErr != nil && strings.Contains(validationErr.Error(), "QUIC transport") {
		return
	}
	t.Fatalf("expected QUIC transport validation error, got %v", validationErr)
}
// NOTE(review): this span carries two func headers and two sets of config
// fields (one with BackendURL, one with ClusterAuthorityPublicKey and
// FabricRegistryRecordsJSON) — it looks like pre- and post-change lines side
// by side; confirm which set is live.
func TestValidateRejectsLegacyMeshAdvertiseEndpointScheme(t *testing.T) {
// TestValidateRejectsDisallowedMeshAdvertiseEndpointScheme expects
// ValidateInstall to return an error containing "QUIC endpoint" when
// MeshAdvertiseEndpoint is an http:// address and the transport is
// "direct_quic".
func TestValidateRejectsDisallowedMeshAdvertiseEndpointScheme(t *testing.T) {
err := RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
MeshAdvertiseEndpoint: "http://10.0.0.11:19131",
MeshAdvertiseTransport: "direct_quic",
MeshQUICFabricEnabled: true,
MeshQUICFabricListenAddr: ":19443",
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
JoinToken: "join-secret",
NodeName: "node-a",
MeshAdvertiseEndpoint: "http://10.0.0.11:19131",
MeshAdvertiseTransport: "direct_quic",
MeshQUICFabricEnabled: true,
MeshQUICFabricListenAddr: ":19443",
}.ValidateInstall()
// Both a nil error and an unrelated error count as failure.
if err == nil || !strings.Contains(err.Error(), "QUIC endpoint") {
t.Fatalf("expected QUIC endpoint validation error, got %v", err)
}
}
// TestPreferredUpdateServiceEndpointsPrioritizesHintOrder passes three
// registry endpoints plus a two-entry hint list to
// preferredUpdateServiceEndpoints and checks that all three endpoints are
// returned, with the first two positions following the hint order.
func TestPreferredUpdateServiceEndpointsPrioritizesHintOrder(t *testing.T) {
	endpoints := []mesh.FabricRegistryEndpoint{
		{EndpointID: "ep-1", Address: "quic://10.0.0.5:19443"},
		{EndpointID: "ep-2", Address: "quic://10.0.0.6:19443"},
		{EndpointID: "ep-3", Address: "quic://10.0.0.7:19443"},
	}
	hints := []string{
		"quic://10.0.0.7:19443",
		"quic://10.0.0.5:19443",
	}
	ordered := preferredUpdateServiceEndpoints(endpoints, hints)
	if count := len(ordered); count != 3 {
		t.Fatalf("ordered len = %d", count)
	}
	// Only the two hinted positions are pinned; the third is left unchecked.
	hintedFirst := ordered[0].Address == "quic://10.0.0.7:19443"
	hintedSecond := ordered[1].Address == "quic://10.0.0.5:19443"
	if !(hintedFirst && hintedSecond) {
		t.Fatalf("unexpected preferred ordering: %+v", ordered)
	}
}
+221 -28
View File
@@ -2,6 +2,7 @@ package hostagent
import (
"context"
"encoding/json"
"errors"
"fmt"
"os"
@@ -47,6 +48,7 @@ type LinuxInstallResult struct {
NodeAgentPath string
HostAgentPath string
EnvPath string
UpdaterEnvPath string
UnitName string
UnitPath string
UpdaterUnitName string
@@ -64,13 +66,14 @@ func LinuxInstallConfigFromProfile(profile LinuxInstallProfile) LinuxInstallConf
installDir := firstNonEmpty(profile.InstallDir, filepath.Join(DefaultLinuxInstallRoot, safeUnitSlug(profile.NodeName)))
return LinuxInstallConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: profile.BackendURL,
ClusterAuthorityPublicKey: strings.TrimSpace(profile.ClusterAuthorityPublicKey),
FabricRegistryRecordsJSON: strings.TrimSpace(string(profile.FabricRegistryRecords)),
ClusterID: profile.ClusterID,
JoinToken: profile.JoinToken,
NodeName: profile.NodeName,
StateDir: stateDir,
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
FabricRuntimeEnabled: profile.FabricRuntimeEnabled,
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
VPNFabricSessionTransportEnabled: profile.VPNFabricSessionTransportEnabled,
MeshQUICFabricEnabled: profile.MeshQUICFabricEnabled,
@@ -78,15 +81,18 @@ func LinuxInstallConfigFromProfile(profile LinuxInstallProfile) LinuxInstallConf
VPNFabricSessionStreamShards: profile.VPNFabricSessionStreamShards,
VPNFabricQUICMaxStreamsPerConn: profile.VPNFabricQUICMaxStreamsPerConn,
VPNFabricQUICIdleTTLSeconds: profile.VPNFabricQUICIdleTTLSeconds,
MeshListenAddr: profile.MeshListenAddr,
MeshListenPortMode: profile.MeshListenPortMode,
MeshListenAutoPortStart: profile.MeshListenAutoPortStart,
MeshListenAutoPortEnd: profile.MeshListenAutoPortEnd,
FabricListenAddr: profile.FabricListenAddr,
FabricListenPortMode: profile.FabricListenPortMode,
FabricListenAutoPortStart: profile.FabricListenAutoPortStart,
FabricListenAutoPortEnd: profile.FabricListenAutoPortEnd,
MeshAdvertiseEndpoint: profile.MeshAdvertiseEndpoint,
MeshAdvertiseEndpointsJSON: string(profile.MeshAdvertiseEndpointsJSON),
MeshAdvertiseTransport: profile.MeshAdvertiseTransport,
MeshConnectivityMode: profile.MeshConnectivityMode,
MeshNATType: profile.MeshNATType,
MeshSiteID: profile.MeshSiteID,
MeshLocalityGroupID: firstNonEmpty(profile.MeshLocalityGroupID, profile.MeshSiteID),
MeshNATGroupID: profile.MeshNATGroupID,
MeshRegion: profile.MeshRegion,
HeartbeatIntervalSeconds: profile.HeartbeatIntervalSeconds,
EnrollmentPollIntervalSeconds: profile.EnrollmentPollIntervalSeconds,
@@ -152,15 +158,16 @@ func (m LinuxManager) Install(ctx context.Context, cfg LinuxInstallConfig) (Linu
cfg.StartupMode = strings.ToLower(firstNonEmpty(cfg.StartupMode, "systemd"))
unitName := "rap-node-agent-" + slug + ".service"
result := LinuxInstallResult{
NodeName: cfg.RuntimeConfig.NodeName,
InstallDir: cfg.InstallDir,
StateDir: cfg.StateDir,
ConfigDir: cfg.ConfigDir,
NodeAgentPath: filepath.Join(cfg.InstallDir, "rap-node-agent"),
HostAgentPath: filepath.Join(cfg.InstallDir, "rap-host-agent"),
EnvPath: filepath.Join(cfg.ConfigDir, "rap-node-agent.env"),
UnitName: unitName,
UnitPath: filepath.Join(cfg.UnitDir, unitName),
NodeName: cfg.RuntimeConfig.NodeName,
InstallDir: cfg.InstallDir,
StateDir: cfg.StateDir,
ConfigDir: cfg.ConfigDir,
NodeAgentPath: filepath.Join(cfg.InstallDir, "rap-node-agent"),
HostAgentPath: filepath.Join(cfg.InstallDir, "rap-host-agent"),
EnvPath: filepath.Join(cfg.ConfigDir, "rap-node-agent.env"),
UpdaterEnvPath: filepath.Join(cfg.ConfigDir, "rap-host-agent-updater.env"),
UnitName: unitName,
UnitPath: filepath.Join(cfg.UnitDir, unitName),
}
if cfg.DryRun {
return result, nil
@@ -273,7 +280,7 @@ func installLinuxHostAgentUpdater(ctx context.Context, m LinuxManager, result Li
}
interval := cfg.AutoUpdateIntervalSeconds
if interval == 0 {
interval = 21600
interval = DefaultUpdateIntervalSec
}
initialDelay := cfg.AutoUpdateInitialDelaySeconds
if initialDelay == 0 {
@@ -301,16 +308,16 @@ func installLinuxHostAgentUpdater(ctx context.Context, m LinuxManager, result Li
"--host-agent-current-version", firstNonEmpty(cfg.AutoUpdateCurrentVersion, "0.0.0"),
"--host-agent-binary-path", result.HostAgentPath,
}
if strings.TrimSpace(cfg.RuntimeConfig.BackendURL) != "" {
args = append(args, "--backend-url", strings.TrimSpace(cfg.RuntimeConfig.BackendURL))
}
args = appendFabricUpdateArgs(args, cfg.RuntimeConfig)
args = appendFabricUpdateArgs(args, cfg.RuntimeConfig, false)
if strings.TrimSpace(cfg.NodeID) != "" {
args = append(args, "--node-id", strings.TrimSpace(cfg.NodeID))
}
if strings.TrimSpace(cfg.AutoUpdateChannel) != "" {
args = append(args, "--channel", strings.TrimSpace(cfg.AutoUpdateChannel))
}
if err := os.WriteFile(result.UpdaterEnvPath, []byte(linuxHostAgentUpdaterEnvFile(cfg.RuntimeConfig)), 0o600); err != nil {
return result, err
}
unitName := "rap-host-agent-updater-" + safeUnitSlug(result.NodeName) + ".service"
unitPath := filepath.Join(firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir), unitName)
unit := fmt.Sprintf(`[Unit]
@@ -320,13 +327,14 @@ Wants=network-online.target
[Service]
Type=simple
EnvironmentFile=%s
ExecStart=%s
Restart=always
RestartSec=30
[Install]
WantedBy=multi-user.target
`, result.NodeName, result.UnitName, systemdJoin(args))
`, result.NodeName, result.UnitName, systemdQuote(result.UpdaterEnvPath), systemdJoin(args))
if err := os.WriteFile(unitPath, []byte(unit), 0o644); err != nil {
return result, err
}
@@ -359,12 +367,22 @@ func (m LinuxManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Updat
result := UpdateResult{Action: plan.Action, Reason: plan.Reason, TargetVersion: plan.TargetVersion, ContainerName: req.SystemdUnitName, NewImage: req.BinaryPath}
if plan.Action != "update" {
if !req.DryRun {
restarted, err := rewriteLinuxControlPlaneRuntime(ctx, m.runner(), req, plan)
if err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "rewrite_runtime", "failed", err))
return result, err
}
result.RestartNeeded = restarted
}
if !req.DryRun {
_ = saveUpdatePlanState(req, plan, req.CurrentVersion, req.SystemdUnitName, req.BinaryPath)
status := statusFromNoopPlan(req, plan)
if status.Payload == nil {
status.Payload = map[string]any{}
}
status.Payload["systemd_unit"] = req.SystemdUnitName
status.Payload["binary_path"] = req.BinaryPath
status.Payload["restart_needed"] = result.RestartNeeded
_ = ReportNodeUpdateStatusForRequest(ctx, req, status)
}
return result, nil
@@ -387,14 +405,14 @@ func (m LinuxManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Updat
if req.DryRun {
return result, nil
}
urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL)
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "download", Status: "started", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": req.BinaryPath}})
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "download", Status: "started", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"artifact_id": plan.Artifact.ID, "binary_path": req.BinaryPath, "transport": updateArtifactTransport(req, plan)}})
path, distributors, err := downloadUpdateArtifact(ctx, req, plan)
if err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "download", "failed", err))
return result, err
}
defer os.Remove(path)
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "download", Status: "succeeded", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"artifact_id": plan.Artifact.ID, "binary_path": req.BinaryPath, "fabric_distributors": distributors, "transport": updateArtifactTransport(req, plan)}})
runner := m.runner()
_, _ = runner.Run(ctx, "systemctl", "stop", req.SystemdUnitName)
if err := copyFile(path, req.BinaryPath, 0o755); err != nil {
@@ -402,15 +420,183 @@ func (m LinuxManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Updat
return result, err
}
result.Replaced = true
if _, err := runner.Run(ctx, "systemctl", "restart", req.SystemdUnitName); err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "restart", "failed", err))
restartedByRewrite, err := rewriteLinuxControlPlaneRuntime(ctx, runner, req, plan)
if err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "rewrite_runtime", "failed", err))
return result, err
}
result.RestartNeeded = restartedByRewrite
if !restartedByRewrite {
if _, err := runner.Run(ctx, "systemctl", "restart", req.SystemdUnitName); err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "restart", "failed", err))
return result, err
}
}
if err := ensureLinuxUnitActive(ctx, runner, req.SystemdUnitName); err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "health_check", "failed", err))
return result, err
}
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "health_check", Status: "succeeded", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"systemd_unit": req.SystemdUnitName, "binary_path": req.BinaryPath}})
_ = saveUpdateState(req.StateDir, UpdateState{Product: req.Product, CurrentVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion, Image: req.BinaryPath, UpdatedAt: time.Now().UTC()})
_ = saveUpdatePlanState(req, plan, plan.TargetVersion, req.SystemdUnitName, req.BinaryPath)
return result, nil
}
// linuxHostAgentUpdaterEnvFile renders the environment-file contents for the
// host-agent updater from cfg.
//
// A single RAP_FABRIC_REGISTRY_RECORDS_JSON line (newline-terminated) is
// produced when cfg.FabricRegistryRecordsJSON is non-blank; otherwise the
// result is the empty string.
func linuxHostAgentUpdaterEnvFile(cfg RuntimeConfig) string {
	var body strings.Builder
	registry := strings.TrimSpace(cfg.FabricRegistryRecordsJSON)
	if registry != "" {
		body.WriteString("RAP_FABRIC_REGISTRY_RECORDS_JSON=")
		body.WriteString(systemdQuote(registry))
		body.WriteString("\n")
	}
	return body.String()
}
// ensureLinuxUnitActive asks systemctl whether unitName is active.
//
// A blank unit name is treated as "nothing to check" and yields nil. Any
// error from the command runner is returned unchanged; a reported state other
// than "active" is turned into a descriptive error.
func ensureLinuxUnitActive(ctx context.Context, runner CommandRunner, unitName string) error {
	unit := strings.TrimSpace(unitName)
	if unit == "" {
		return nil
	}
	output, err := runner.Run(ctx, "systemctl", "is-active", unit)
	state := strings.TrimSpace(output)
	switch {
	case err != nil:
		return err
	case state != "active":
		return fmt.Errorf("systemd unit %s is not active: %s", unit, state)
	}
	return nil
}
// rewriteLinuxControlPlaneRuntime persists the control-plane runtime state
// carried by plan and propagates the plan's fabric registry records into the
// node-agent and host-agent-updater environment files. When the updater unit
// file exists it is also rewritten so that it loads the updater environment
// file, followed by a daemon-reload and a restart of the updater unit.
//
// The boolean result reports whether the node-agent unit (req.SystemdUnitName)
// was successfully restarted by this function. Callers use it to decide
// whether they still have to restart the unit themselves, so it is true only
// when a restart was actually issued and did not fail. Previously the final
// path returned true even when only the updater unit had changed, which left
// an already stopped node-agent unit without any restart.
//
// An error is returned when reading or writing an environment/unit file fails
// or when "systemctl daemon-reload" fails.
func rewriteLinuxControlPlaneRuntime(ctx context.Context, runner CommandRunner, req UpdateRequest, plan NodeUpdatePlan) (bool, error) {
	// Best effort: a failure to persist the runtime state must not abort the update.
	_ = saveControlPlaneRuntimeState(req.StateDir, ControlPlaneRuntimeState{
		SchemaVersion:         "rap.control_plane_runtime_state.v1",
		ClusterID:             strings.TrimSpace(plan.ClusterID),
		NodeID:                strings.TrimSpace(plan.NodeID),
		Product:               strings.TrimSpace(plan.Product),
		FabricRegistryRecords: append(json.RawMessage(nil), plan.FabricRegistryRecords...),
		AuthorityPayload:      append(json.RawMessage(nil), plan.AuthorityPayload...),
		AuthoritySignature:    append(json.RawMessage(nil), plan.AuthoritySignature...),
		AuthorityQuorum:       plan.AuthorityQuorum,
		UpdatedAt:             time.Now().UTC(),
	})

	// The per-node slug is the unit name without the ".service" suffix and
	// the "rap-node-agent-" prefix; without it no config paths can be derived.
	slug := strings.TrimSuffix(strings.TrimSpace(req.SystemdUnitName), ".service")
	slug = strings.TrimPrefix(slug, "rap-node-agent-")
	if slug == "" {
		return false, nil
	}

	envChanged := false
	// restartAgentIfEnvChanged restarts the node-agent unit when one of the
	// environment files was modified and reports whether that restart
	// succeeded. A failed restart is reported as "not restarted" so the
	// caller retries it and surfaces the error.
	restartAgentIfEnvChanged := func() bool {
		if !envChanged || strings.TrimSpace(req.SystemdUnitName) == "" {
			return false
		}
		_, err := runner.Run(ctx, "systemctl", "restart", req.SystemdUnitName)
		return err == nil
	}

	// Node-agent env file: only updated in place, never created here.
	envPath := filepath.Join(DefaultLinuxConfigRoot, slug, "rap-node-agent.env")
	wantRegistry := strings.TrimSpace(string(plan.FabricRegistryRecords))
	if wantRegistry != "" && fileExists(envPath) {
		current, err := os.ReadFile(envPath)
		if err != nil {
			return false, err
		}
		updatedEnv := upsertEnvFileValue(string(current), "RAP_FABRIC_REGISTRY_RECORDS_JSON", wantRegistry)
		if updatedEnv != string(current) {
			if err := os.WriteFile(envPath, []byte(updatedEnv), 0o600); err != nil {
				return false, err
			}
			envChanged = true
		}
	}

	// Updater env file: created on demand when registry records are present.
	updaterUnitName := "rap-host-agent-updater-" + safeUnitSlug(slug) + ".service"
	updaterUnitPath := filepath.Join(DefaultSystemdUnitDir, updaterUnitName)
	updaterEnvPath := filepath.Join(DefaultLinuxConfigRoot, slug, "rap-host-agent-updater.env")
	if wantRegistry != "" {
		current := ""
		if fileExists(updaterEnvPath) {
			payload, err := os.ReadFile(updaterEnvPath)
			if err != nil {
				return false, err
			}
			current = string(payload)
		}
		updatedEnv := upsertEnvFileValue(current, "RAP_FABRIC_REGISTRY_RECORDS_JSON", wantRegistry)
		if updatedEnv != current {
			if err := os.MkdirAll(filepath.Dir(updaterEnvPath), 0o755); err != nil {
				return false, err
			}
			if err := os.WriteFile(updaterEnvPath, []byte(updatedEnv), 0o600); err != nil {
				return false, err
			}
			envChanged = true
		}
	}

	// Without registry records, or without an installed updater unit, there
	// is no unit file to rewrite.
	if wantRegistry == "" || !fileExists(updaterUnitPath) {
		return restartAgentIfEnvChanged(), nil
	}

	current, err := os.ReadFile(updaterUnitPath)
	if err != nil {
		return false, err
	}
	// NOTE(review): replaceCLIArg presumably strips the inline
	// --fabric-registry-records-json flag so the value comes from the
	// environment file instead — confirm against its definition.
	updated := ensureSystemdEnvironmentFile(replaceCLIArg(string(current), "--fabric-registry-records-json", "", false), updaterEnvPath)
	if updated == string(current) {
		return restartAgentIfEnvChanged(), nil
	}
	if err := os.WriteFile(updaterUnitPath, []byte(updated), 0o644); err != nil {
		return false, err
	}
	if _, err := runner.Run(ctx, "systemctl", "daemon-reload"); err != nil {
		return false, err
	}
	// Best effort: the updater restart result does not affect the outcome.
	// NOTE(review): if this code runs inside the updater unit itself, this
	// restart may terminate the process before the node-agent restart below.
	_, _ = runner.Run(ctx, "systemctl", "restart", updaterUnitName)
	return restartAgentIfEnvChanged(), nil
}
// ensureSystemdEnvironmentFile returns unit with an EnvironmentFile= directive
// for envPath inserted, unless the unit already contains one or envPath is
// blank.
//
// The directive is placed directly after the first "Type=simple" line, or,
// failing that, directly after the first "[Service]" header. When neither
// anchor is present the unit text is returned unchanged.
func ensureSystemdEnvironmentFile(unit string, envPath string) string {
	target := strings.TrimSpace(envPath)
	if target == "" {
		return unit
	}
	if strings.Contains(unit, "EnvironmentFile=") {
		return unit
	}
	directive := "EnvironmentFile=" + systemdQuote(target) + "\n"
	for _, anchor := range []string{"Type=simple\n", "[Service]\n"} {
		if at := strings.Index(unit, anchor); at >= 0 {
			cut := at + len(anchor)
			return unit[:cut] + directive + unit[cut:]
		}
	}
	return unit
}
// upsertEnvFileValue sets key to value inside the environment-file text
// payload and returns the resulting text.
//
// The first line whose trimmed content starts with "key=" is replaced by
// key=<quoted value>; when value is empty that line is removed instead. If no
// such line exists and value is non-empty, the entry is appended as its own
// newline-terminated line. A payload without the key and an empty value is
// returned unchanged.
//
// Appending never introduces blank lines: an existing trailing newline is
// reused and a missing one is added before the new entry, so an empty payload
// yields exactly "key=<quoted value>\n".
func upsertEnvFileValue(payload string, key string, value string) string {
	prefix := key + "="
	lines := strings.Split(payload, "\n")
	for i, line := range lines {
		// TrimSpace also drops a trailing "\r" left over from CRLF files.
		if !strings.HasPrefix(strings.TrimSpace(line), prefix) {
			continue
		}
		if value == "" {
			lines = append(lines[:i], lines[i+1:]...)
		} else {
			lines[i] = prefix + systemdQuote(value)
		}
		return strings.Join(lines, "\n")
	}
	if value == "" {
		return payload
	}
	entry := prefix + systemdQuote(value) + "\n"
	if payload == "" || strings.HasSuffix(payload, "\n") {
		return payload + entry
	}
	return payload + "\n" + entry
}
func (m LinuxManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig) error {
req := cfg.Request
req.InstallType = firstNonEmpty(req.InstallType, BinaryUpdateInstallType)
@@ -421,6 +607,9 @@ func (m LinuxManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig) e
}
func runLinuxUpdateLoop(ctx context.Context, m LinuxManager, cfg UpdateLoopConfig) error {
if err := ReconcileSignedUpdateState(cfg.Request.StateDir); err != nil {
return err
}
if cfg.Interval == 0 {
cfg.Interval = time.Hour
}
@@ -450,6 +639,7 @@ func runLinuxUpdateLoop(ctx context.Context, m LinuxManager, cfg UpdateLoopConfi
continue
} else {
logf("linux_update_loop run=%d status=failed error=%v", runs, err)
saveUpdateLoopRescueState(cfg.Request, "linux_node_agent_update_failed", err)
if cfg.StopOnError {
return err
}
@@ -462,10 +652,12 @@ func runLinuxUpdateLoop(ctx context.Context, m LinuxManager, cfg UpdateLoopConfi
}
if cfg.HostAgentUpdateEnabled {
hostReq := cfg.HostAgentUpdateRequest
hostReq.BackendURL = firstNonEmpty(hostReq.BackendURL, cfg.Request.BackendURL)
hostReq.ClusterID = firstNonEmpty(hostReq.ClusterID, cfg.Request.ClusterID)
hostReq.NodeID = firstNonEmpty(hostReq.NodeID, cfg.Request.NodeID)
hostReq.StateDir = firstNonEmpty(hostReq.StateDir, cfg.Request.StateDir)
hostReq.ClusterAuthorityPublicKey = firstNonEmpty(hostReq.ClusterAuthorityPublicKey, cfg.Request.ClusterAuthorityPublicKey)
hostReq.FabricRegistryRecordsJSON = firstNonEmpty(hostReq.FabricRegistryRecordsJSON, cfg.Request.FabricRegistryRecordsJSON)
hostReq.MeshRegion = firstNonEmpty(hostReq.MeshRegion, cfg.Request.MeshRegion)
hostReq.Channel = firstNonEmpty(hostReq.Channel, cfg.Request.Channel)
hostReq.OS = firstNonEmpty(hostReq.OS, "linux")
hostReq.Arch = firstNonEmpty(hostReq.Arch, runtime.GOARCH)
@@ -473,6 +665,7 @@ func runLinuxUpdateLoop(ctx context.Context, m LinuxManager, cfg UpdateLoopConfi
hostResult, hostErr := (DockerManager{}).ApplyHostAgentUpdate(ctx, hostReq)
if hostErr != nil {
logf("linux_host_agent_update_loop run=%d status=failed error=%v", runs, hostErr)
saveUpdateLoopRescueState(cfg.Request, "linux_host_agent_update_failed", hostErr)
} else {
logf("linux_host_agent_update_loop run=%d action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t", runs, hostResult.Action, hostResult.Reason, hostResult.TargetVersion, hostResult.NewImage, hostResult.Replaced, hostResult.RestartNeeded)
if hostResult.Action == "update" && hostResult.TargetVersion != "" && !hostResult.RolledBack {
@@ -31,7 +31,6 @@ const (
)
type MonitorConfig struct {
BackendURL string
ClusterID string
NodeID string
StateDir string
@@ -198,7 +197,6 @@ func RunMonitorOnce(ctx context.Context, cfg MonitorConfig) MonitorResult {
}
func normalizeMonitorConfig(cfg MonitorConfig) MonitorConfig {
cfg.BackendURL = strings.TrimRight(strings.TrimSpace(cfg.BackendURL), "/")
cfg.ClusterID = strings.TrimSpace(cfg.ClusterID)
cfg.NodeID = strings.TrimSpace(cfg.NodeID)
cfg.StateDir = strings.TrimSpace(cfg.StateDir)
@@ -398,7 +396,7 @@ func reportMonitorStatus(ctx context.Context, cfg MonitorConfig, result MonitorR
}
return err
}
if cfg.BackendURL == "" || clusterID == "" || nodeID == "" {
if strings.TrimSpace(cfg.FabricRegistryRecordsJSON) == "" || strings.TrimSpace(cfg.ClusterAuthorityPublicKey) == "" || clusterID == "" || nodeID == "" {
return nil
}
payload := map[string]any{
@@ -425,7 +423,6 @@ func reportMonitorStatus(ctx context.Context, cfg MonitorConfig, result MonitorR
req.ErrorMessage = &errText
}
return ReportNodeUpdateStatusForRequest(ctx, UpdateRequest{
BackendURL: cfg.BackendURL,
ClusterID: clusterID,
NodeID: nodeID,
StateDir: cfg.StateDir,
@@ -2,19 +2,39 @@ package hostagent
import (
"bytes"
"context"
"crypto/ed25519"
"encoding/base64"
"encoding/json"
"fmt"
"net/http"
"os"
"path/filepath"
"strings"
"time"
clusterauth "github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority"
)
// trimProfileEndpointSlice normalizes a list of endpoint strings: each item is
// stripped of surrounding whitespace and trailing slashes, empty results are
// dropped, and duplicates are removed while keeping first-seen order.
//
// The returned slice is always non-nil.
func trimProfileEndpointSlice(items []string) []string {
	unique := make([]string, 0, len(items))
	known := make(map[string]struct{})
	for i := range items {
		endpoint := strings.TrimRight(strings.TrimSpace(items[i]), "/")
		_, duplicate := known[endpoint]
		if endpoint == "" || duplicate {
			continue
		}
		known[endpoint] = struct{}{}
		unique = append(unique, endpoint)
	}
	return unique
}
type DockerInstallProfile struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
ClusterAuthorityPublicKey string `json:"cluster_authority_public_key"`
ArtifactEndpoints []string `json:"artifact_endpoints"`
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records"`
DockerImageArtifact *DockerArtifact `json:"docker_image_artifact"`
@@ -29,7 +49,7 @@ type DockerInstallProfile struct {
Replace bool `json:"replace"`
DockerVPNGatewayEnabled bool `json:"docker_vpn_gateway_enabled"`
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
FabricRuntimeEnabled bool `json:"fabric_runtime_enabled"`
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
VPNFabricSessionTransportEnabled bool `json:"vpn_fabric_session_transport_enabled"`
MeshQUICFabricEnabled bool `json:"mesh_quic_fabric_enabled"`
@@ -37,15 +57,18 @@ type DockerInstallProfile struct {
VPNFabricSessionStreamShards int `json:"vpn_fabric_session_stream_shards"`
VPNFabricQUICMaxStreamsPerConn int `json:"vpn_fabric_quic_max_streams_per_conn"`
VPNFabricQUICIdleTTLSeconds int `json:"vpn_fabric_quic_idle_ttl_seconds"`
MeshListenAddr string `json:"mesh_listen_addr"`
MeshListenPortMode string `json:"mesh_listen_port_mode"`
MeshListenAutoPortStart int `json:"mesh_listen_auto_port_start"`
MeshListenAutoPortEnd int `json:"mesh_listen_auto_port_end"`
FabricListenAddr string `json:"fabric_listen_addr"`
FabricListenPortMode string `json:"fabric_listen_port_mode"`
FabricListenAutoPortStart int `json:"fabric_listen_auto_port_start"`
FabricListenAutoPortEnd int `json:"fabric_listen_auto_port_end"`
MeshAdvertiseEndpoint string `json:"mesh_advertise_endpoint"`
MeshAdvertiseEndpointsJSON json.RawMessage `json:"mesh_advertise_endpoints_json"`
MeshAdvertiseTransport string `json:"mesh_advertise_transport"`
MeshConnectivityMode string `json:"mesh_connectivity_mode"`
MeshNATType string `json:"mesh_nat_type"`
MeshSiteID string `json:"mesh_site_id"`
MeshLocalityGroupID string `json:"mesh_locality_group_id"`
MeshNATGroupID string `json:"mesh_nat_group_id"`
MeshRegion string `json:"mesh_region"`
HeartbeatIntervalSeconds int `json:"heartbeat_interval_seconds"`
EnrollmentPollIntervalSeconds int `json:"enrollment_poll_interval_seconds"`
@@ -67,8 +90,7 @@ type DockerArtifact struct {
type WindowsInstallProfile struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
ClusterAuthorityPublicKey string `json:"cluster_authority_public_key"`
ArtifactEndpoints []string `json:"artifact_endpoints"`
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records"`
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact"`
@@ -78,7 +100,7 @@ type WindowsInstallProfile struct {
InstallDir string `json:"install_dir"`
StartupMode string `json:"startup_mode"`
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
FabricRuntimeEnabled bool `json:"fabric_runtime_enabled"`
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
VPNFabricSessionTransportEnabled bool `json:"vpn_fabric_session_transport_enabled"`
MeshQUICFabricEnabled bool `json:"mesh_quic_fabric_enabled"`
@@ -86,15 +108,18 @@ type WindowsInstallProfile struct {
VPNFabricSessionStreamShards int `json:"vpn_fabric_session_stream_shards"`
VPNFabricQUICMaxStreamsPerConn int `json:"vpn_fabric_quic_max_streams_per_conn"`
VPNFabricQUICIdleTTLSeconds int `json:"vpn_fabric_quic_idle_ttl_seconds"`
MeshListenAddr string `json:"mesh_listen_addr"`
MeshListenPortMode string `json:"mesh_listen_port_mode"`
MeshListenAutoPortStart int `json:"mesh_listen_auto_port_start"`
MeshListenAutoPortEnd int `json:"mesh_listen_auto_port_end"`
FabricListenAddr string `json:"fabric_listen_addr"`
FabricListenPortMode string `json:"fabric_listen_port_mode"`
FabricListenAutoPortStart int `json:"fabric_listen_auto_port_start"`
FabricListenAutoPortEnd int `json:"fabric_listen_auto_port_end"`
MeshAdvertiseEndpoint string `json:"mesh_advertise_endpoint"`
MeshAdvertiseEndpointsJSON json.RawMessage `json:"mesh_advertise_endpoints_json"`
MeshAdvertiseTransport string `json:"mesh_advertise_transport"`
MeshConnectivityMode string `json:"mesh_connectivity_mode"`
MeshNATType string `json:"mesh_nat_type"`
MeshSiteID string `json:"mesh_site_id"`
MeshLocalityGroupID string `json:"mesh_locality_group_id"`
MeshNATGroupID string `json:"mesh_nat_group_id"`
MeshRegion string `json:"mesh_region"`
HeartbeatIntervalSeconds int `json:"heartbeat_interval_seconds"`
EnrollmentPollIntervalSeconds int `json:"enrollment_poll_interval_seconds"`
@@ -106,8 +131,7 @@ type WindowsInstallProfile struct {
type LinuxInstallProfile struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
ClusterAuthorityPublicKey string `json:"cluster_authority_public_key"`
ArtifactEndpoints []string `json:"artifact_endpoints"`
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records"`
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact"`
@@ -117,7 +141,7 @@ type LinuxInstallProfile struct {
InstallDir string `json:"install_dir"`
StartupMode string `json:"startup_mode"`
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
FabricRuntimeEnabled bool `json:"fabric_runtime_enabled"`
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
VPNFabricSessionTransportEnabled bool `json:"vpn_fabric_session_transport_enabled"`
MeshQUICFabricEnabled bool `json:"mesh_quic_fabric_enabled"`
@@ -125,15 +149,18 @@ type LinuxInstallProfile struct {
VPNFabricSessionStreamShards int `json:"vpn_fabric_session_stream_shards"`
VPNFabricQUICMaxStreamsPerConn int `json:"vpn_fabric_quic_max_streams_per_conn"`
VPNFabricQUICIdleTTLSeconds int `json:"vpn_fabric_quic_idle_ttl_seconds"`
MeshListenAddr string `json:"mesh_listen_addr"`
MeshListenPortMode string `json:"mesh_listen_port_mode"`
MeshListenAutoPortStart int `json:"mesh_listen_auto_port_start"`
MeshListenAutoPortEnd int `json:"mesh_listen_auto_port_end"`
FabricListenAddr string `json:"fabric_listen_addr"`
FabricListenPortMode string `json:"fabric_listen_port_mode"`
FabricListenAutoPortStart int `json:"fabric_listen_auto_port_start"`
FabricListenAutoPortEnd int `json:"fabric_listen_auto_port_end"`
MeshAdvertiseEndpoint string `json:"mesh_advertise_endpoint"`
MeshAdvertiseEndpointsJSON json.RawMessage `json:"mesh_advertise_endpoints_json"`
MeshAdvertiseTransport string `json:"mesh_advertise_transport"`
MeshConnectivityMode string `json:"mesh_connectivity_mode"`
MeshNATType string `json:"mesh_nat_type"`
MeshSiteID string `json:"mesh_site_id"`
MeshLocalityGroupID string `json:"mesh_locality_group_id"`
MeshNATGroupID string `json:"mesh_nat_group_id"`
MeshRegion string `json:"mesh_region"`
HeartbeatIntervalSeconds int `json:"heartbeat_interval_seconds"`
EnrollmentPollIntervalSeconds int `json:"enrollment_poll_interval_seconds"`
@@ -143,152 +170,188 @@ type LinuxInstallProfile struct {
}
type ProfileRequest struct {
URL string
ClusterID string
InstallToken string
NodeName string
HTTPClient *http.Client
ClusterID string
NodeName string
}
func FetchDockerInstallProfile(ctx context.Context, req ProfileRequest) (DockerInstallProfile, error) {
url := strings.TrimRight(strings.TrimSpace(req.URL), "/")
if url == "" || strings.TrimSpace(req.InstallToken) == "" {
return DockerInstallProfile{}, fmt.Errorf("profile-url and install-token are required")
}
if !strings.HasSuffix(url, "/node-agents/docker-install-profile") {
url += "/node-agents/docker-install-profile"
}
body, err := json.Marshal(map[string]string{
"cluster_id": strings.TrimSpace(req.ClusterID),
"install_token": strings.TrimSpace(req.InstallToken),
"node_name": strings.TrimSpace(req.NodeName),
})
if err != nil {
return DockerInstallProfile{}, err
}
httpClient := req.HTTPClient
if httpClient == nil {
httpClient = &http.Client{Timeout: 20 * time.Second}
}
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
if err != nil {
return DockerInstallProfile{}, err
}
httpReq.Header.Set("Content-Type", "application/json")
resp, err := httpClient.Do(httpReq)
if err != nil {
return DockerInstallProfile{}, err
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return DockerInstallProfile{}, fmt.Errorf("fetch docker install profile: %s", resp.Status)
}
var envelope struct {
Profile DockerInstallProfile `json:"docker_install_profile"`
}
if err := json.NewDecoder(resp.Body).Decode(&envelope); err != nil {
return DockerInstallProfile{}, err
}
if strings.TrimSpace(envelope.Profile.BackendURL) == "" && len(envelope.Profile.ControlPlaneEndpoints) > 0 {
envelope.Profile.BackendURL = envelope.Profile.ControlPlaneEndpoints[0]
}
return envelope.Profile, nil
// JoinBundle groups the per-platform install profiles a join bundle can carry.
// Every profile is optional: a nil pointer is omitted from the JSON encoding.
type JoinBundle struct {
// DockerInstallProfile is the profile stored under "docker_install_profile".
DockerInstallProfile  *DockerInstallProfile  `json:"docker_install_profile,omitempty"`
// WindowsInstallProfile is the profile stored under "windows_install_profile".
WindowsInstallProfile *WindowsInstallProfile `json:"windows_install_profile,omitempty"`
// LinuxInstallProfile is the profile stored under "linux_install_profile".
LinuxInstallProfile   *LinuxInstallProfile   `json:"linux_install_profile,omitempty"`
}
func FetchWindowsInstallProfile(ctx context.Context, req ProfileRequest) (WindowsInstallProfile, error) {
url := strings.TrimRight(strings.TrimSpace(req.URL), "/")
if url == "" || strings.TrimSpace(req.InstallToken) == "" {
return WindowsInstallProfile{}, fmt.Errorf("profile-url and install-token are required")
}
if !strings.HasSuffix(url, "/node-agents/windows-install-profile") {
url += "/node-agents/windows-install-profile"
}
body, err := json.Marshal(map[string]string{
"cluster_id": strings.TrimSpace(req.ClusterID),
"install_token": strings.TrimSpace(req.InstallToken),
"node_name": strings.TrimSpace(req.NodeName),
})
if err != nil {
return WindowsInstallProfile{}, err
}
httpClient := req.HTTPClient
if httpClient == nil {
httpClient = &http.Client{Timeout: 20 * time.Second}
}
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
if err != nil {
return WindowsInstallProfile{}, err
}
httpReq.Header.Set("Content-Type", "application/json")
resp, err := httpClient.Do(httpReq)
if err != nil {
return WindowsInstallProfile{}, err
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return WindowsInstallProfile{}, fmt.Errorf("fetch windows install profile: %s", resp.Status)
}
var envelope struct {
Profile WindowsInstallProfile `json:"windows_install_profile"`
}
if err := json.NewDecoder(resp.Body).Decode(&envelope); err != nil {
return WindowsInstallProfile{}, err
}
if strings.TrimSpace(envelope.Profile.BackendURL) == "" && len(envelope.Profile.ControlPlaneEndpoints) > 0 {
envelope.Profile.BackendURL = envelope.Profile.ControlPlaneEndpoints[0]
}
return envelope.Profile, nil
// ClusterAuthorityDescriptor is the JSON shape of the "cluster_authority"
// section of a join bundle envelope. It identifies the cluster and the public
// key against which the bundle's authority signature is verified.
type ClusterAuthorityDescriptor struct {
SchemaVersion        string    `json:"schema_version"`
// ClusterID must match the envelope-level cluster_id during verification.
ClusterID            string    `json:"cluster_id"`
AuthorityState       string    `json:"authority_state"`
KeyAlgorithm         string    `json:"key_algorithm"`
// PublicKey is the authority public key; when a fingerprint is present it
// is decoded as standard base64 and must have ed25519 public-key length.
PublicKey            string    `json:"public_key"`
// PublicKeyFingerprint is optional; when set it must match the fingerprint
// computed from PublicKey.
PublicKeyFingerprint string    `json:"public_key_fingerprint"`
CreatedAt            time.Time `json:"created_at"`
UpdatedAt            time.Time `json:"updated_at"`
}
func FetchLinuxInstallProfile(ctx context.Context, req ProfileRequest) (LinuxInstallProfile, error) {
url := strings.TrimRight(strings.TrimSpace(req.URL), "/")
if url == "" || strings.TrimSpace(req.InstallToken) == "" {
return LinuxInstallProfile{}, fmt.Errorf("profile-url and install-token are required")
// ClusterSignature is the JSON shape of the "authority_signature" section of a
// join bundle envelope. Its SchemaVersion, Algorithm, KeyFingerprint and
// Signature fields are copied into a clusterauth.Signature for verification;
// SignedAt is carried in the JSON but not part of that copy.
type ClusterSignature struct {
SchemaVersion  string    `json:"schema_version"`
Algorithm      string    `json:"algorithm"`
KeyFingerprint string    `json:"key_fingerprint"`
Signature      string    `json:"signature"`
SignedAt       time.Time `json:"signed_at"`
}
// joinBundleEnvelope is the signed wrapper decoded from a join bundle file
// before any install profile inside it is trusted.
type joinBundleEnvelope struct {
SchemaVersion      string                      `json:"schema_version,omitempty"`
BundleKind         string                      `json:"bundle_kind,omitempty"`
// ClusterID is compared with the authority's and the profile's cluster ID.
ClusterID          string                      `json:"cluster_id,omitempty"`
// ClusterAuthority describes the key used to check AuthoritySignature.
ClusterAuthority   *ClusterAuthorityDescriptor `json:"cluster_authority,omitempty"`
// AuthorityPayload is the raw signed JSON; it is decoded as an object keyed
// by profile name (for example "linux_install_profile").
AuthorityPayload   json.RawMessage             `json:"authority_payload,omitempty"`
// AuthoritySignature is the signature over AuthorityPayload.
AuthoritySignature *ClusterSignature           `json:"authority_signature,omitempty"`
}
// joinBundleProfileIdentity is the minimal view of an install profile needed
// to cross-check it against the envelope: the cluster it belongs to and the
// authority public key it pins.
type joinBundleProfileIdentity struct {
ClusterID                 string `json:"cluster_id"`
ClusterAuthorityPublicKey string `json:"cluster_authority_public_key"`
}
func LoadDockerJoinBundle(path string) (DockerInstallProfile, error) {
var profile DockerInstallProfile
if err := loadJoinBundleProfile(path, "docker_install_profile", &profile); err != nil {
return DockerInstallProfile{}, err
}
if !strings.HasSuffix(url, "/node-agents/linux-install-profile") {
url += "/node-agents/linux-install-profile"
return profile, nil
}
func LoadWindowsJoinBundle(path string) (WindowsInstallProfile, error) {
var profile WindowsInstallProfile
if err := loadJoinBundleProfile(path, "windows_install_profile", &profile); err != nil {
return WindowsInstallProfile{}, err
}
body, err := json.Marshal(map[string]string{
"cluster_id": strings.TrimSpace(req.ClusterID),
"install_token": strings.TrimSpace(req.InstallToken),
"node_name": strings.TrimSpace(req.NodeName),
})
return profile, nil
}
// LoadLinuxJoinBundle loads the join bundle stored at path through
// loadJoinBundleProfile and returns its "linux_install_profile" section.
//
// On any failure the zero LinuxInstallProfile is returned together with the
// error.
func LoadLinuxJoinBundle(path string) (LinuxInstallProfile, error) {
	loaded := new(LinuxInstallProfile)
	err := loadJoinBundleProfile(path, "linux_install_profile", loaded)
	if err != nil {
		return LinuxInstallProfile{}, err
	}
	return *loaded, nil
}
// SaveJoinBundle writes the raw join bundle bytes to path.
//
// Surrounding whitespace in path is ignored and a blank path is rejected.
// Missing parent directories are created with mode 0700 and the file itself
// is written with mode 0600.
func SaveJoinBundle(path string, raw []byte) error {
	target := strings.TrimSpace(path)
	if target == "" {
		return fmt.Errorf("join-bundle path is required")
	}
	parent := filepath.Dir(target)
	if mkErr := os.MkdirAll(parent, 0o700); mkErr != nil {
		return mkErr
	}
	return os.WriteFile(target, raw, 0o600)
}
func loadJoinBundleProfile(path, key string, target any) error {
path = strings.TrimSpace(path)
if path == "" {
return fmt.Errorf("join-bundle is required")
}
payload, err := os.ReadFile(path)
if err != nil {
return LinuxInstallProfile{}, err
return err
}
httpClient := req.HTTPClient
if httpClient == nil {
httpClient = &http.Client{Timeout: 20 * time.Second}
_, err = parseJoinBundleProfileBytes(payload, key, target)
return err
}
func parseJoinBundleProfileBytes(payload []byte, key string, target any) ([]byte, error) {
var envelopeMap map[string]json.RawMessage
if err := json.Unmarshal(payload, &envelopeMap); err == nil {
profileRaw := envelopeMap[key]
if len(bytes.TrimSpace(profileRaw)) > 0 {
if err := verifyJoinBundleEnvelope(payload, key, profileRaw); err != nil {
return nil, err
}
if err := json.Unmarshal(profileRaw, target); err != nil {
return nil, err
}
return profileRaw, nil
}
}
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
return nil, fmt.Errorf("join bundle envelope is missing signed install profile payload")
}
func verifyJoinBundleEnvelope(payload []byte, profileKey string, profileRaw json.RawMessage) error {
var envelope joinBundleEnvelope
if err := json.Unmarshal(payload, &envelope); err != nil {
return fmt.Errorf("decode join bundle envelope: %w", err)
}
if envelope.ClusterAuthority == nil && len(bytes.TrimSpace(envelope.AuthorityPayload)) == 0 && envelope.AuthoritySignature == nil {
return fmt.Errorf("join bundle authority envelope is missing")
}
if envelope.ClusterAuthority == nil || len(bytes.TrimSpace(envelope.AuthorityPayload)) == 0 || envelope.AuthoritySignature == nil {
return fmt.Errorf("join bundle authority envelope is incomplete")
}
envelopeClusterID := strings.TrimSpace(envelope.ClusterID)
authorityClusterID := strings.TrimSpace(envelope.ClusterAuthority.ClusterID)
if envelopeClusterID == "" || authorityClusterID == "" || envelopeClusterID != authorityClusterID {
return fmt.Errorf("join bundle cluster identity is inconsistent")
}
signature := clusterauth.Signature{
SchemaVersion: envelope.AuthoritySignature.SchemaVersion,
Algorithm: envelope.AuthoritySignature.Algorithm,
KeyFingerprint: envelope.AuthoritySignature.KeyFingerprint,
Signature: envelope.AuthoritySignature.Signature,
}
if err := clusterauth.VerifyRaw(envelope.ClusterAuthority.PublicKey, envelope.AuthorityPayload, signature); err != nil {
return fmt.Errorf("verify join bundle authority signature: %w", err)
}
var signedProfiles map[string]json.RawMessage
if err := json.Unmarshal(envelope.AuthorityPayload, &signedProfiles); err != nil {
return fmt.Errorf("decode join bundle authority payload: %w", err)
}
signedProfileRaw := signedProfiles[profileKey]
if len(bytes.TrimSpace(signedProfileRaw)) == 0 {
return fmt.Errorf("join bundle authority payload missing %s", profileKey)
}
want, err := clusterauth.CanonicalJSON(signedProfileRaw)
if err != nil {
return LinuxInstallProfile{}, err
return err
}
httpReq.Header.Set("Content-Type", "application/json")
resp, err := httpClient.Do(httpReq)
got, err := clusterauth.CanonicalJSON(profileRaw)
if err != nil {
return LinuxInstallProfile{}, err
return err
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return LinuxInstallProfile{}, fmt.Errorf("fetch linux install profile: %s", resp.Status)
if !bytes.Equal(want, got) {
return fmt.Errorf("join bundle profile does not match signed authority payload")
}
var envelope struct {
Profile LinuxInstallProfile `json:"linux_install_profile"`
authorityPublicKey := strings.TrimSpace(envelope.ClusterAuthority.PublicKey)
if authorityPublicKey == "" {
return fmt.Errorf("join bundle authority public key is empty")
}
if err := json.NewDecoder(resp.Body).Decode(&envelope); err != nil {
return LinuxInstallProfile{}, err
if fingerprint := strings.TrimSpace(envelope.ClusterAuthority.PublicKeyFingerprint); fingerprint != "" {
publicKey, err := base64.StdEncoding.DecodeString(authorityPublicKey)
if err != nil || len(publicKey) != ed25519.PublicKeySize {
return fmt.Errorf("join bundle authority public key is invalid")
}
if fingerprint != clusterauth.Fingerprint(ed25519.PublicKey(publicKey)) {
return fmt.Errorf("join bundle authority fingerprint does not match authority public key")
}
}
if strings.TrimSpace(envelope.Profile.BackendURL) == "" && len(envelope.Profile.ControlPlaneEndpoints) > 0 {
envelope.Profile.BackendURL = envelope.Profile.ControlPlaneEndpoints[0]
var identity joinBundleProfileIdentity
if err := json.Unmarshal(profileRaw, &identity); err != nil {
return fmt.Errorf("decode join bundle profile identity: %w", err)
}
return envelope.Profile, nil
if strings.TrimSpace(identity.ClusterID) == "" || strings.TrimSpace(identity.ClusterID) != envelopeClusterID {
return fmt.Errorf("join bundle profile cluster_id does not match signed bundle cluster_id")
}
if strings.TrimSpace(identity.ClusterAuthorityPublicKey) == "" || strings.TrimSpace(identity.ClusterAuthorityPublicKey) != authorityPublicKey {
return fmt.Errorf("join bundle profile authority key does not match signed bundle authority key")
}
return nil
}
func RuntimeConfigFromProfile(profile DockerInstallProfile) RuntimeConfig {
return RuntimeConfig{
BackendURL: profile.BackendURL,
ClusterID: profile.ClusterID,
ClusterAuthorityPublicKey: strings.TrimSpace(profile.ClusterAuthorityPublicKey),
JoinToken: profile.JoinToken,
NodeName: profile.NodeName,
Image: profile.Image,
@@ -300,7 +363,7 @@ func RuntimeConfigFromProfile(profile DockerInstallProfile) RuntimeConfig {
Replace: profile.Replace,
DockerVPNGatewayEnabled: profile.DockerVPNGatewayEnabled,
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
FabricRuntimeEnabled: profile.FabricRuntimeEnabled,
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
VPNFabricSessionTransportEnabled: profile.VPNFabricSessionTransportEnabled,
MeshQUICFabricEnabled: profile.MeshQUICFabricEnabled,
@@ -308,16 +371,19 @@ func RuntimeConfigFromProfile(profile DockerInstallProfile) RuntimeConfig {
VPNFabricSessionStreamShards: profile.VPNFabricSessionStreamShards,
VPNFabricQUICMaxStreamsPerConn: profile.VPNFabricQUICMaxStreamsPerConn,
VPNFabricQUICIdleTTLSeconds: profile.VPNFabricQUICIdleTTLSeconds,
MeshListenAddr: profile.MeshListenAddr,
MeshListenPortMode: profile.MeshListenPortMode,
MeshListenAutoPortStart: profile.MeshListenAutoPortStart,
MeshListenAutoPortEnd: profile.MeshListenAutoPortEnd,
FabricListenAddr: profile.FabricListenAddr,
FabricListenPortMode: profile.FabricListenPortMode,
FabricListenAutoPortStart: profile.FabricListenAutoPortStart,
FabricListenAutoPortEnd: profile.FabricListenAutoPortEnd,
MeshAdvertiseEndpoint: profile.MeshAdvertiseEndpoint,
MeshAdvertiseEndpointsJSON: string(profile.MeshAdvertiseEndpointsJSON),
FabricRegistryRecordsJSON: string(profile.FabricRegistryRecords),
MeshAdvertiseTransport: profile.MeshAdvertiseTransport,
MeshConnectivityMode: profile.MeshConnectivityMode,
MeshNATType: profile.MeshNATType,
MeshSiteID: profile.MeshSiteID,
MeshLocalityGroupID: firstNonEmpty(profile.MeshLocalityGroupID, profile.MeshSiteID),
MeshNATGroupID: profile.MeshNATGroupID,
MeshRegion: profile.MeshRegion,
HeartbeatIntervalSeconds: profile.HeartbeatIntervalSeconds,
EnrollmentPollIntervalSeconds: profile.EnrollmentPollIntervalSeconds,
@@ -2,6 +2,7 @@ package hostagent
import (
"context"
"encoding/json"
"errors"
"fmt"
"os"
@@ -10,7 +11,6 @@ import (
)
type HostAgentUpdateRequest struct {
BackendURL string
ClusterID string
NodeID string
StateDir string
@@ -40,7 +40,6 @@ type HostAgentUpdateLoopConfig struct {
func (req HostAgentUpdateRequest) updateRequest() UpdateRequest {
return UpdateRequest{
BackendURL: req.BackendURL,
ClusterID: req.ClusterID,
NodeID: req.NodeID,
StateDir: req.StateDir,
@@ -79,6 +78,7 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
}
if plan.Action != "update" {
if !req.DryRun {
_ = saveUpdatePlanState(resolved, plan, resolved.CurrentVersion, "host-agent-service", binaryPath)
status := statusFromNoopPlan(resolved, plan)
status.Product = HostAgentUpdateProduct
if status.Payload == nil {
@@ -102,7 +102,6 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
if req.DryRun {
return result, nil
}
urls := artifactURLsForBackend(*plan.Artifact, resolved.BackendURL)
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct,
CurrentVersion: resolved.CurrentVersion,
@@ -111,14 +110,24 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
Status: "started",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": binaryPath},
Payload: map[string]any{"artifact_id": plan.Artifact.ID, "binary_path": binaryPath, "transport": updateArtifactTransport(resolved, plan)},
})
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
path, distributors, err := downloadUpdateArtifact(ctx, resolved, plan)
if err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, statusFromError(resolved, plan, "download", "failed", err))
return result, err
}
defer os.Remove(path)
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct,
CurrentVersion: resolved.CurrentVersion,
TargetVersion: plan.TargetVersion,
Phase: "download",
Status: "succeeded",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"artifact_id": plan.Artifact.ID, "binary_path": binaryPath, "fabric_distributors": distributors, "transport": updateArtifactTransport(resolved, plan)},
})
if err := installHostAgentBinary(path, binaryPath); err != nil {
stageErr := stageHostAgentBinary(path, binaryPath)
if stageErr == nil {
@@ -129,7 +138,24 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
TargetVersion: plan.TargetVersion,
ContainerName: "host-agent-service",
Image: binaryPath,
UpdatedAt: time.Now().UTC(),
PlanAction: plan.Action,
PlanReason: plan.Reason,
UpdateIntent: plan.UpdateIntent,
RolloutLease: plan.RolloutLease,
AuthorityPayload: func() json.RawMessage {
if len(plan.AuthorityPayload) == 0 {
return nil
}
return append(json.RawMessage(nil), plan.AuthorityPayload...)
}(),
AuthoritySignature: func() json.RawMessage {
if len(plan.AuthoritySignature) == 0 {
return nil
}
return append(json.RawMessage(nil), plan.AuthoritySignature...)
}(),
AuthorityQuorum: plan.AuthorityQuorum,
UpdatedAt: time.Now().UTC(),
})
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct,
@@ -149,14 +175,7 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
result.Loaded = true
result.Replaced = true
result.RestartNeeded = true
_ = saveUpdateState(resolved.StateDir, UpdateState{
Product: HostAgentUpdateProduct,
CurrentVersion: plan.TargetVersion,
TargetVersion: plan.TargetVersion,
ContainerName: "host-agent-service",
Image: binaryPath,
UpdatedAt: time.Now().UTC(),
})
_ = saveUpdatePlanState(resolved, plan, plan.TargetVersion, "host-agent-service", binaryPath)
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct,
CurrentVersion: resolved.CurrentVersion,
@@ -183,7 +202,7 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
func (m DockerManager) RunHostAgentUpdateLoop(ctx context.Context, cfg HostAgentUpdateLoopConfig) error {
if cfg.Interval == 0 {
cfg.Interval = time.Hour
cfg.Interval = time.Duration(DefaultUpdateIntervalSec) * time.Second
}
if cfg.InitialDelay < 0 || cfg.Interval < 0 {
return errors.New("host-agent update loop durations must not be negative")
@@ -191,6 +210,9 @@ func (m DockerManager) RunHostAgentUpdateLoop(ctx context.Context, cfg HostAgent
if cfg.Jitter < 0 || cfg.Jitter > 1 {
return errors.New("host-agent update loop jitter must be between 0 and 1")
}
if err := ReconcileSignedUpdateState(cfg.Request.StateDir); err != nil {
return err
}
logf := cfg.Logf
if logf == nil {
logf = func(string, ...any) {}
@@ -202,6 +224,7 @@ func (m DockerManager) RunHostAgentUpdateLoop(ctx context.Context, cfg HostAgent
}
runs := 0
req := cfg.Request
lastTriggerGeneration := currentUpdateTriggerGeneration(req.StateDir)
for {
runs++
result, err := m.ApplyHostAgentUpdate(ctx, req)
@@ -210,6 +233,7 @@ func (m DockerManager) RunHostAgentUpdateLoop(ctx context.Context, cfg HostAgent
logf("host_agent_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, req.StateDir)
} else {
logf("host_agent_update_loop run=%d status=failed error=%v", runs, err)
saveUpdateLoopRescueState(req.updateRequest(), "host_agent_self_update_failed", err)
if cfg.StopOnError {
return err
}
@@ -231,7 +255,7 @@ func (m DockerManager) RunHostAgentUpdateLoop(ctx context.Context, cfg HostAgent
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
return nil
}
if err := sleepContext(ctx, jitteredDuration(cfg.Interval, cfg.Jitter)); err != nil {
if err := sleepUntilUpdateIntervalOrTrigger(ctx, req.StateDir, jitteredDuration(cfg.Interval, cfg.Jitter), &lastTriggerGeneration); err != nil {
return err
}
}
@@ -13,6 +13,7 @@ import (
// Defaults applied by the update-service installer and the update loops when
// the corresponding configuration value is left empty or zero.
const (
// DefaultHostAgentInstallPath is the default filesystem path of the
// rap-host-agent binary.
// NOTE(review): presumably the fallback for the binary install path; the
// consumer is not visible here — confirm.
DefaultHostAgentInstallPath = "/usr/local/bin/rap-host-agent"
// DefaultSystemdUnitDir is the directory unit files are placed in when the
// service configuration does not name a unit directory.
DefaultSystemdUnitDir = "/etc/systemd/system"
// DefaultUpdateIntervalSec is the update poll interval, in seconds, used
// when no interval is configured.
DefaultUpdateIntervalSec = 120
)
type UpdateServiceConfig struct {
@@ -62,7 +63,7 @@ func (m DockerManager) InstallUpdateService(ctx context.Context, cfg UpdateServi
cfg.Product = DefaultUpdateProduct
}
if cfg.IntervalSeconds == 0 {
cfg.IntervalSeconds = 21600
cfg.IntervalSeconds = DefaultUpdateIntervalSec
}
if cfg.Jitter == 0 {
cfg.Jitter = 0.15
@@ -173,8 +174,11 @@ func (m DockerManager) InstallUpdateService(ctx context.Context, cfg UpdateServi
func buildUpdateServiceUnit(cfg UpdateServiceConfig) (string, error) {
runtimeCfg := cfg.RuntimeConfig.Normalize()
var missing []string
if runtimeCfg.BackendURL == "" && runtimeCfg.FabricRegistryRecordsJSON == "" {
missing = append(missing, "backend-url-or-fabric-registry-records-json")
if runtimeCfg.FabricRegistryRecordsJSON == "" {
missing = append(missing, "fabric-registry-records-json")
}
if runtimeCfg.ClusterAuthorityPublicKey == "" {
missing = append(missing, "cluster-authority-public-key")
}
if runtimeCfg.ClusterID == "" {
missing = append(missing, "cluster-id")
@@ -201,13 +205,10 @@ func buildUpdateServiceUnit(cfg UpdateServiceConfig) (string, error) {
"--jitter", fmt.Sprintf("%.3f", cfg.Jitter),
"--health-timeout-seconds", fmt.Sprintf("%d", cfg.HealthTimeoutSec),
}
if runtimeCfg.BackendURL != "" {
args = append(args, "--backend-url", runtimeCfg.BackendURL)
}
if strings.TrimSpace(cfg.Channel) != "" {
args = append(args, "--channel", strings.TrimSpace(cfg.Channel))
}
args = appendFabricUpdateArgs(args, runtimeCfg)
args = appendFabricUpdateArgs(args, runtimeCfg, true)
execStart := systemdJoin(args)
return fmt.Sprintf(`[Unit]
Description=RAP host-agent updater for %s
@@ -228,8 +229,8 @@ WantedBy=multi-user.target
func buildHostAgentSelfUpdateUnit(cfg UpdateServiceConfig) (string, string, string, error) {
runtimeCfg := cfg.RuntimeConfig.Normalize()
if (runtimeCfg.BackendURL == "" && runtimeCfg.FabricRegistryRecordsJSON == "") || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
return "", "", "", fmt.Errorf("backend-url-or-fabric-registry-records-json, cluster-id, and state-dir are required for host-agent self updater")
if runtimeCfg.FabricRegistryRecordsJSON == "" || runtimeCfg.ClusterAuthorityPublicKey == "" || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
return "", "", "", fmt.Errorf("fabric-registry-records-json, cluster-authority-public-key, cluster-id, and state-dir are required for host-agent self updater")
}
unitName := "rap-host-agent-self-updater.service"
unitPath := filepath.Join(firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir), unitName)
@@ -245,13 +246,10 @@ func buildHostAgentSelfUpdateUnit(cfg UpdateServiceConfig) (string, string, stri
"--initial-delay-seconds", fmt.Sprintf("%d", cfg.InitialDelaySeconds+30),
"--jitter", fmt.Sprintf("%.3f", cfg.Jitter),
}
if runtimeCfg.BackendURL != "" {
args = append(args, "--backend-url", runtimeCfg.BackendURL)
}
if strings.TrimSpace(cfg.Channel) != "" {
args = append(args, "--channel", strings.TrimSpace(cfg.Channel))
}
args = appendFabricUpdateArgs(args, runtimeCfg)
args = appendFabricUpdateArgs(args, runtimeCfg, true)
return fmt.Sprintf(`[Unit]
Description=RAP host-agent self updater
After=network-online.target docker.service
@@ -271,8 +269,8 @@ WantedBy=multi-user.target
func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string, error) {
runtimeCfg := cfg.RuntimeConfig.Normalize()
if (runtimeCfg.BackendURL == "" && runtimeCfg.FabricRegistryRecordsJSON == "") || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
return "", "", "", fmt.Errorf("backend-url-or-fabric-registry-records-json, cluster-id, and state-dir are required for host monitor")
if runtimeCfg.FabricRegistryRecordsJSON == "" || runtimeCfg.ClusterAuthorityPublicKey == "" || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
return "", "", "", fmt.Errorf("fabric-registry-records-json, cluster-authority-public-key, cluster-id, and state-dir are required for host monitor")
}
containers := uniqueTrimmed(append([]string{runtimeCfg.ContainerName}, cfg.MonitorContainers...))
if len(containers) == 0 {
@@ -291,9 +289,6 @@ func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string,
"--disk-cleanup-percent", fmt.Sprintf("%d", firstNonZero(cfg.MonitorDiskCleanup, DefaultMonitorDiskCleanupPercent)),
"--disk-critical-percent", fmt.Sprintf("%d", firstNonZero(cfg.MonitorDiskCritical, DefaultMonitorDiskCriticalPercent)),
}
if runtimeCfg.BackendURL != "" {
args = append(args, "--backend-url", runtimeCfg.BackendURL)
}
if cfg.MonitorCleanupDocker {
args = append(args, "--cleanup-docker")
}
@@ -303,7 +298,7 @@ func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string,
for _, container := range containers {
args = append(args, "--watch-container", container)
}
args = appendFabricUpdateArgs(args, runtimeCfg)
args = appendFabricUpdateArgs(args, runtimeCfg, true)
return fmt.Sprintf(`[Unit]
Description=RAP host-agent monitor for %s
After=network-online.target docker.service
@@ -321,13 +316,25 @@ WantedBy=multi-user.target
`, runtimeCfg.ContainerName, systemdJoin(args)), unitName, unitPath, nil
}
func appendFabricUpdateArgs(args []string, runtimeCfg RuntimeConfig) []string {
if strings.TrimSpace(runtimeCfg.FabricRegistryRecordsJSON) != "" {
// appendFabricUpdateArgs appends the fabric-related command-line flags taken
// from runtimeCfg to args and returns the extended slice.
//
// Every value is whitespace-trimmed and its flag is emitted only when the
// trimmed value is non-empty. The --fabric-registry-records-json flag is
// additionally gated on includeStructured.
func appendFabricUpdateArgs(args []string, runtimeCfg RuntimeConfig, includeStructured bool) []string {
	registry := strings.TrimSpace(runtimeCfg.FabricRegistryRecordsJSON)
	if includeStructured && registry != "" {
		args = append(args, "--fabric-registry-records-json", registry)
	}

	// Plain string flags, listed in the order they must appear on the command line.
	optional := []struct {
		flag  string
		value string
	}{
		{"--cluster-authority-public-key", runtimeCfg.ClusterAuthorityPublicKey},
		{"--mesh-region", runtimeCfg.MeshRegion},
		{"--mesh-site-id", runtimeCfg.MeshSiteID},
		{"--mesh-locality-group-id", runtimeCfg.MeshLocalityGroupID},
		{"--mesh-nat-group-id", runtimeCfg.MeshNATGroupID},
	}
	for _, opt := range optional {
		if trimmed := strings.TrimSpace(opt.value); trimmed != "" {
			args = append(args, opt.flag, trimmed)
		}
	}
	return args
}
@@ -18,11 +18,14 @@ func TestInstallUpdateServiceWritesSystemdUnit(t *testing.T) {
binaryPath := filepath.Join(dir, "bin", "rap-host-agent")
result, err := (DockerManager{}).InstallUpdateService(context.Background(), UpdateServiceConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
NodeName: "node-a",
ContainerName: "rap-node-agent-node-a",
StateDir: "/var/lib/rap/nodes/node-a",
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
NodeName: "node-a",
ContainerName: "rap-node-agent-node-a",
StateDir: "/var/lib/rap/nodes/node-a",
MeshSiteID: "home",
MeshLocalityGroupID: "home-lan",
},
CurrentVersion: "0.1.0-current",
IntervalSeconds: 60,
@@ -51,8 +54,11 @@ func TestInstallUpdateServiceWritesSystemdUnit(t *testing.T) {
for _, want := range []string{
"ExecStart=",
" update-loop",
"--backend-url http://control/api/v1",
`--fabric-registry-records-json "[{\"schema\":\"rap.fabric.registry.gossip_record.v1\",\"service_class\":\"control-api\"}]"`,
`--cluster-authority-public-key authority-key-b64`,
"--cluster-id cluster-1",
"--mesh-site-id home",
"--mesh-locality-group-id home-lan",
"--state-dir /var/lib/rap/nodes/node-a",
"--container-name rap-node-agent-node-a",
"--current-version 0.1.0-current",
@@ -76,6 +82,9 @@ func TestInstallUpdateServiceWritesSystemdUnit(t *testing.T) {
if text := string(selfUnit); !strings.Contains(text, "update-host-agent-loop") || !strings.Contains(text, "--current-version 0.1.0-host") {
t.Fatalf("unexpected self unit:\n%s", text)
}
if text := string(selfUnit); !strings.Contains(text, "--fabric-registry-records-json") {
t.Fatalf("unexpected self updater unit structured args:\n%s", text)
}
if result.MonitorUnitName == "" || result.MonitorUnitPath == "" {
t.Fatalf("monitor result = %+v", result)
}
@@ -95,13 +104,57 @@ func TestInstallUpdateServiceWritesSystemdUnit(t *testing.T) {
t.Fatalf("monitor unit missing %q:\n%s", want, monitorText)
}
}
if !strings.Contains(monitorText, "--fabric-registry-records-json") {
t.Fatalf("unexpected monitor unit structured args:\n%s", monitorText)
}
}
func TestInstallUpdateServiceDefaultsToRescuePollInterval(t *testing.T) {
dir := t.TempDir()
source := filepath.Join(dir, "rap-host-agent-src")
if err := os.WriteFile(source, []byte("binary"), 0o755); err != nil {
t.Fatalf("write source: %v", err)
}
result, err := (DockerManager{}).InstallUpdateService(context.Background(), UpdateServiceConfig{
RuntimeConfig: RuntimeConfig{
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
ContainerName: "rap-node-agent-node-a",
StateDir: "/var/lib/rap/nodes/node-a",
},
CurrentVersion: "0.1.0-current",
SourceBinaryPath: source,
BinaryInstallPath: filepath.Join(dir, "bin", "rap-host-agent"),
UnitDir: filepath.Join(dir, "systemd"),
ManageSystemd: false,
InstallSelfUpdater: true,
})
if err != nil {
t.Fatalf("install update service: %v", err)
}
unit, err := os.ReadFile(result.UnitPath)
if err != nil {
t.Fatalf("read update unit: %v", err)
}
if !strings.Contains(string(unit), "--interval-seconds 120") {
t.Fatalf("update unit should default to rescue poll interval:\n%s", unit)
}
selfUnit, err := os.ReadFile(result.SelfUnitPath)
if err != nil {
t.Fatalf("read self update unit: %v", err)
}
if !strings.Contains(string(selfUnit), "--interval-seconds 120") {
t.Fatalf("self update unit should default to rescue poll interval:\n%s", selfUnit)
}
}
func TestWindowsHostAgentUpdateScriptTargetsWindowsService(t *testing.T) {
cfg := WindowsInstallConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
},
NodeID: "node-1",
AutoUpdateCurrentVersion: "0.1.2",
@@ -117,10 +170,11 @@ func TestWindowsHostAgentUpdateScriptTargetsWindowsService(t *testing.T) {
}
script := windowsHostAgentUpdateScript(`C:\Program Files\RAP\win-a\rap-host-agent.exe`, cfg, result)
for _, want := range []string{
":loop",
"RAP_HOST_AGENT_UPDATE_LOCK_DIR",
"rap-host-agent.exe.next",
"update-loop --cluster-id",
"--backend-url \"http://control/api/v1\"",
"update-loop --max-runs 1 --cluster-id",
`--fabric-registry-records-json [{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
"--cluster-authority-public-key authority-key-b64",
"--cluster-id \"cluster-1\"",
"--node-id \"node-1\"",
"--state-dir \"C:\\ProgramData\\RAP\\nodes\\win-a\"",
@@ -131,7 +185,7 @@ func TestWindowsHostAgentUpdateScriptTargetsWindowsService(t *testing.T) {
"--current-version 0.1.2",
"--host-agent-current-version 0.1.2",
"--interval-seconds 120",
"timeout /t 120",
"wake-interval-seconds 120",
} {
if !strings.Contains(script, want) {
t.Fatalf("script missing %q:\n%s", want, script)
@@ -139,12 +193,12 @@ func TestWindowsHostAgentUpdateScriptTargetsWindowsService(t *testing.T) {
}
}
func TestWindowsHostAgentUpdateScriptOmitsEmptyBackendURL(t *testing.T) {
func TestWindowsHostAgentUpdateScriptIncludesFabricRegistry(t *testing.T) {
cfg := WindowsInstallConfig{
RuntimeConfig: RuntimeConfig{
ClusterID: "cluster-1",
FabricRegistryRecordsJSON: `[{"record_id":"r1"}]`,
MeshRegion: "ru-msk",
ClusterID: "cluster-1",
FabricRegistryRecordsJSON: `[{"record_id":"r1"}]`,
MeshRegion: "ru-msk",
},
AutoUpdateCurrentVersion: "0.1.2",
}
@@ -155,9 +209,6 @@ func TestWindowsHostAgentUpdateScriptOmitsEmptyBackendURL(t *testing.T) {
TaskName: "RAP Node Agent win-a",
}
script := windowsHostAgentUpdateScript(`C:\Program Files\RAP\win-a\rap-host-agent.exe`, cfg, result)
if strings.Contains(script, "--backend-url") {
t.Fatalf("script must not include backend-url when it is empty:\n%s", script)
}
for _, want := range []string{
`--fabric-registry-records-json [{"record_id":"r1"}]`,
"--mesh-region ru-msk",
@@ -171,9 +222,10 @@ func TestWindowsHostAgentUpdateScriptOmitsEmptyBackendURL(t *testing.T) {
func TestWindowsInstallReplaceAllowsExistingNodeWithoutJoinToken(t *testing.T) {
result, err := (WindowsManager{}).Install(context.Background(), WindowsInstallConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
NodeName: "win-a",
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
NodeName: "win-a",
},
InstallDir: `C:\Program Files\RAP\win-a`,
Replace: true,
@@ -202,8 +254,9 @@ func TestWindowsRepairUpdaterStartsFromUnknownVersion(t *testing.T) {
StartupMode: "user-task",
}, WindowsInstallConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
},
Replace: true,
AutoUpdateEnabled: true,
@@ -219,4 +272,57 @@ func TestWindowsRepairUpdaterStartsFromUnknownVersion(t *testing.T) {
if !strings.Contains(string(script), "--current-version 0.0.0") {
t.Fatalf("repair updater should force unknown current version:\n%s", script)
}
if !strings.Contains(string(script), "--max-runs 1") {
t.Fatalf("repair updater should run one-shot update-loop:\n%s", script)
}
if !strings.Contains(string(script), "RAP_HOST_AGENT_UPDATE_LOCK_DIR") {
t.Fatalf("repair updater should guard against overlapping runs:\n%s", script)
}
if !strings.Contains(string(script), "--interval-seconds 120") {
t.Fatalf("repair updater should use rescue poll interval:\n%s", script)
}
if !strings.Contains(string(script), "wake-interval-seconds 120") {
t.Fatalf("repair updater should document wake interval:\n%s", script)
}
}
// TestWindowsRepairUpdaterUsesRecurringScheduledTask installs the Windows
// host-agent updater through a recording runner and checks that one of the
// recorded "schtasks /Create" invocations contains both "/SC MINUTE" and
// "/MO 5".
func TestWindowsRepairUpdaterUsesRecurringScheduledTask(t *testing.T) {
	installDir := t.TempDir()
	hostAgentSource := filepath.Join(installDir, "rap-host-agent.exe")
	if writeErr := os.WriteFile(hostAgentSource, []byte("binary"), 0o755); writeErr != nil {
		t.Fatalf("write source: %v", writeErr)
	}

	runner := &recordingRunner{}
	installed := WindowsInstallResult{
		NodeName:      "win-a",
		InstallDir:    installDir,
		StateDir:      installDir,
		NodeAgentPath: filepath.Join(installDir, "rap-node-agent.exe"),
		TaskName:      "RAP Node Agent win-a",
		StartupMode:   "user-task",
	}
	cfg := WindowsInstallConfig{
		RuntimeConfig: RuntimeConfig{
			ClusterID: "cluster-1",
		},
		Replace:                   true,
		AutoUpdateEnabled:         true,
		AutoUpdateIntervalSeconds: 21600,
		HostAgentSourcePath:       hostAgentSource,
	}
	if _, err := installWindowsHostAgentUpdater(context.Background(), WindowsManager{Runner: runner}, installed, cfg); err != nil {
		t.Fatalf("install updater: %v", err)
	}

	for _, call := range runner.calls {
		// Only sufficiently long "schtasks /Create" invocations are candidates.
		if len(call) < 8 || call[0] != "schtasks" || call[1] != "/Create" {
			continue
		}
		commandLine := strings.Join(call, " ")
		if strings.Contains(commandLine, "/SC MINUTE") && strings.Contains(commandLine, "/MO 5") {
			return // recurring minute task found
		}
	}
	t.Fatalf("expected recurring minute task, got %#v", runner.calls)
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -58,13 +58,14 @@ func WindowsInstallConfigFromProfile(profile WindowsInstallProfile) WindowsInsta
stateDir := firstNonEmpty(profile.StateDir, filepath.Join(DefaultWindowsStateRoot, safeUnitSlug(profile.NodeName)))
return WindowsInstallConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: profile.BackendURL,
ClusterAuthorityPublicKey: strings.TrimSpace(profile.ClusterAuthorityPublicKey),
FabricRegistryRecordsJSON: strings.TrimSpace(string(profile.FabricRegistryRecords)),
ClusterID: profile.ClusterID,
JoinToken: profile.JoinToken,
NodeName: profile.NodeName,
StateDir: stateDir,
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
FabricRuntimeEnabled: profile.FabricRuntimeEnabled,
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
VPNFabricSessionTransportEnabled: profile.VPNFabricSessionTransportEnabled,
MeshQUICFabricEnabled: profile.MeshQUICFabricEnabled,
@@ -72,15 +73,18 @@ func WindowsInstallConfigFromProfile(profile WindowsInstallProfile) WindowsInsta
VPNFabricSessionStreamShards: profile.VPNFabricSessionStreamShards,
VPNFabricQUICMaxStreamsPerConn: profile.VPNFabricQUICMaxStreamsPerConn,
VPNFabricQUICIdleTTLSeconds: profile.VPNFabricQUICIdleTTLSeconds,
MeshListenAddr: profile.MeshListenAddr,
MeshListenPortMode: profile.MeshListenPortMode,
MeshListenAutoPortStart: profile.MeshListenAutoPortStart,
MeshListenAutoPortEnd: profile.MeshListenAutoPortEnd,
FabricListenAddr: profile.FabricListenAddr,
FabricListenPortMode: profile.FabricListenPortMode,
FabricListenAutoPortStart: profile.FabricListenAutoPortStart,
FabricListenAutoPortEnd: profile.FabricListenAutoPortEnd,
MeshAdvertiseEndpoint: profile.MeshAdvertiseEndpoint,
MeshAdvertiseEndpointsJSON: string(profile.MeshAdvertiseEndpointsJSON),
MeshAdvertiseTransport: profile.MeshAdvertiseTransport,
MeshConnectivityMode: profile.MeshConnectivityMode,
MeshNATType: profile.MeshNATType,
MeshSiteID: profile.MeshSiteID,
MeshLocalityGroupID: firstNonEmpty(profile.MeshLocalityGroupID, profile.MeshSiteID),
MeshNATGroupID: profile.MeshNATGroupID,
MeshRegion: profile.MeshRegion,
HeartbeatIntervalSeconds: profile.HeartbeatIntervalSeconds,
EnrollmentPollIntervalSeconds: profile.EnrollmentPollIntervalSeconds,
@@ -2,10 +2,12 @@ package hostagent
import (
"context"
"encoding/json"
"errors"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"time"
)
@@ -42,12 +44,22 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
}
if plan.Action != "update" {
if !req.DryRun {
restarted, err := rewriteWindowsControlPlaneRuntime(ctx, runner, m, req, plan)
if err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "rewrite_runtime", "failed", err))
return result, err
}
result.RestartNeeded = restarted
}
if !req.DryRun {
_ = saveUpdatePlanState(req, plan, req.CurrentVersion, req.WindowsTaskName, req.BinaryPath)
status := statusFromNoopPlan(req, plan)
if status.Payload == nil {
status.Payload = map[string]any{}
}
status.Payload["task"] = req.WindowsTaskName
status.Payload["binary_path"] = req.BinaryPath
status.Payload["restart_needed"] = result.RestartNeeded
_ = ReportNodeUpdateStatusForRequest(ctx, req, status)
}
return result, nil
@@ -78,9 +90,8 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
Status: "accepted",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"strategy": plan.Strategy, "reason": plan.Reason, "task": req.WindowsTaskName},
Payload: updatePlanStatusPayload(plan),
})
urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL)
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
Product: req.Product,
CurrentVersion: req.CurrentVersion,
@@ -89,14 +100,24 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
Status: "started",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": req.BinaryPath},
Payload: map[string]any{"artifact_id": plan.Artifact.ID, "binary_path": req.BinaryPath, "transport": updateArtifactTransport(req, plan)},
})
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
path, distributors, err := downloadUpdateArtifact(ctx, req, plan)
if err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "download", "failed", err))
return result, err
}
defer os.Remove(path)
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
Product: req.Product,
CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion,
Phase: "download",
Status: "succeeded",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"artifact_id": plan.Artifact.ID, "binary_path": req.BinaryPath, "fabric_distributors": distributors, "transport": updateArtifactTransport(req, plan)},
})
m.stopExistingNodeAgent(ctx, req.WindowsTaskName, req.BinaryPath)
if err := copyFile(path, req.BinaryPath, 0o755); err != nil {
m.stopExistingNodeAgent(ctx, req.WindowsTaskName, req.BinaryPath)
@@ -106,10 +127,18 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
}
}
result.Replaced = true
if _, err := runner.Run(ctx, "schtasks", "/Run", "/TN", req.WindowsTaskName); err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "restart", "failed", err))
restartedByRewrite, err := rewriteWindowsControlPlaneRuntime(ctx, runner, m, req, plan)
if err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "rewrite_runtime", "failed", err))
return result, err
}
result.RestartNeeded = restartedByRewrite
if !restartedByRewrite {
if _, err := runner.Run(ctx, "schtasks", "/Run", "/TN", req.WindowsTaskName); err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "restart", "failed", err))
return result, err
}
}
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
Product: req.Product,
CurrentVersion: req.CurrentVersion,
@@ -120,16 +149,105 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"task": req.WindowsTaskName, "binary_path": req.BinaryPath},
})
_ = saveUpdateState(req.StateDir, UpdateState{
Product: req.Product,
CurrentVersion: plan.TargetVersion,
TargetVersion: plan.TargetVersion,
Image: req.BinaryPath,
UpdatedAt: time.Now().UTC(),
})
_ = saveUpdatePlanState(req, plan, plan.TargetVersion, req.WindowsTaskName, req.BinaryPath)
return result, nil
}
// rewriteWindowsControlPlaneRuntime records the control-plane runtime state
// carried by plan and propagates the plan's fabric registry records into the
// Windows install directory derived from req.BinaryPath.
//
// Two files are updated when they exist in that directory:
//   - rap-node-agent.env.cmd: the RAP_FABRIC_REGISTRY_RECORDS_JSON assignment;
//   - rap-host-agent-update.cmd: the --fabric-registry-records-json argument.
//
// When at least one file was rewritten, the node agent is stopped and the
// scheduled task req.WindowsTaskName is started again.
//
// The returned bool is true only when a file changed and the task was started
// successfully. Nothing is rewritten (false, nil) when the plan carries no
// registry records or req.BinaryPath is empty.
func rewriteWindowsControlPlaneRuntime(ctx context.Context, runner CommandRunner, manager WindowsManager, req UpdateRequest, plan NodeUpdatePlan) (bool, error) {
	// Best effort: a failure to persist the state snapshot must not block the
	// runtime rewrite below, so the error is deliberately ignored.
	_ = saveControlPlaneRuntimeState(req.StateDir, ControlPlaneRuntimeState{
		SchemaVersion:         "rap.control_plane_runtime_state.v1",
		ClusterID:             strings.TrimSpace(plan.ClusterID),
		NodeID:                strings.TrimSpace(plan.NodeID),
		Product:               strings.TrimSpace(plan.Product),
		FabricRegistryRecords: append(json.RawMessage(nil), plan.FabricRegistryRecords...),
		AuthorityPayload:      append(json.RawMessage(nil), plan.AuthorityPayload...),
		AuthoritySignature:    append(json.RawMessage(nil), plan.AuthoritySignature...),
		AuthorityQuorum:       plan.AuthorityQuorum,
		UpdatedAt:             time.Now().UTC(),
	})

	// filepath.Dir("") yields ".", never "", so the emptiness check has to be
	// made on the path itself; otherwise an empty BinaryPath would make this
	// function rewrite files in the current working directory.
	binaryPath := strings.TrimSpace(req.BinaryPath)
	envRegistry := strings.TrimSpace(string(plan.FabricRegistryRecords))
	if binaryPath == "" || envRegistry == "" {
		return false, nil
	}
	installDir := filepath.Dir(binaryPath)
	changed := false

	envPath := filepath.Join(installDir, "rap-node-agent.env.cmd")
	if fileExists(envPath) {
		current, err := os.ReadFile(envPath)
		if err != nil {
			return false, fmt.Errorf("read %s: %w", envPath, err)
		}
		updatedEnv := upsertWindowsEnvValue(string(current), "RAP_FABRIC_REGISTRY_RECORDS_JSON", envRegistry)
		if updatedEnv != string(current) {
			if err := os.WriteFile(envPath, []byte(updatedEnv), 0o600); err != nil {
				return false, fmt.Errorf("write %s: %w", envPath, err)
			}
			changed = true
		}
	}

	wrapperPath := filepath.Join(installDir, "rap-host-agent-update.cmd")
	if fileExists(wrapperPath) {
		script, err := os.ReadFile(wrapperPath)
		if err != nil {
			return false, fmt.Errorf("read %s: %w", wrapperPath, err)
		}
		updated := replaceCLIArg(string(script), "--fabric-registry-records-json", envRegistry, true)
		if updated != string(script) {
			if err := os.WriteFile(wrapperPath, []byte(updated), 0o755); err != nil {
				return false, fmt.Errorf("write %s: %w", wrapperPath, err)
			}
			changed = true
		}
	}

	if !changed {
		return false, nil
	}
	// Restart the node agent so it picks up the rewritten configuration.
	manager.stopExistingNodeAgent(ctx, req.WindowsTaskName, req.BinaryPath)
	if _, err := runner.Run(ctx, "schtasks", "/Run", "/TN", req.WindowsTaskName); err != nil {
		return false, fmt.Errorf("restart task %q: %w", req.WindowsTaskName, err)
	}
	return true, nil
}
// upsertWindowsEnvValue sets, replaces or removes a "set KEY=value" line in a
// Windows batch script and returns the resulting script text.
//
//   - If a line starting with "set KEY=" exists (matched case-insensitively,
//     ignoring surrounding whitespace), the first such line is replaced with
//     "set KEY=value", or removed when value is empty.
//   - Otherwise, when value is non-empty, a new line is inserted directly
//     after the first "@echo off" line, or at the end of the script when there
//     is no such line.
//   - When value is empty and no matching line exists, script is returned
//     unchanged.
//
// The script's line-ending style is preserved: a replaced line keeps its
// trailing "\r", and an inserted line is CRLF-terminated when the script uses
// CRLF. Re-applying an already present value therefore returns the input
// byte-for-byte, which lets callers detect "no change" by string comparison.
func upsertWindowsEnvValue(script string, key string, value string) string {
	prefix := "set " + key + "="
	lowerPrefix := strings.ToLower(prefix)
	lines := strings.Split(script, "\n")

	for i, line := range lines {
		if !strings.HasPrefix(strings.ToLower(strings.TrimSpace(line)), lowerPrefix) {
			continue
		}
		if value == "" {
			lines = append(lines[:i], lines[i+1:]...)
		} else {
			replacement := prefix + value
			if strings.HasSuffix(line, "\r") {
				// Keep the CR so a CRLF script does not end up with a lone LF.
				replacement += "\r"
			}
			lines[i] = replacement
		}
		return strings.Join(lines, "\n")
	}
	if value == "" {
		return script
	}

	insertAt := -1
	for i, line := range lines {
		if strings.EqualFold(strings.TrimSpace(line), "@echo off") {
			insertAt = i + 1
			break
		}
	}
	if insertAt < 0 {
		insertAt = len(lines)
		// A trailing empty element means the script ends with a newline (or is
		// empty); insert before it so the result still ends with exactly one.
		if lines[len(lines)-1] == "" {
			insertAt--
		}
	}

	assignment := prefix + value
	usesCRLF := strings.Contains(script, "\r\n")
	switch {
	case insertAt == len(lines):
		// The new line becomes the unterminated last line; make sure the line
		// it follows is terminated in the script's own style.
		if usesCRLF && !strings.HasSuffix(lines[insertAt-1], "\r") {
			lines[insertAt-1] += "\r"
		}
	case usesCRLF:
		assignment += "\r"
	}

	result := make([]string, 0, len(lines)+1)
	result = append(result, lines[:insertAt]...)
	result = append(result, assignment)
	result = append(result, lines[insertAt:]...)
	return strings.Join(result, "\n")
}
func (m WindowsManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig) error {
req := cfg.Request
if strings.TrimSpace(req.InstallType) == "" || req.InstallType == DefaultUpdateInstallType {
@@ -141,6 +259,9 @@ func (m WindowsManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig)
if err := req.Validate(); err != nil {
return err
}
if err := ReconcileSignedUpdateState(req.StateDir); err != nil {
return err
}
if cfg.Interval == 0 {
cfg.Interval = time.Hour
}
@@ -179,6 +300,7 @@ func (m WindowsManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig)
continue
}
logf("windows_update_loop run=%d status=failed error=%v", runs, err)
saveUpdateLoopRescueState(req, "windows_node_agent_update_failed", err)
if cfg.StopOnError {
return err
}
@@ -197,10 +319,12 @@ func (m WindowsManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig)
}
if cfg.HostAgentUpdateEnabled {
hostReq := cfg.HostAgentUpdateRequest
hostReq.BackendURL = firstNonEmpty(hostReq.BackendURL, req.BackendURL)
hostReq.ClusterID = firstNonEmpty(hostReq.ClusterID, req.ClusterID)
hostReq.NodeID = firstNonEmpty(hostReq.NodeID, req.NodeID)
hostReq.StateDir = firstNonEmpty(hostReq.StateDir, req.StateDir)
hostReq.ClusterAuthorityPublicKey = firstNonEmpty(hostReq.ClusterAuthorityPublicKey, req.ClusterAuthorityPublicKey)
hostReq.FabricRegistryRecordsJSON = firstNonEmpty(hostReq.FabricRegistryRecordsJSON, req.FabricRegistryRecordsJSON)
hostReq.MeshRegion = firstNonEmpty(hostReq.MeshRegion, req.MeshRegion)
hostReq.Channel = firstNonEmpty(hostReq.Channel, req.Channel)
hostReq.OS = firstNonEmpty(hostReq.OS, "windows")
hostReq.Arch = firstNonEmpty(hostReq.Arch, "amd64")
@@ -211,6 +335,7 @@ func (m WindowsManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig)
logf("windows_host_agent_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, hostReq.StateDir)
} else {
logf("windows_host_agent_update_loop run=%d status=failed error=%v", runs, hostErr)
saveUpdateLoopRescueState(req, "windows_host_agent_update_failed", hostErr)
if cfg.StopOnError {
return hostErr
}
@@ -257,7 +382,7 @@ func installWindowsHostAgentUpdater(ctx context.Context, m WindowsManager, resul
if err := os.WriteFile(wrapperPath, []byte(script), 0o755); err != nil {
return result, err
}
started, fallback, mode, err := m.installStartupTask(ctx, taskName, wrapperPath, logPath, cfg.StartupMode)
started, fallback, mode, err := m.installRecurringUpdaterTask(ctx, taskName, wrapperPath, logPath, cfg.StartupMode, windowsUpdaterWakeIntervalSeconds(cfg.AutoUpdateIntervalSeconds))
if err != nil {
return result, err
}
@@ -277,7 +402,7 @@ func windowsHostAgentUpdateScript(hostAgentPath string, cfg WindowsInstallConfig
currentVersion := firstNonEmpty(cfg.AutoUpdateCurrentVersion, "0.0.0")
interval := cfg.AutoUpdateIntervalSeconds
if interval == 0 {
interval = 21600
interval = DefaultUpdateIntervalSec
}
initialDelay := cfg.AutoUpdateInitialDelaySeconds
if initialDelay == 0 {
@@ -290,6 +415,7 @@ func windowsHostAgentUpdateScript(hostAgentPath string, cfg WindowsInstallConfig
updateLoopArgs := []string{
`"` + hostAgentPath + `"`,
"update-loop",
"--max-runs", "1",
"--cluster-id", `"` + cfg.RuntimeConfig.ClusterID + `"`,
"--state-dir", `"` + result.StateDir + `"`,
"--current-version", currentVersion,
@@ -305,10 +431,7 @@ func windowsHostAgentUpdateScript(hostAgentPath string, cfg WindowsInstallConfig
"--host-agent-current-version", currentVersion,
"--host-agent-binary-path", `"` + hostAgentPath + `"`,
}
if strings.TrimSpace(cfg.RuntimeConfig.BackendURL) != "" {
updateLoopArgs = append(updateLoopArgs, "--backend-url", `"`+strings.TrimSpace(cfg.RuntimeConfig.BackendURL)+`"`)
}
updateLoopArgs = appendFabricUpdateArgs(updateLoopArgs, cfg.RuntimeConfig)
updateLoopArgs = appendFabricUpdateArgs(updateLoopArgs, cfg.RuntimeConfig, true)
if strings.TrimSpace(cfg.NodeID) != "" {
updateLoopArgs = append(updateLoopArgs, "--node-id", `"`+strings.TrimSpace(cfg.NodeID)+`"`)
}
@@ -320,21 +443,70 @@ func windowsHostAgentUpdateScript(hostAgentPath string, cfg WindowsInstallConfig
"setlocal",
"set RAP_HOST_AGENT=" + `"` + hostAgentPath + `"`,
"set RAP_HOST_AGENT_NEXT=" + `"` + hostAgentPath + `.next"`,
}
if initialDelay > 0 {
lines = append(lines, "timeout /t "+fmt.Sprintf("%d", initialDelay)+" /nobreak >NUL")
"set RAP_HOST_AGENT_UPDATE_LOCK_DIR=" + `"` + filepath.Join(result.StateDir, "rap-host-agent-update.lock") + `"`,
}
lines = append(lines, []string{
":loop",
"2>nul mkdir %RAP_HOST_AGENT_UPDATE_LOCK_DIR%",
"if errorlevel 1 goto :eof",
"if exist %RAP_HOST_AGENT_NEXT% (",
" copy /Y %RAP_HOST_AGENT_NEXT% %RAP_HOST_AGENT% >NUL",
" if %ERRORLEVEL% EQU 0 del /F /Q %RAP_HOST_AGENT_NEXT%",
")",
}...)
if initialDelay > 0 {
lines = append(lines, "timeout /t "+fmt.Sprintf("%d", initialDelay)+" /nobreak >NUL")
}
lines = append(lines, []string{
strings.Join(updateLoopArgs, " "),
"timeout /t " + fmt.Sprintf("%d", interval) + " /nobreak >NUL",
"goto loop",
"endlocal",
"set RAP_HOST_AGENT_UPDATE_EXIT_CODE=%ERRORLEVEL%",
"rmdir /S /Q %RAP_HOST_AGENT_UPDATE_LOCK_DIR% >NUL 2>&1",
"endlocal & exit /b %RAP_HOST_AGENT_UPDATE_EXIT_CODE%",
"rem initial-delay-seconds " + fmt.Sprintf("%d", initialDelay),
"rem wake-interval-seconds " + strconv.Itoa(windowsUpdaterWakeIntervalSeconds(interval)),
}...)
return strings.Join(lines, "\r\n") + "\r\n"
}
// windowsUpdaterWakeIntervalSeconds returns the wake interval, in seconds, for
// the recurring updater task. Positive intervals up to 300 seconds are used
// as given; zero, negative, and larger values all resolve to 300.
func windowsUpdaterWakeIntervalSeconds(intervalSeconds int) int {
	const maxWakeSeconds = 300
	if intervalSeconds > 0 && intervalSeconds <= maxWakeSeconds {
		return intervalSeconds
	}
	return maxWakeSeconds
}
// installRecurringUpdaterTask registers a scheduled task via schtasks that
// runs the wrapper script every N minutes and then triggers it once.
//
// intervalSeconds is rounded up to whole minutes with a minimum of one.
// Mode "none" (any case) is a no-op. Modes "auto" and "system-task" first try
// to create the task as SYSTEM with highest run level; "system-task" returns
// that error on failure, while "auto" falls through to a task created without
// /RU. Any other mode goes straight to that second form.
//
// Returns (started, fellBack, effectiveMode, err); fellBack is true only when
// the mode was "auto" and the SYSTEM attempt did not succeed. Errors from the
// follow-up `schtasks /Run` are ignored.
func (m WindowsManager) installRecurringUpdaterTask(ctx context.Context, taskName, wrapperPath, logPath, mode string, intervalSeconds int) (bool, bool, string, error) {
	if strings.EqualFold(mode, "none") {
		return false, false, mode, nil
	}
	runner := m.Runner
	if runner == nil {
		runner = ExecRunner{}
	}

	// Round the interval up to whole minutes, never below one.
	minutes := intervalSeconds / 60
	if intervalSeconds%60 != 0 {
		minutes++
	}
	if minutes <= 0 {
		minutes = 1
	}
	every := strconv.Itoa(minutes)
	command := windowsTaskAction(wrapperPath, logPath)
	autoMode := mode == "auto"

	if autoMode || mode == "system-task" {
		_, systemErr := runner.Run(ctx, "schtasks", "/Create", "/TN", taskName, "/SC", "MINUTE", "/MO", every, "/RU", "SYSTEM", "/RL", "HIGHEST", "/TR", command, "/F")
		if systemErr == nil {
			_, _ = runner.Run(ctx, "schtasks", "/Run", "/TN", taskName)
			return true, false, "system-task", nil
		}
		if !autoMode {
			return false, false, mode, systemErr
		}
	}

	if _, userErr := runner.Run(ctx, "schtasks", "/Create", "/TN", taskName, "/SC", "MINUTE", "/MO", every, "/TR", command, "/F"); userErr != nil {
		return false, autoMode, "user-task", userErr
	}
	_, _ = runner.Run(ctx, "schtasks", "/Run", "/TN", taskName)
	return true, autoMode, "user-task", nil
}
@@ -1,111 +0,0 @@
package mesh
import (
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"time"
)
// Client posts mesh messages as JSON over HTTP to a single peer base URL.
type Client struct {
// BaseURL is prepended verbatim to the "/mesh/v1/..." request paths.
BaseURL string
// HTTPClient performs the requests; when nil, http.DefaultClient is used.
HTTPClient *http.Client
}
// NewClient returns a Client for baseURL whose HTTP client uses a
// five-second timeout.
func NewClient(baseURL string) Client {
	httpClient := &http.Client{Timeout: 5 * time.Second}
	return Client{BaseURL: baseURL, HTTPClient: httpClient}
}
// SendHealth POSTs message as JSON to BaseURL + "/mesh/v1/health" and decodes
// the JSON response into a HealthAck. Any non-2xx status is reported as an
// error. On every error path the returned HealthAck is the zero value.
func (c Client) SendHealth(ctx context.Context, message HealthMessage) (HealthAck, error) {
	var none HealthAck

	body, err := json.Marshal(message)
	if err != nil {
		return none, err
	}
	request, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/mesh/v1/health", bytes.NewReader(body))
	if err != nil {
		return none, err
	}
	request.Header.Set("Content-Type", "application/json")

	// Fall back to the shared default client when none was configured.
	doer := http.DefaultClient
	if c.HTTPClient != nil {
		doer = c.HTTPClient
	}
	response, err := doer.Do(request)
	if err != nil {
		return none, err
	}
	defer response.Body.Close()

	if code := response.StatusCode; code < 200 || code >= 300 {
		return none, fmt.Errorf("mesh health rejected with status %d", code)
	}
	var ack HealthAck
	if err := json.NewDecoder(response.Body).Decode(&ack); err != nil {
		return none, err
	}
	return ack, nil
}
// SendSynthetic POSTs envelope as JSON to BaseURL + "/mesh/v1/synthetic/probe"
// and decodes the JSON response into a SyntheticEnvelope. Any non-2xx status
// is reported as an error. On every error path the returned envelope is the
// zero value.
func (c Client) SendSynthetic(ctx context.Context, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
	var none SyntheticEnvelope

	body, err := json.Marshal(envelope)
	if err != nil {
		return none, err
	}
	request, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/mesh/v1/synthetic/probe", bytes.NewReader(body))
	if err != nil {
		return none, err
	}
	request.Header.Set("Content-Type", "application/json")

	// Fall back to the shared default client when none was configured.
	doer := http.DefaultClient
	if c.HTTPClient != nil {
		doer = c.HTTPClient
	}
	response, err := doer.Do(request)
	if err != nil {
		return none, err
	}
	defer response.Body.Close()

	if code := response.StatusCode; code < 200 || code >= 300 {
		return none, fmt.Errorf("mesh synthetic probe rejected with status %d", code)
	}
	var reply SyntheticEnvelope
	if err := json.NewDecoder(response.Body).Decode(&reply); err != nil {
		return none, err
	}
	return reply, nil
}
// SendProduction POSTs envelope as JSON to BaseURL + "/mesh/v1/forward" and
// decodes the JSON response into a ProductionForwardResult. Any non-2xx
// status is reported as an error. On every error path the returned result is
// the zero value.
func (c Client) SendProduction(ctx context.Context, envelope ProductionEnvelope) (ProductionForwardResult, error) {
	var none ProductionForwardResult

	body, err := json.Marshal(envelope)
	if err != nil {
		return none, err
	}
	request, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/mesh/v1/forward", bytes.NewReader(body))
	if err != nil {
		return none, err
	}
	request.Header.Set("Content-Type", "application/json")

	// Fall back to the shared default client when none was configured.
	doer := http.DefaultClient
	if c.HTTPClient != nil {
		doer = c.HTTPClient
	}
	response, err := doer.Do(request)
	if err != nil {
		return none, err
	}
	defer response.Body.Close()

	if code := response.StatusCode; code < 200 || code >= 300 {
		return none, fmt.Errorf("mesh production forward rejected with status %d", code)
	}
	var forwarded ProductionForwardResult
	if err := json.NewDecoder(response.Body).Decode(&forwarded); err != nil {
		return none, err
	}
	return forwarded, nil
}
@@ -70,7 +70,7 @@ const (
FabricServiceChannelReliable = "reliable"
FabricServiceChannelDroppable = "droppable"
MaxProductionEnvelopePayloadBytes = 4096
MaxProductionVPNPacketPayloadBytes = 256 * 1024
MaxProductionVPNPacketPayloadBytes = 8 * 1024 * 1024
MaxProductionEnvelopeFutureSkew = time.Minute
ProductionForwardQUICStreamID = 1
WebIngressForwardQUICStreamID = 2
@@ -203,22 +203,6 @@ type SyntheticRelayQueueMetrics struct {
QueueDepths map[string]int `json:"queue_depths"`
}
// HealthMessage is the JSON body of a mesh health report sent from one peer
// to another.
type HealthMessage struct {
ProtocolVersion string `json:"protocol_version"`
From PeerIdentity `json:"from"`
To PeerIdentity `json:"to"`
ObservedAt time.Time `json:"observed_at"`
LinkStatus string `json:"link_status"`
// LatencyMs and QualityScore are optional: a nil pointer is omitted from the JSON.
LatencyMs *int `json:"latency_ms,omitempty"`
QualityScore *int `json:"quality_score,omitempty"`
}
// HealthAck is the JSON reply to a HealthMessage.
type HealthAck struct {
ProtocolVersion string `json:"protocol_version"`
Accepted bool `json:"accepted"`
// By carries a peer identity; presumably the peer that answered (TODO confirm).
By PeerIdentity `json:"by"`
}
type ProductionEnvelope struct {
FabricProtocolVersion string `json:"fabric_protocol_version"`
MessageID string `json:"message_id"`
@@ -1,6 +1,7 @@
package mesh
import (
"encoding/json"
"sort"
"strings"
"time"
@@ -9,6 +10,9 @@ import (
type EndpointCandidateScoreOptions struct {
ChannelClass string
PreferredRegion string
SiteID string
LocalityGroupID string
LocalNATGroupID string
Now time.Time
MaxVerificationAge time.Duration
Observations map[string]EndpointCandidateHealthObservation
@@ -21,6 +25,7 @@ type EndpointCandidateHealthObservation struct {
EndpointID string `json:"endpoint_id"`
Source string `json:"source,omitempty"`
ReporterNodeID string `json:"reporter_node_id,omitempty"`
ReporterRegion string `json:"reporter_region,omitempty"`
LastLatencyMs int64 `json:"last_latency_ms,omitempty"`
SuccessCount uint64 `json:"success_count,omitempty"`
FailureCount uint64 `json:"failure_count,omitempty"`
@@ -114,6 +119,9 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
case "direct":
score += 30
reasons = append(reasons, "connectivity:direct")
case "private_lan":
score += 36
reasons = append(reasons, "connectivity:private_lan")
case "outbound_only":
score += 5
reasons = append(reasons, "connectivity:outbound_only")
@@ -167,6 +175,7 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
score += 18
reasons = append(reasons, "policy:private-lan")
}
score, reasons = applyLocalityPreferences(candidate, opts, score, reasons)
if hasPolicyTag(candidate.PolicyTags, "costly") {
score -= 10
reasons = append(reasons, "policy:costly")
@@ -193,7 +202,7 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
}
}
if observation, ok := opts.Observations[candidate.EndpointID]; ok {
observationScore, observationReasons := scoreEndpointCandidateObservation(observation, opts)
observationScore, observationReasons := scoreEndpointCandidateObservation(candidate, observation, opts)
score += observationScore
reasons = append(reasons, observationReasons...)
}
@@ -225,7 +234,7 @@ func scoreEndpointCandidateCapacityPressure(pressure EndpointCandidateCapacityPr
return -penalty, []string{"capacity:pressure"}
}
func scoreEndpointCandidateObservation(observation EndpointCandidateHealthObservation, opts EndpointCandidateScoreOptions) (int, []string) {
func scoreEndpointCandidateObservation(candidate PeerEndpointCandidate, observation EndpointCandidateHealthObservation, opts EndpointCandidateScoreOptions) (int, []string) {
score := 0
reasons := []string{"observation:present"}
if !opts.Now.IsZero() && !observation.ObservedAt.IsZero() && opts.MaxObservationAge > 0 {
@@ -236,6 +245,18 @@ func scoreEndpointCandidateObservation(observation EndpointCandidateHealthObserv
score += 6
reasons = append(reasons, "observation:fresh")
}
observationScope := endpointCandidateObservationScope(candidate, observation, opts)
if observationScope != "" {
reasons = append(reasons, "observation_scope:"+observationScope)
}
if endpointRequiresExternalNetworkVerification(candidate) && (observationScope == "self" || observationScope == "same_area") {
reasons = append(reasons, "observation:non_authoritative_same_area_public")
if strings.TrimSpace(observation.LastFailureReason) == "capacity_limited" {
score -= 4
reasons = append(reasons, "capacity:limited")
}
return score, reasons
}
switch {
case observation.LastLatencyMs > 0 && observation.LastLatencyMs <= 50:
score += 24
@@ -286,6 +307,118 @@ func scoreEndpointCandidateObservation(observation EndpointCandidateHealthObserv
return score, reasons
}
// endpointCandidateObservationScope classifies where an observation was made
// relative to the candidate it describes:
//
//   - "self" when the reporter node ID equals the candidate node ID
//     (case-insensitive, both non-empty);
//   - "same_area" / "cross_area" when both a reporter region and a candidate
//     region are known and they match / differ (case-insensitive);
//   - "" when either region is unknown.
//
// If the observation carries no reporter region and its source is
// "local_vpn_fabric_session", opts.PreferredRegion is used as the reporter
// region.
func endpointCandidateObservationScope(candidate PeerEndpointCandidate, observation EndpointCandidateHealthObservation, opts EndpointCandidateScoreOptions) string {
	reporterNode := strings.TrimSpace(observation.ReporterNodeID)
	candidateNode := strings.TrimSpace(candidate.NodeID)
	if reporterNode != "" && candidateNode != "" && strings.EqualFold(reporterNode, candidateNode) {
		return "self"
	}

	reporterArea := strings.TrimSpace(observation.ReporterRegion)
	if reporterArea == "" {
		if strings.EqualFold(strings.TrimSpace(observation.Source), "local_vpn_fabric_session") {
			reporterArea = strings.TrimSpace(opts.PreferredRegion)
		}
	}
	candidateArea := strings.TrimSpace(candidate.Region)

	switch {
	case reporterArea == "" || candidateArea == "":
		return ""
	case strings.EqualFold(reporterArea, candidateArea):
		return "same_area"
	default:
		return "cross_area"
	}
}
// endpointRequiresExternalNetworkVerification reports whether candidate is a
// public endpoint whose metadata declares
// "verification_scope": "external-network-required" (case-insensitive,
// surrounding whitespace ignored).
//
// Missing, malformed, or non-object metadata yields false. json.Unmarshal
// already rejects invalid input, so no separate json.Valid pass is made.
func endpointRequiresExternalNetworkVerification(candidate PeerEndpointCandidate) bool {
	if !strings.EqualFold(strings.TrimSpace(candidate.Reachability), "public") {
		return false
	}
	if len(candidate.Metadata) == 0 {
		return false
	}
	var metadata struct {
		VerificationScope string `json:"verification_scope,omitempty"`
	}
	if err := json.Unmarshal(candidate.Metadata, &metadata); err != nil {
		return false
	}
	return strings.EqualFold(strings.TrimSpace(metadata.VerificationScope), "external-network-required")
}
// applyLocalityPreferences adjusts score by the candidate's locality class
// and appends a matching "locality:<class>" reason. Candidates with no
// recognised locality class are returned unchanged.
//
// Adjustments: local_segment +65, same_nat +45, private_scoped +20,
// private_unscoped -35, private_foreign -90, public_fallback -5.
func applyLocalityPreferences(candidate PeerEndpointCandidate, opts EndpointCandidateScoreOptions, score int, reasons []string) (int, []string) {
	class := endpointCandidateLocality(candidate, opts)
	var delta int
	switch class {
	case "local_segment":
		delta = 65
	case "same_nat":
		delta = 45
	case "private_scoped":
		delta = 20
	case "private_unscoped":
		delta = -35
	case "private_foreign":
		delta = -90
	case "public_fallback":
		delta = -5
	default:
		return score, reasons
	}
	return score + delta, append(reasons, "locality:"+class)
}
// endpointCandidateLocality returns the locality class of candidate relative
// to the local node described by opts.
//
// A candidate counts as private when its reachability is "private", its
// connectivity mode is "private_lan", or endpointHasPrivateHost accepts its
// address. Non-private candidates yield "public_fallback" when they are
// public and require external-network verification, and "" otherwise.
//
// Private candidates are classified by the first rule that matches:
// same locality group -> "local_segment"; same NAT group -> "same_nat";
// same site, or a private-lan / corp-lan / same-site policy tag ->
// "private_scoped"; any locality metadata that matched nothing ->
// "private_foreign"; no locality metadata at all -> "private_unscoped".
func endpointCandidateLocality(candidate PeerEndpointCandidate, opts EndpointCandidateScoreOptions) string {
	reach := strings.ToLower(strings.TrimSpace(candidate.Reachability))
	mode := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
	if reach != "private" && mode != "private_lan" && !endpointHasPrivateHost(candidate.Address) {
		if reach == "public" && endpointRequiresExternalNetworkVerification(candidate) {
			return "public_fallback"
		}
		return ""
	}

	meta := decodeEndpointCandidateLocalityMetadata(candidate.Metadata)
	localGroup := strings.TrimSpace(opts.LocalityGroupID)
	remoteGroup := strings.TrimSpace(meta.LocalityGroupID)
	localNAT := strings.TrimSpace(opts.LocalNATGroupID)
	localSite := strings.TrimSpace(opts.SiteID)

	switch {
	case localGroup != "" && remoteGroup != "" && strings.EqualFold(remoteGroup, localGroup):
		return "local_segment"
	case opts.LocalNATGroupID != "" && meta.NATGroupID != "" && strings.EqualFold(meta.NATGroupID, localNAT):
		return "same_nat"
	case localSite != "" && meta.SiteID != "" && strings.EqualFold(meta.SiteID, localSite):
		return "private_scoped"
	case hasPolicyTag(candidate.PolicyTags, "private-lan") || hasPolicyTag(candidate.PolicyTags, "corp-lan") || hasPolicyTag(candidate.PolicyTags, "same-site"):
		return "private_scoped"
	case meta.LocalityGroupID != "" || meta.SiteID != "" || meta.NATGroupID != "":
		return "private_foreign"
	default:
		return "private_unscoped"
	}
}
// endpointCandidateLocalityMetadata is the subset of an endpoint candidate's
// JSON metadata that identifies where the endpoint lives: its site, locality
// group, and NAT group. Other metadata keys are ignored when decoding.
type endpointCandidateLocalityMetadata struct {
SiteID string `json:"site_id,omitempty"`
LocalityGroupID string `json:"locality_group_id,omitempty"`
NATGroupID string `json:"nat_group_id,omitempty"`
}
// decodeEndpointCandidateLocalityMetadata extracts the locality identifiers
// from raw candidate metadata, trimming surrounding whitespace from each.
//
// Empty input, invalid JSON, and JSON that fails to decode into the expected
// shape all yield the zero value. json.Unmarshal already rejects invalid
// input, so no separate json.Valid pass is made.
func decodeEndpointCandidateLocalityMetadata(raw json.RawMessage) endpointCandidateLocalityMetadata {
	if len(raw) == 0 {
		return endpointCandidateLocalityMetadata{}
	}
	var metadata endpointCandidateLocalityMetadata
	if err := json.Unmarshal(raw, &metadata); err != nil {
		// Discard anything partially decoded before the error.
		return endpointCandidateLocalityMetadata{}
	}
	metadata.SiteID = strings.TrimSpace(metadata.SiteID)
	metadata.LocalityGroupID = strings.TrimSpace(metadata.LocalityGroupID)
	metadata.NATGroupID = strings.TrimSpace(metadata.NATGroupID)
	return metadata
}
func hasPolicyTag(tags []string, needle string) bool {
for _, tag := range tags {
if strings.EqualFold(strings.TrimSpace(tag), needle) {
@@ -1,6 +1,7 @@
package mesh
import (
"encoding/json"
"testing"
"time"
)
@@ -526,6 +527,161 @@ func TestRankPeerEndpointCandidatesSpreadsFreshCapacityPressure(t *testing.T) {
}
}
// TestRankPeerEndpointCandidatesIgnoresSameAreaPublicVerificationFailures
// ranks a public candidate that requires external-network verification using
// a failure observation reported from the candidate's own region. It expects
// the observation to be flagged non-authoritative and the failure reasons
// ("history:failure", "failure:recent") to be absent.
func TestRankPeerEndpointCandidatesIgnoresSameAreaPublicVerificationFailures(t *testing.T) {
now := time.Date(2026, 5, 19, 12, 0, 0, 0, time.UTC)
candidate := PeerEndpointCandidate{
EndpointID: "test-1-public",
NodeID: "test-1",
Transport: "direct_quic",
Address: "quic://94.141.118.222:19191",
Reachability: "public",
NATType: "port_restricted",
ConnectivityMode: "direct",
Region: "home-test",
Priority: 2,
Metadata: json.RawMessage(`{"verification_scope":"external-network-required"}`),
}
ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{candidate}, EndpointCandidateScoreOptions{
PreferredRegion: "home-test",
Now: now,
MaxObservationAge: time.Minute,
Observations: map[string]EndpointCandidateHealthObservation{
"test-1-public": {
EndpointID: "test-1-public",
ReporterNodeID: "home-1",
ReporterRegion: "home-test",
FailureCount: 4,
LastFailureReason: "context_deadline_exceeded",
ReliabilityScore: 20,
ObservedAt: now,
},
},
})
if len(ranked) != 1 {
t.Fatalf("ranked length = %d, want 1", len(ranked))
}
if !containsReason(ranked[0].Reasons, "observation:non_authoritative_same_area_public") {
t.Fatalf("same-area public observation should be non-authoritative: %+v", ranked[0].Reasons)
}
if containsReason(ranked[0].Reasons, "history:failure") || containsReason(ranked[0].Reasons, "failure:recent") {
t.Fatalf("same-area public failures should not demote candidate: %+v", ranked[0].Reasons)
}
}
// TestRankPeerEndpointCandidatesUsesCrossAreaPublicVerificationFailures ranks
// the same kind of public candidate, but with the failure observation
// reported from a different region ("usa" vs "home-test"). It expects the
// "observation_scope:cross_area" reason and both failure reasons to be
// present.
func TestRankPeerEndpointCandidatesUsesCrossAreaPublicVerificationFailures(t *testing.T) {
now := time.Date(2026, 5, 19, 12, 0, 0, 0, time.UTC)
candidate := PeerEndpointCandidate{
EndpointID: "test-1-public",
NodeID: "test-1",
Transport: "direct_quic",
Address: "quic://94.141.118.222:19191",
Reachability: "public",
NATType: "port_restricted",
ConnectivityMode: "direct",
Region: "home-test",
Priority: 2,
Metadata: json.RawMessage(`{"verification_scope":"external-network-required"}`),
}
ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{candidate}, EndpointCandidateScoreOptions{
PreferredRegion: "usa",
Now: now,
MaxObservationAge: time.Minute,
Observations: map[string]EndpointCandidateHealthObservation{
"test-1-public": {
EndpointID: "test-1-public",
ReporterNodeID: "usa-los-1",
ReporterRegion: "usa",
FailureCount: 4,
LastFailureReason: "context_deadline_exceeded",
ReliabilityScore: 20,
ObservedAt: now,
},
},
})
if len(ranked) != 1 {
t.Fatalf("ranked length = %d, want 1", len(ranked))
}
if !containsReason(ranked[0].Reasons, "observation_scope:cross_area") {
t.Fatalf("cross-area scope missing: %+v", ranked[0].Reasons)
}
if !containsReason(ranked[0].Reasons, "history:failure") || !containsReason(ranked[0].Reasons, "failure:recent") {
t.Fatalf("cross-area public failures should demote candidate: %+v", ranked[0].Reasons)
}
}
// TestRankPeerEndpointCandidatesPrefersScopedPrivateLANOverPublic ranks a
// public and a private-LAN endpoint of the same node, where the private
// endpoint's metadata matches the local locality group and NAT group. It
// expects the private endpoint first, with the "locality:local_segment"
// reason.
func TestRankPeerEndpointCandidatesPrefersScopedPrivateLANOverPublic(t *testing.T) {
now := time.Date(2026, 5, 19, 13, 0, 0, 0, time.UTC)
ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_quic",
Address: "quic://94.141.118.222:19191",
Reachability: "public",
ConnectivityMode: "direct",
NATType: "port_restricted",
Priority: 2,
},
{
EndpointID: "node-b-private",
NodeID: "node-b",
Transport: "lan_quic",
Address: "quic://192.168.200.61:19134",
Reachability: "private",
ConnectivityMode: "private_lan",
Priority: 1,
Metadata: json.RawMessage(`{"locality_group_id":"home-test","nat_group_id":"home-router"}`),
},
}, EndpointCandidateScoreOptions{
PreferredRegion: "home-test",
LocalityGroupID: "home-test",
LocalNATGroupID: "home-router",
Now: now,
})
if ranked[0].Candidate.EndpointID != "node-b-private" {
t.Fatalf("top endpoint = %q, want node-b-private: %+v", ranked[0].Candidate.EndpointID, ranked)
}
if !containsReason(ranked[0].Reasons, "locality:local_segment") {
t.Fatalf("missing locality group reason: %+v", ranked[0].Reasons)
}
}
// TestRankPeerEndpointCandidatesPenalizesForeignPrivateEndpoint ranks a
// public endpoint against a private endpoint whose locality and NAT group
// belong to a different site than the local node. It expects the public
// endpoint first and the private one to carry "locality:private_foreign".
func TestRankPeerEndpointCandidatesPenalizesForeignPrivateEndpoint(t *testing.T) {
now := time.Date(2026, 5, 19, 13, 0, 0, 0, time.UTC)
ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_quic",
Address: "quic://94.141.118.222:19191",
Reachability: "public",
ConnectivityMode: "direct",
Priority: 2,
},
{
EndpointID: "node-b-private-foreign",
NodeID: "node-b",
Transport: "lan_quic",
Address: "quic://10.24.10.20:19443",
Reachability: "private",
ConnectivityMode: "private_lan",
Priority: 1,
Metadata: json.RawMessage(`{"locality_group_id":"other-site","nat_group_id":"other-nat"}`),
},
}, EndpointCandidateScoreOptions{
PreferredRegion: "home-test",
LocalityGroupID: "home-test",
LocalNATGroupID: "home-router",
Now: now,
})
if ranked[0].Candidate.EndpointID != "node-b-public" {
t.Fatalf("top endpoint = %q, want node-b-public: %+v", ranked[0].Candidate.EndpointID, ranked)
}
if !containsReason(ranked[1].Reasons, "locality:private_foreign") {
t.Fatalf("missing foreign private reason: %+v", ranked[1].Reasons)
}
}
func containsReason(reasons []string, reason string) bool {
for _, item := range reasons {
if item == reason {
@@ -23,7 +23,7 @@ func FabricTransportTargetFromRegistryEndpoint(endpoint FabricRegistryEndpoint)
return FabricTransportTarget{
EndpointID: strings.TrimSpace(endpoint.EndpointID),
PeerID: strings.TrimSpace(endpoint.EndpointID),
Endpoint: strings.TrimSpace(endpoint.Address),
Endpoint: fabricControlEndpointAddress(endpoint),
Transport: strings.TrimSpace(endpoint.Transport),
PeerCertSHA256: strings.TrimSpace(endpoint.PeerCertSHA256),
Timeout: 5 * time.Second,
@@ -32,6 +32,28 @@ func FabricTransportTargetFromRegistryEndpoint(endpoint FabricRegistryEndpoint)
}
}
// fabricControlEndpointAddress returns the address to dial for a registry
// endpoint. A non-empty "maps_to" string in the endpoint metadata takes
// precedence over endpoint.Address; when it has no "://" scheme separator,
// "quic://" is prepended. Otherwise the trimmed endpoint.Address is returned.
func fabricControlEndpointAddress(endpoint FabricRegistryEndpoint) string {
	mapped := fabricControlMetadataString(endpoint.Metadata, "maps_to")
	switch {
	case mapped == "":
		return strings.TrimSpace(endpoint.Address)
	case strings.Contains(mapped, "://"):
		return mapped
	default:
		return "quic://" + mapped
	}
}
// fabricControlMetadataString looks up key in a JSON object and returns its
// value with surrounding whitespace trimmed. It returns "" when raw is empty,
// is not a JSON object, lacks the key, or holds a non-string value there.
func fabricControlMetadataString(raw json.RawMessage, key string) string {
	if len(raw) == 0 {
		return ""
	}
	var fields map[string]any
	if json.Unmarshal(raw, &fields) != nil {
		return ""
	}
	text, isString := fields[key].(string)
	if !isString {
		return ""
	}
	return strings.TrimSpace(text)
}
func SendFabricControlForward(ctx context.Context, transport FabricTransport, endpoint FabricRegistryEndpoint, payload []byte, timeout time.Duration) (FabricControlForwardResult, error) {
if transport == nil {
return FabricControlForwardResult{}, fmt.Errorf("fabric control transport is unavailable")
@@ -137,7 +137,7 @@ type FabricAdjacency struct {
PressurePercent int
Healthy bool
PassiveOutbound bool
LocalSegmentID string
LocalityGroupID string
NATGroupID string
LastObservedAt time.Time
LastFailureReason string
@@ -0,0 +1,74 @@
package mesh
import (
"context"
"fmt"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// ProbeFabricTarget connects to target, sends one reliable ping frame, and
// returns the time from just before the send until the matching pong (same
// sequence number) arrives. Connection setup is not included in the result.
//
// Unset target fields are defaulted: Timeout to 2s, InboundBuffer and
// ErrorBuffer to 2. The ping/pong exchange is bounded by the target timeout
// in addition to ctx, so a peer that accepts the connection but never answers
// cannot block the caller indefinitely when ctx carries no deadline.
//
// The session and transport are closed before returning. An error is
// returned when the transport cannot be built, the connection or send fails,
// the session reports an error, either session channel closes, or the
// context or probe timeout expires.
func ProbeFabricTarget(ctx context.Context, target FabricTransportTarget) (time.Duration, error) {
	target.Timeout = positiveDurationOr(target.Timeout, 2*time.Second)
	target.InboundBuffer = positiveIntOr(target.InboundBuffer, 2)
	target.ErrorBuffer = positiveIntOr(target.ErrorBuffer, 2)
	transport, normalizedTarget, err := FabricTransportForTarget(target, nil)
	if err != nil {
		return 0, err
	}
	session, err := transport.Connect(ctx, normalizedTarget)
	if err != nil {
		_ = transport.Close()
		return 0, err
	}
	defer func() {
		_ = session.Close()
		_ = transport.Close()
	}()

	// Bound the exchange itself; the earlier of ctx's deadline and this
	// timeout wins.
	probeCtx, cancel := context.WithTimeout(ctx, positiveDurationOr(normalizedTarget.Timeout, target.Timeout))
	defer cancel()

	startedAt := time.Now()
	// The send timestamp doubles as a sequence number to match the pong.
	sequence := uint64(startedAt.UnixNano())
	if err := session.Send(probeCtx, fabricproto.Frame{
		Type:         fabricproto.FramePing,
		TrafficClass: fabricproto.TrafficClassReliable,
		Sequence:     sequence,
		Payload:      []byte("fabric-live-probe"),
	}); err != nil {
		return 0, err
	}
	for {
		select {
		case frame, ok := <-session.Frames():
			if !ok {
				return 0, fmt.Errorf("fabric live probe session closed")
			}
			// Ignore unrelated frames and keep waiting for our pong.
			if frame.Type == fabricproto.FramePong && frame.Sequence == sequence {
				return time.Since(startedAt), nil
			}
		case err, ok := <-session.Errors():
			if !ok {
				return 0, fmt.Errorf("fabric live probe error channel closed")
			}
			if err != nil {
				return 0, err
			}
		case <-probeCtx.Done():
			return 0, probeCtx.Err()
		}
	}
}
// positiveDurationOr returns value when it is strictly positive and fallback
// otherwise.
func positiveDurationOr(value time.Duration, fallback time.Duration) time.Duration {
	if value <= 0 {
		return fallback
	}
	return value
}
// positiveIntOr returns value when it is strictly positive and fallback
// otherwise.
func positiveIntOr(value int, fallback int) int {
	if value <= 0 {
		return fallback
	}
	return value
}
@@ -59,7 +59,7 @@ func StartQUICFabricServer(ctx context.Context, cfg QUICFabricServerConfig) (*QU
if len(tlsConfig.NextProtos) == 0 {
tlsConfig.NextProtos = []string{fabricQUICNextProto}
}
listener, err := quic.ListenAddr(cfg.ListenAddr, tlsConfig, cfg.QUICConfig)
listener, err := quic.ListenAddr(cfg.ListenAddr, tlsConfig, defaultQUICFabricConfig(cfg.QUICConfig))
if err != nil {
return nil, err
}
@@ -132,7 +132,7 @@ func (s *QUICFabricServer) handleConn(ctx context.Context, conn *quic.Conn) {
func (s *QUICFabricServer) handleStream(ctx context.Context, conn *quic.Conn, stream *quic.Stream) {
session := fabricproto.NewSession(fabricproto.SessionConfig{})
sender := quicStreamFrameSender{stream: stream}
sender := &quicStreamFrameSender{stream: stream}
defer func() { _ = stream.Close() }()
s.logFabricSession(FabricSessionEventLogEntry{
Event: "fabric_session_quic_stream_opened",
@@ -207,7 +207,7 @@ type quicStreamFrameSender struct {
mu sync.Mutex
}
func (s quicStreamFrameSender) SendFrame(ctx context.Context, frame fabricproto.Frame) error {
func (s *quicStreamFrameSender) SendFrame(ctx context.Context, frame fabricproto.Frame) error {
if s.stream == nil {
return fmt.Errorf("quic fabric stream is closed")
}
@@ -22,6 +22,9 @@ const fabricQUICNextProto = "rap-fabric-data-session-v1"
const fabricQUICReverseHelloPrefix = "rap-fabric-reverse-hello-v1:"
const defaultQUICFabricConnIdleTTL = 5 * time.Minute
const defaultQUICFabricMaxStreamsPerConn = 64
const defaultQUICFabricHandshakeIdleTimeout = 8 * time.Second
const defaultQUICFabricMaxIdleTimeout = 90 * time.Second
const defaultQUICFabricKeepAlivePeriod = 15 * time.Second
const ErrQUICFabricStreamLimitReached = quicFabricError("quic fabric stream limit reached")
type quicFabricError string
@@ -31,20 +34,20 @@ func (e quicFabricError) Error() string {
}
type QUICFabricTransport struct {
Config *quic.Config
LocalPeerID string
IdleTTL time.Duration
MaxStreamsPerConn int
DialAddr func(context.Context, string, *tls.Config, *quic.Config) (*quic.Conn, error)
mu sync.Mutex
conns map[string]*quicFabricConnEntry
reverseConns map[string]*quicFabricConnEntry
inboundProductionHandler func(context.Context, ProductionEnvelope) (ProductionForwardResult, error)
inboundWebIngressHandler func(context.Context, []byte) ([]byte, error)
Config *quic.Config
LocalPeerID string
IdleTTL time.Duration
MaxStreamsPerConn int
DialAddr func(context.Context, string, *tls.Config, *quic.Config) (*quic.Conn, error)
mu sync.Mutex
conns map[string]*quicFabricConnEntry
reverseConns map[string]*quicFabricConnEntry
inboundProductionHandler func(context.Context, ProductionEnvelope) (ProductionForwardResult, error)
inboundWebIngressHandler func(context.Context, []byte) ([]byte, error)
inboundFabricControlHandler func(context.Context, []byte) ([]byte, error)
inboundSyntheticHandler func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error)
logger FabricSessionEventLogger
stats QUICFabricTransportStats
inboundSyntheticHandler func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error)
logger FabricSessionEventLogger
stats QUICFabricTransportStats
}
type QUICFabricTransportStats struct {
@@ -109,7 +112,25 @@ type quicFabricConnEntry struct {
}
func NewQUICFabricTransport(config *quic.Config) *QUICFabricTransport {
return &QUICFabricTransport{Config: config, IdleTTL: defaultQUICFabricConnIdleTTL, MaxStreamsPerConn: defaultQUICFabricMaxStreamsPerConn, conns: map[string]*quicFabricConnEntry{}, reverseConns: map[string]*quicFabricConnEntry{}}
return &QUICFabricTransport{Config: defaultQUICFabricConfig(config), IdleTTL: defaultQUICFabricConnIdleTTL, MaxStreamsPerConn: defaultQUICFabricMaxStreamsPerConn, conns: map[string]*quicFabricConnEntry{}, reverseConns: map[string]*quicFabricConnEntry{}}
}
// defaultQUICFabricConfig returns a copy of config (or a fresh config when
// config is nil) with the fabric defaults applied to any handshake idle
// timeout, max idle timeout, or keep-alive period that is not positive. The
// caller's config is never modified.
func defaultQUICFabricConfig(config *quic.Config) *quic.Config {
	var effective quic.Config
	if config != nil {
		effective = *config
	}
	if effective.HandshakeIdleTimeout <= 0 {
		effective.HandshakeIdleTimeout = defaultQUICFabricHandshakeIdleTimeout
	}
	if effective.MaxIdleTimeout <= 0 {
		effective.MaxIdleTimeout = defaultQUICFabricMaxIdleTimeout
	}
	if effective.KeepAlivePeriod <= 0 {
		effective.KeepAlivePeriod = defaultQUICFabricKeepAlivePeriod
	}
	return &effective
}
func (t *QUICFabricTransport) SetInboundHandlers(production func(context.Context, ProductionEnvelope) (ProductionForwardResult, error), synthetic func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error), logger FabricSessionEventLogger) {
@@ -150,6 +171,7 @@ func quicTLSConfigForTarget(target FabricTransportTarget) *tls.Config {
expectedFingerprint := normalizeCertSHA256(target.PeerCertSHA256)
config := &tls.Config{NextProtos: []string{fabricQUICNextProto}}
if expectedFingerprint == "" {
config.InsecureSkipVerify = true
return config
}
config.InsecureSkipVerify = true
@@ -198,9 +220,12 @@ func (t *QUICFabricTransport) Connect(ctx context.Context, target FabricTranspor
stream, err := conn.OpenStreamSync(ctx)
if err != nil {
t.releaseStream(connKey)
t.evictConnByKey(connKey, conn)
t.evictConn(target, conn)
if closeConn {
_ = conn.CloseWithError(1, "open stream failed")
} else {
_ = conn.CloseWithError(1, "cached stream open failed")
}
return nil, err
}
@@ -680,8 +705,28 @@ func (t *QUICFabricTransport) evictConn(target FabricTransportTarget, conn *quic
t.mu.Unlock()
}
// evictConnByKey removes the cached connection entry stored under key, but
// only if that entry still holds conn; an entry that has since been replaced
// by a different connection is left alone.
//
// Keys starting with "reverse\x00" address the reverseConns map (the rest of
// the key is the peer ID); every other key addresses conns directly. A
// successful removal increments stats.ClosedEvicted. The connection itself is
// not closed here. Calls with a nil receiver, empty key or nil conn are no-ops.
func (t *QUICFabricTransport) evictConnByKey(key string, conn *quic.Conn) {
	if t == nil || conn == nil || key == "" {
		return
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	if peerID, isReverse := strings.CutPrefix(key, "reverse\x00"); isReverse {
		cached := t.reverseConns[peerID]
		if cached == nil || cached.conn != conn {
			return
		}
		delete(t.reverseConns, peerID)
		t.stats.ClosedEvicted++
		return
	}

	cached := t.conns[key]
	if cached == nil || cached.conn != conn {
		return
	}
	delete(t.conns, key)
	t.stats.ClosedEvicted++
}
func (t *QUICFabricTransport) pruneIdleLocked(now time.Time) {
if t == nil || len(t.conns) == 0 {
if t == nil {
return
}
ttl := t.IdleTTL
@@ -897,7 +942,13 @@ func (s *quicFabricSession) Send(ctx context.Context, frame fabricproto.Frame) e
s.writeMu.Lock()
defer s.writeMu.Unlock()
s.applyWriteDeadline(ctx)
return fabricproto.WriteFrame(s.stream, frame)
if err := fabricproto.WriteFrame(s.stream, frame); err != nil {
if s.transport != nil && s.conn != nil {
s.transport.evictConnByKey(s.connKey, s.conn)
}
return err
}
return nil
}
func (s *quicFabricSession) Frames() <-chan fabricproto.Frame {
@@ -21,7 +21,7 @@ const (
type FabricRoutePlannerConfig struct {
ClusterID string
LocalNodeID string
LocalSegmentID string
LocalityGroupID string
LocalNATGroupID string
DefaultCapacity int
RelayCapacity int
@@ -34,13 +34,13 @@ type FabricRoutePlannerConfig struct {
}
type FabricCandidateMetadata struct {
LocalSegmentID string `json:"local_segment_id,omitempty"`
NATGroupID string `json:"nat_group_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
ViaNodeID string `json:"via_node_id,omitempty"`
STUNServer string `json:"stun_server,omitempty"`
ICEFoundation string `json:"ice_foundation,omitempty"`
LocalityGroupID string `json:"locality_group_id,omitempty"`
NATGroupID string `json:"nat_group_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
ViaNodeID string `json:"via_node_id,omitempty"`
STUNServer string `json:"stun_server,omitempty"`
ICEFoundation string `json:"ice_foundation,omitempty"`
}
func FabricRouteSetForPeerEndpointCandidates(targetNodeID string, candidates []PeerEndpointCandidate, cfg FabricRoutePlannerConfig) FabricRouteSet {
@@ -141,7 +141,7 @@ func fabricRouteModeForPeerEndpointCandidate(candidate PeerEndpointCandidate, me
}
reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability))
connectivity := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
if sameLocalSegment(metadata, cfg) || sameNATGroup(metadata, cfg) {
if sameLocalityGroup(metadata, cfg) || sameNATGroup(metadata, cfg) {
return FabricRouteLAN
}
if reachability == FabricCandidateReachabilityRelay || connectivity == FabricConnectivityRelayRequired || strings.TrimSpace(metadata.RelayEndpoint) != "" {
@@ -240,12 +240,12 @@ func candidatePressureCount(endpointID string, cfg FabricRoutePlannerConfig) int
return 0
}
func sameLocalSegment(metadata FabricCandidateMetadata, cfg FabricRoutePlannerConfig) bool {
localSegment := strings.TrimSpace(cfg.LocalSegmentID)
if localSegment == "" {
func sameLocalityGroup(metadata FabricCandidateMetadata, cfg FabricRoutePlannerConfig) bool {
localityGroup := strings.TrimSpace(cfg.LocalityGroupID)
if localityGroup == "" {
return false
}
return strings.EqualFold(strings.TrimSpace(metadata.LocalSegmentID), localSegment)
return strings.EqualFold(strings.TrimSpace(metadata.LocalityGroupID), localityGroup)
}
func sameNATGroup(metadata FabricCandidateMetadata, cfg FabricRoutePlannerConfig) bool {
@@ -7,7 +7,7 @@ import (
)
func TestFabricRouteSetForPeerEndpointCandidatesPrefersLocalLAN(t *testing.T) {
metadata, _ := json.Marshal(FabricCandidateMetadata{LocalSegmentID: "site-a", NATGroupID: "nat-a"})
metadata, _ := json.Marshal(FabricCandidateMetadata{LocalityGroupID: "home-lan", NATGroupID: "nat-a"})
routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{
{
EndpointID: "node-b-public",
@@ -31,7 +31,7 @@ func TestFabricRouteSetForPeerEndpointCandidatesPrefersLocalLAN(t *testing.T) {
}, FabricRoutePlannerConfig{
ClusterID: "cluster-1",
LocalNodeID: "node-a",
LocalSegmentID: "site-a",
LocalityGroupID: "home-lan",
DefaultCapacity: 200,
Now: time.Unix(100, 0).UTC(),
})
@@ -172,7 +172,7 @@ func TestFabricRouteSetForPeerEndpointCandidatesRejectsNonQUIC(t *testing.T) {
ConnectivityMode: "direct",
},
{
EndpointID: "node-b-legacy-relay",
EndpointID: "node-b-compat-relay",
NodeID: "node-b",
Transport: "relay",
Address: "quic://node-r:19443",
@@ -180,7 +180,7 @@ func TestFabricRouteSetForPeerEndpointCandidatesRejectsNonQUIC(t *testing.T) {
ConnectivityMode: "relay_required",
},
{
EndpointID: "node-b-legacy-reverse",
EndpointID: "node-b-compat-reverse",
NodeID: "node-b",
Transport: "outbound_reverse",
Address: "quic://node-b:19443",
@@ -4,7 +4,6 @@ import (
"context"
"crypto/tls"
"fmt"
"net/http"
"strings"
"time"
@@ -30,7 +29,6 @@ type FabricTransportTarget struct {
Endpoint string
Transport string
Token string
Header http.Header
TLSConfig *tls.Config
PeerCertSHA256 string
Timeout time.Duration
@@ -11,6 +11,8 @@ const DefaultWarmPeerLimit = 8
type PeerCacheConfig struct {
Local PeerIdentity
LocalityGroupID string
LocalNATGroupID string
PeerEndpoints map[string]string
PeerEndpointCandidates map[string][]PeerEndpointCandidate
PeerEndpointObservations map[string]EndpointCandidateHealthObservation
@@ -59,11 +61,12 @@ type PeerCacheEntry struct {
BestCandidateScore int `json:"best_candidate_score,omitempty"`
BestScoreReasons []string `json:"best_score_reasons,omitempty"`
BestPeerCertSHA256 string `json:"best_peer_cert_sha256,omitempty"`
PublicIngressCount int `json:"public_ingress_count,omitempty"`
EndpointCandidates []PeerEndpointCandidate `json:"endpoint_candidates,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
RelayControl bool `json:"relay_control"`
RelayQUIC bool `json:"relay_quic"`
}
type peerCacheBuildEntry struct {
@@ -119,6 +122,8 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
scored := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
ChannelClass: SyntheticChannelFabricControl,
PreferredRegion: cfg.PreferredRegion,
LocalityGroupID: cfg.LocalityGroupID,
LocalNATGroupID: cfg.LocalNATGroupID,
Now: now,
MaxVerificationAge: time.Hour,
Observations: cfg.PeerEndpointObservations,
@@ -129,6 +134,7 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
for _, scoredCandidate := range scored {
entry.EndpointCandidates = append(entry.EndpointCandidates, scoredCandidate.Candidate)
}
entry.PublicIngressCount = publicIngressCountFromCandidates(entry.EndpointCandidates)
entry.BestCandidateID = scored[0].Candidate.EndpointID
entry.BestCandidateAddr = scored[0].Candidate.Address
entry.BestTransport = scored[0].Candidate.Transport
@@ -197,9 +203,9 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
entry.RendezvousLeaseID = lease.LeaseID
entry.RelayNodeID = lease.RelayNodeID
entry.RelayEndpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
entry.RelayControl = true
entry.RelayQUIC = true
entry.CandidateCount = maxInt(entry.CandidateCount, 1)
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{firstNonEmpty(lease.ConnectivityMode, "relay_required"), "relay_control"})
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{firstNonEmpty(lease.ConnectivityMode, "relay_required"), "relay_quic"})
if useLeaseEndpoint {
if localRelay {
entry.BestTransport = "reverse_quic"
@@ -225,7 +231,7 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
entry.Endpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
}
entry.EndpointCount = maxInt(entry.EndpointCount, 1)
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{"relay_control"})
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{"relay_quic"})
}
}
out := make([]peerCacheBuildEntry, 0, len(entries))
@@ -334,13 +340,37 @@ func warmPeerPriority(entry peerCacheBuildEntry) int {
if entry.bestScore > 0 {
score += entry.bestScore
}
if entry.RelayControl {
if entry.RelayQUIC {
score += 300
}
if entry.PublicIngressCount > 0 {
score += entry.PublicIngressCount * 75
}
score += entry.CandidateCount
return score
}
// publicIngressCountFromCandidates counts the distinct addresses among
// candidates that qualify as public QUIC ingress points.
//
// A candidate qualifies when its Reachability equals "public" and its
// Transport contains "quic" (both compared after trimming and lower-casing)
// and its trimmed Address is non-empty. Addresses are de-duplicated by their
// trimmed text, compared case-sensitively.
func publicIngressCountFromCandidates(candidates []PeerEndpointCandidate) int {
	seen := make(map[string]struct{}, len(candidates))
	for i := range candidates {
		c := &candidates[i]
		reachability := strings.ToLower(strings.TrimSpace(c.Reachability))
		transport := strings.ToLower(strings.TrimSpace(c.Transport))
		if reachability != "public" || !strings.Contains(transport, "quic") {
			continue
		}
		if addr := strings.TrimSpace(c.Address); addr != "" {
			seen[addr] = struct{}{}
		}
	}
	return len(seen)
}
func warmPeerReason(entry peerCacheBuildEntry) string {
if entry.adjacentRoutePeer {
return "route_adjacent"
@@ -348,7 +378,7 @@ func warmPeerReason(entry peerCacheBuildEntry) string {
if entry.RecoverySeed {
return "recovery_seed"
}
if entry.RelayControl {
if entry.RelayQUIC {
return "rendezvous_lease"
}
if entry.BestCandidateID != "" {
@@ -98,6 +98,9 @@ func TestPeerCacheUsesBestEndpointCandidate(t *testing.T) {
if entry.BestCandidateID != "node-b-public" || !entry.Warm {
t.Fatalf("unexpected candidate selection: %+v", entry)
}
if entry.PublicIngressCount != 1 {
t.Fatalf("public ingress count = %d, want 1", entry.PublicIngressCount)
}
}
func TestPeerCacheAppliesEndpointHealthObservations(t *testing.T) {
@@ -224,3 +227,12 @@ func peerCacheEntryByID(snapshot PeerCacheSnapshot, nodeID string) (PeerCacheEnt
}
return PeerCacheEntry{}, false
}
// containsString reports whether want occurs in values, using exact
// (case-sensitive) string equality. A nil or empty slice yields false.
func containsString(values []string, want string) bool {
	for i := 0; i < len(values); i++ {
		if values[i] == want {
			return true
		}
	}
	return false
}
@@ -21,7 +21,7 @@ const (
PeerTransportModeCorporateLAN = "corporate_lan"
PeerTransportModeOutboundOnly = "outbound_only"
PeerTransportModeRelayRequired = "relay_required"
PeerTransportModeRelayControl = "relay_control"
PeerTransportModeRelayQUIC = "relay_quic"
PeerTransportModeUnknown = "unknown"
)
@@ -44,7 +44,7 @@ type PeerConnectionIntentPlan struct {
CorporateLANCount int `json:"corporate_lan_count"`
OutboundOnlyCount int `json:"outbound_only_count"`
RelayRequiredCount int `json:"relay_required_count"`
RelayControlCount int `json:"relay_control_count"`
RelayQUICCount int `json:"relay_quic_count"`
RendezvousRequiredCount int `json:"rendezvous_required_count"`
RendezvousResolvedCount int `json:"rendezvous_resolved_count"`
RendezvousLeaseCount int `json:"rendezvous_lease_count"`
@@ -113,8 +113,8 @@ func PlanPeerConnectionIntents(cfg PeerConnectionIntentPlanConfig) PeerConnectio
RendezvousLeaseID: entry.RendezvousLeaseID,
RelayNodeID: entry.RelayNodeID,
RelayEndpoint: entry.RelayEndpoint,
RelayCandidate: entry.RelayControl,
ControlPlaneOnly: entry.RelayControl,
RelayCandidate: entry.RelayQUIC,
ControlPlaneOnly: entry.RelayQUIC,
RecoverySeed: candidate.RecoverySeed || entry.RecoverySeed,
Priority: candidate.Priority,
GeneratedAt: now,
@@ -163,8 +163,8 @@ func PlanPeerConnectionIntents(cfg PeerConnectionIntentPlanConfig) PeerConnectio
plan.OutboundOnlyCount++
case PeerTransportModeRelayRequired:
plan.RelayRequiredCount++
case PeerTransportModeRelayControl:
plan.RelayControlCount++
case PeerTransportModeRelayQUIC:
plan.RelayQUICCount++
}
if intent.RequiresRendezvous {
plan.RendezvousRequiredCount++
@@ -266,7 +266,7 @@ func applyRendezvousLease(intent *PeerConnectionIntent, lease PeerRendezvousLeas
} else {
intent.Transport = firstNonEmpty(lease.Transport, "relay_quic")
}
intent.TransportMode = PeerTransportModeRelayControl
intent.TransportMode = PeerTransportModeRelayQUIC
intent.RequiresRendezvous = false
intent.RendezvousResolved = true
intent.DirectCandidate = false
@@ -170,11 +170,11 @@ func TestPeerConnectionIntentsResolveRendezvousWithRelayLease(t *testing.T) {
Now: now,
})
if plan.IntentCount != 1 || plan.RelayControlCount != 1 || plan.RendezvousResolvedCount != 1 || plan.RendezvousRequiredCount != 0 {
if plan.IntentCount != 1 || plan.RelayQUICCount != 1 || plan.RendezvousResolvedCount != 1 || plan.RendezvousRequiredCount != 0 {
t.Fatalf("unexpected relay-control plan counts: %+v", plan)
}
intent := plan.Intents[0]
if intent.TransportMode != PeerTransportModeRelayControl ||
if intent.TransportMode != PeerTransportModeRelayQUIC ||
intent.Endpoint != "quic://node-r:19443" ||
intent.RelayNodeID != "node-r" ||
intent.RendezvousLeaseID != "lease-node-b-via-node-r" ||
@@ -239,7 +239,7 @@ func TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect(t *testing.T
Now: now,
})
if plan.RendezvousResolvedCount != 1 || plan.RelayControlCount != 1 || plan.RendezvousRequiredCount != 0 {
if plan.RendezvousResolvedCount != 1 || plan.RelayQUICCount != 1 || plan.RendezvousRequiredCount != 0 {
t.Fatalf("unexpected reselected plan counts: %+v", plan)
}
intent := plan.Intents[0]
@@ -3,7 +3,6 @@ package mesh
import (
"context"
"fmt"
"net/http"
"strings"
"sync"
"time"
@@ -25,7 +24,6 @@ type PeerConnectionManagerConfig struct {
PeerCache *PeerCache
Tracker *PeerConnectionTracker
RendezvousLeases []PeerRendezvousLease
HTTPClient *http.Client
QUICTransport *QUICFabricTransport
PreferredRegion string
ProbeTimeout time.Duration
@@ -37,7 +35,6 @@ type PeerConnectionManager struct {
peerCache *PeerCache
tracker *PeerConnectionTracker
rendezvousLeases []PeerRendezvousLease
httpClient *http.Client
quicTransport *QUICFabricTransport
preferredRegion string
probeTimeout time.Duration
@@ -60,7 +57,7 @@ type PeerConnectionManagerCycle struct {
Skipped int `json:"skipped"`
RendezvousRequiredCount int `json:"rendezvous_required_count"`
RendezvousResolvedCount int `json:"rendezvous_resolved_count"`
RelayControlCount int `json:"relay_control_count"`
RelayQUICCount int `json:"relay_quic_count"`
RecoveryPlan PeerRecoveryPlan `json:"recovery_plan"`
IntentPlan PeerConnectionIntentPlan `json:"intent_plan"`
Results []PeerConnectionProbeResult `json:"results,omitempty"`
@@ -117,17 +114,6 @@ func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionMa
if probeTimeout <= 0 {
probeTimeout = DefaultPeerConnectionProbeTimeout
}
httpClient := cfg.HTTPClient
if httpClient == nil {
httpClient = &http.Client{
Transport: &http.Transport{
MaxIdleConns: 64,
MaxIdleConnsPerHost: 8,
IdleConnTimeout: 90 * time.Second,
},
Timeout: probeTimeout + time.Second,
}
}
now := cfg.Now
if now == nil {
now = func() time.Time { return time.Now().UTC() }
@@ -137,7 +123,6 @@ func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionMa
peerCache: cfg.PeerCache,
tracker: cfg.Tracker,
rendezvousLeases: append([]PeerRendezvousLease{}, cfg.RendezvousLeases...),
httpClient: httpClient,
quicTransport: cfg.QUICTransport,
preferredRegion: strings.TrimSpace(cfg.PreferredRegion),
probeTimeout: probeTimeout,
@@ -157,6 +142,7 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
Connections: m.tracker.Snapshot(),
TargetReadyPeers: DefaultStablePeerTarget,
MaxProbeCandidates: DefaultRecoveryProbeLimit,
PreferredRegion: m.preferredRegion,
Now: startedAt,
})
intentPlan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
@@ -177,7 +163,7 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
IntentCount: intentPlan.IntentCount,
RendezvousRequiredCount: intentPlan.RendezvousRequiredCount,
RendezvousResolvedCount: intentPlan.RendezvousResolvedCount,
RelayControlCount: intentPlan.RelayControlCount,
RelayQUICCount: intentPlan.RelayQUICCount,
RecoveryPlan: recoveryPlan,
IntentPlan: intentPlan,
Results: make([]PeerConnectionProbeResult, 0, len(intentPlan.Intents)),
@@ -270,7 +256,7 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
RendezvousLeaseID: intent.RendezvousLeaseID,
RelayNodeID: intent.RelayNodeID,
RelayEndpoint: intent.RelayEndpoint,
RelayControl: intent.RelayCandidate,
RelayQUIC: intent.RelayCandidate,
BestPeerCertSHA256: firstNonEmpty(intent.BestPeerCertSHA256, cacheEntry.BestPeerCertSHA256),
}
if intent.RequiresRendezvous {
@@ -385,7 +371,7 @@ func peerConnectionProbeTargetNodeID(intent PeerConnectionIntent, localNodeID st
func (m *PeerConnectionManager) probePeerTarget(ctx context.Context, probePeer PeerCacheEntry, target PeerIdentity) error {
endpoint := strings.TrimRight(strings.TrimSpace(probePeer.Endpoint), "/")
transport := strings.TrimSpace(probePeer.BestTransport)
if hasLegacyEndpointScheme(endpoint) {
if hasUnsupportedEndpointScheme(endpoint) {
return fmt.Errorf("non_quic_probe_rejected")
}
if peerConnectionTargetIsQUIC(transport, endpoint) {
@@ -445,7 +431,7 @@ func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCach
}
add(candidate.EndpointID, candidate.Address, candidate.Transport, candidatePeerCertSHA256(candidate))
}
add(intent.BestCandidateID, intent.Endpoint, intent.Transport, cacheEntry.BestPeerCertSHA256)
add(intent.BestCandidateID, intent.Endpoint, intent.Transport, intent.BestPeerCertSHA256)
return out
}
@@ -455,7 +441,7 @@ func peerConnectionShouldProbeDirectUpgrade(intent PeerConnectionIntent, cacheEn
}
if strings.TrimSpace(intent.ConnectionState) != PeerConnectionRelayReady &&
!intent.RelayCandidate &&
strings.TrimSpace(intent.TransportMode) != PeerTransportModeRelayControl {
strings.TrimSpace(intent.TransportMode) != PeerTransportModeRelayQUIC {
return false
}
for _, candidate := range cacheEntry.EndpointCandidates {
@@ -509,8 +495,3 @@ func (m *PeerConnectionManager) connectionState(nodeID string) PeerConnectionSta
}
return PeerConnectionState{NodeID: nodeID, State: PeerConnectionDisconnected}
}
func (c Client) withHTTPClient(httpClient *http.Client) Client {
c.HTTPClient = httpClient
return c
}
@@ -3,7 +3,6 @@ package mesh
import (
"context"
"encoding/json"
"net/http"
"testing"
"time"
)
@@ -90,7 +89,7 @@ func TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff(t *testin
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpoints: map[string]string{
"node-b": "http://127.0.0.1:1",
"node-b": "quic://127.0.0.1:1",
},
WarmPeerLimit: 1,
Now: now,
@@ -100,7 +99,6 @@ func TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff(t *testin
Local: local,
PeerCache: cache,
Tracker: tracker,
HTTPClient: &http.Client{Timeout: 20 * time.Millisecond},
ProbeTimeout: 20 * time.Millisecond,
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
@@ -121,7 +119,7 @@ func TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff(t *testin
}
}
func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
func TestPeerConnectionManagerProbesRelayQUICLease(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
current := now
tlsConfig := testQUICTLSConfig(t)
@@ -188,7 +186,7 @@ func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
if cycle.Attempted != 1 ||
cycle.Succeeded != 1 ||
cycle.Deferred != 0 ||
cycle.RelayControlCount != 1 ||
cycle.RelayQUICCount != 1 ||
cycle.RendezvousResolvedCount != 1 ||
cycle.RendezvousRequiredCount != 0 {
t.Fatalf("unexpected relay-control cycle: %+v", cycle)
@@ -227,11 +225,11 @@ func TestPeerConnectionProbeTargetsFallsBackToBestPeerCertSHA256(t *testing.T) {
BestPeerCertSHA256: "intent-cert",
}
cacheEntry := PeerCacheEntry{
NodeID: "node-b",
BestPeerCertSHA256: "cache-cert",
BestCandidateID: "node-b-best",
BestTransport: "direct_quic",
Endpoint: "quic://94.141.118.222:19199",
NodeID: "node-b",
BestPeerCertSHA256: "cache-cert",
BestCandidateID: "node-b-best",
BestTransport: "direct_quic",
Endpoint: "quic://94.141.118.222:19199",
EndpointCandidates: []PeerEndpointCandidate{
{
EndpointID: "node-b-public",
@@ -259,6 +257,49 @@ func TestPeerConnectionProbeTargetsFallsBackToBestPeerCertSHA256(t *testing.T) {
}
}
// TestPeerConnectionProbeTargetsUsesRelayLeaseCertForRelayEndpoint checks that
// the probe target built from the intent's relay endpoint carries the
// certificate fingerprint recorded on the intent ("relay-cert"), not the
// fingerprint stored on the cache entry ("direct-cert").
func TestPeerConnectionProbeTargetsUsesRelayLeaseCertForRelayEndpoint(t *testing.T) {
	const relayEndpoint = "quic://195.123.240.88:19131"

	// Relay-backed intent: its cert differs from the cached direct cert.
	intent := PeerConnectionIntent{
		NodeID:             "node-b",
		BestCandidateID:    "lease-node-b-via-node-r",
		Endpoint:           relayEndpoint,
		Transport:          "relay_quic",
		BestPeerCertSHA256: "relay-cert",
		RelayCandidate:     true,
		ConnectionState:    PeerConnectionBackoff,
	}

	// The cache knows one private direct candidate with its own cert.
	privateCandidate := PeerEndpointCandidate{
		EndpointID:       "node-b-private",
		NodeID:           "node-b",
		Transport:        "direct_quic",
		Address:          "quic://192.168.200.61:19132",
		Reachability:     "private",
		ConnectivityMode: "private_lan",
		Priority:         1,
		Metadata:         peerConnectionProbeMetadata(t, "direct-cert"),
	}
	cacheEntry := PeerCacheEntry{
		NodeID:             "node-b",
		BestPeerCertSHA256: "direct-cert",
		EndpointCandidates: []PeerEndpointCandidate{privateCandidate},
	}

	targets := peerConnectionProbeTargets(intent, cacheEntry)
	if got := len(targets); got != 2 {
		t.Fatalf("target count = %d, want 2", got)
	}
	for _, target := range targets {
		if target.Endpoint == relayEndpoint {
			if got := target.PeerCertSHA256; got != "relay-cert" {
				t.Fatalf("relay endpoint cert = %q, want relay-cert", got)
			}
			return
		}
	}
	t.Fatalf("relay endpoint target not found: %+v", targets)
}
func TestPeerConnectionProbeTargetsUpgradeRelayReadyPeerToDirectQUIC(t *testing.T) {
now := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC)
current := now
@@ -36,7 +36,7 @@ type PeerConnectionState struct {
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
RelayControl bool `json:"relay_control"`
RelayQUIC bool `json:"relay_quic"`
ConsecutiveSuccesses int `json:"consecutive_successes"`
ConsecutiveFailures int `json:"consecutive_failures"`
LastLatencyMs int `json:"last_latency_ms,omitempty"`
@@ -287,7 +287,7 @@ func (t *PeerConnectionTracker) entry(peer PeerCacheEntry, now time.Time) PeerCo
entry.RendezvousLeaseID = peer.RendezvousLeaseID
entry.RelayNodeID = peer.RelayNodeID
entry.RelayEndpoint = peer.RelayEndpoint
entry.RelayControl = peer.RelayControl
entry.RelayQUIC = peer.RelayQUIC
return entry
}
@@ -21,6 +21,7 @@ type PeerRecoveryPlanConfig struct {
Connections PeerConnectionSnapshot
TargetReadyPeers int
MaxProbeCandidates int
PreferredRegion string
Now time.Time
}
@@ -42,6 +43,7 @@ type PeerRecoveryPlan struct {
type PeerRecoveryCandidate struct {
NodeID string `json:"node_id"`
Endpoint string `json:"endpoint,omitempty"`
Region string `json:"region,omitempty"`
Warm bool `json:"warm"`
WarmReason string `json:"warm_reason,omitempty"`
RecoverySeed bool `json:"recovery_seed"`
@@ -57,6 +59,7 @@ type PeerRecoveryCandidate struct {
type peerRecoveryCandidateBuild struct {
PeerRecoveryCandidate
PublicIngressCount int
}
func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
@@ -96,6 +99,7 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
ready := 0
degraded := 0
backoff := 0
readyExternalRegions := map[string]struct{}{}
for nodeID, connection := range connectionByNode {
entry, ok := entryByNode[nodeID]
if !ok || strings.TrimSpace(entry.Endpoint) == "" {
@@ -104,6 +108,10 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
switch connection.State {
case PeerConnectionReady:
ready++
region := strings.TrimSpace(entry.BestRegion)
if region != "" && (strings.TrimSpace(cfg.PreferredRegion) == "" || !strings.EqualFold(region, cfg.PreferredRegion)) {
readyExternalRegions[strings.ToLower(region)] = struct{}{}
}
case PeerConnectionRelayReady:
// Relay-ready peers remain valuable for control-plane reachability,
// but they do not satisfy the target for direct-ready transport paths.
@@ -125,6 +133,7 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
if mode == PeerRecoveryModeSteady {
limit = target
}
missingExternalRegions := missingPeerRecoveryExternalRegions(cfg.PeerCache, cfg.PreferredRegion, readyExternalRegions, target)
candidates := make([]peerRecoveryCandidateBuild, 0, len(cfg.PeerCache.Entries))
for _, entry := range cfg.PeerCache.Entries {
@@ -138,13 +147,14 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
if connection.State == PeerConnectionBackoff && connection.BackoffUntil.After(now) {
continue
}
reason, ok := peerRecoveryCandidateReason(mode, entry, connection)
reason, ok := peerRecoveryCandidateReason(mode, entry, connection, missingExternalRegions, cfg.PreferredRegion)
if !ok {
continue
}
candidate := PeerRecoveryCandidate{
NodeID: entry.NodeID,
Endpoint: strings.TrimSpace(entry.Endpoint),
Region: strings.TrimSpace(entry.BestRegion),
Warm: entry.Warm,
WarmReason: entry.WarmReason,
RecoverySeed: entry.RecoverySeed,
@@ -155,9 +165,12 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
LastLatencyMs: connection.LastLatencyMs,
BackoffUntil: connection.BackoffUntil,
Reason: reason,
Priority: peerRecoveryCandidatePriority(entry, connection, reason),
Priority: peerRecoveryCandidatePriority(entry, connection, reason, cfg.PreferredRegion),
}
candidates = append(candidates, peerRecoveryCandidateBuild{PeerRecoveryCandidate: candidate})
candidates = append(candidates, peerRecoveryCandidateBuild{
PeerRecoveryCandidate: candidate,
PublicIngressCount: entry.PublicIngressCount,
})
}
sort.SliceStable(candidates, func(i, j int) bool {
if candidates[i].Priority != candidates[j].Priority {
@@ -166,7 +179,7 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
return candidates[i].NodeID < candidates[j].NodeID
})
if len(candidates) > limit {
candidates = candidates[:limit]
candidates = trimPeerRecoveryCandidates(candidates, limit, cfg.PreferredRegion)
}
outCandidates := make([]PeerRecoveryCandidate, 0, len(candidates))
@@ -194,11 +207,143 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
}
}
func peerRecoveryCandidateReason(mode string, entry PeerCacheEntry, connection PeerConnectionState) (string, bool) {
// missingPeerRecoveryExternalRegions returns the set of external regions
// (lower-cased) that appear in snapshot but are absent from
// readyExternalRegions, or nil when no further external coverage is wanted.
//
// A region is "external" when its trimmed BestRegion is non-empty and does not
// case-insensitively equal the trimmed preferredRegion. The wanted number of
// ready external regions is the number of known external regions, capped at 2
// and, when target is positive, at target. If readyExternalRegions already has
// at least that many entries, nil is returned. Keys of readyExternalRegions
// are compared as-is against the lower-cased region names.
func missingPeerRecoveryExternalRegions(snapshot PeerCacheSnapshot, preferredRegion string, readyExternalRegions map[string]struct{}, target int) map[string]struct{} {
	preferredRegion = strings.TrimSpace(preferredRegion)

	known := map[string]struct{}{}
	for i := range snapshot.Entries {
		region := strings.TrimSpace(snapshot.Entries[i].BestRegion)
		isPreferred := preferredRegion != "" && strings.EqualFold(region, preferredRegion)
		if region == "" || isPreferred {
			continue
		}
		known[strings.ToLower(region)] = struct{}{}
	}
	if len(known) == 0 {
		return nil
	}

	wanted := len(known)
	if wanted > 2 {
		wanted = 2
	}
	if target > 0 && target < wanted {
		wanted = target
	}
	if len(readyExternalRegions) >= wanted {
		return nil
	}

	// Allocate lazily so that "nothing missing" naturally yields nil.
	var missing map[string]struct{}
	for region := range known {
		if _, ready := readyExternalRegions[region]; ready {
			continue
		}
		if missing == nil {
			missing = map[string]struct{}{}
		}
		missing[region] = struct{}{}
	}
	return missing
}
// trimPeerRecoveryCandidates cuts candidates down to at most limit entries
// while trying to keep region diversity.
//
// The input is returned untouched when limit is non-positive or already
// satisfied. When fewer than two distinct external regions (non-empty Region
// not equal, case-insensitively, to preferredRegion) are present, the first
// limit candidates are kept as-is. Otherwise selection runs in three passes
// over the input order:
//  1. the first candidate of each distinct external region;
//  2. if room remains and nothing selected so far has public ingress, the
//     first not-yet-selected candidate that has public ingress;
//  3. remaining candidates in input order until limit is reached.
//
// Every append is guarded by the limit, so the result never exceeds it. The
// result order follows selection order, not the input order.
func trimPeerRecoveryCandidates(candidates []peerRecoveryCandidateBuild, limit int, preferredRegion string) []peerRecoveryCandidateBuild {
	if limit <= 0 || len(candidates) <= limit {
		return candidates
	}
	preferredRegion = strings.TrimSpace(preferredRegion)

	// externalKey yields the lower-cased region of c, or ok=false when the
	// region is empty or matches the preferred region.
	externalKey := func(c peerRecoveryCandidateBuild) (key string, ok bool) {
		region := strings.TrimSpace(c.Region)
		if region == "" {
			return "", false
		}
		if preferredRegion != "" && strings.EqualFold(region, preferredRegion) {
			return "", false
		}
		return strings.ToLower(region), true
	}

	distinctExternal := map[string]struct{}{}
	for _, c := range candidates {
		if key, ok := externalKey(c); ok {
			distinctExternal[key] = struct{}{}
		}
	}
	if len(distinctExternal) < 2 {
		return candidates[:limit]
	}

	picked := make([]peerRecoveryCandidateBuild, 0, limit)
	pickedNodes := map[string]struct{}{}
	pickedRegions := map[string]struct{}{}

	// Pass 1: one candidate per external region, in input order.
	for _, c := range candidates {
		if len(picked) >= limit {
			break
		}
		key, ok := externalKey(c)
		if !ok {
			continue
		}
		if _, dup := pickedRegions[key]; dup {
			continue
		}
		picked = append(picked, c)
		pickedNodes[c.NodeID] = struct{}{}
		pickedRegions[key] = struct{}{}
	}

	// Pass 2: guarantee at least one public-ingress peer when there is room.
	if len(picked) < limit && !selectedHasPublicIngress(picked) {
		for _, c := range candidates {
			if _, taken := pickedNodes[c.NodeID]; taken {
				continue
			}
			if candidatePublicIngressCount(c) <= 0 {
				continue
			}
			picked = append(picked, c)
			pickedNodes[c.NodeID] = struct{}{}
			break
		}
	}

	// Pass 3: fill the remaining slots in input order.
	for _, c := range candidates {
		if len(picked) >= limit {
			break
		}
		if _, taken := pickedNodes[c.NodeID]; taken {
			continue
		}
		picked = append(picked, c)
		pickedNodes[c.NodeID] = struct{}{}
	}
	return picked
}
// selectedHasPublicIngress reports whether at least one of the given
// candidates has a positive public ingress count.
func selectedHasPublicIngress(candidates []peerRecoveryCandidateBuild) bool {
	for i := range candidates {
		if candidatePublicIngressCount(candidates[i]) > 0 {
			return true
		}
	}
	return false
}
// candidatePublicIngressCount returns the PublicIngressCount recorded on the
// given recovery candidate build entry.
func candidatePublicIngressCount(candidate peerRecoveryCandidateBuild) int {
return candidate.PublicIngressCount
}
func peerRecoveryCandidateReason(mode string, entry PeerCacheEntry, connection PeerConnectionState, missingExternalRegions map[string]struct{}, preferredRegion string) (string, bool) {
if mode == PeerRecoveryModeSteady {
if connection.State == PeerConnectionReady || connection.State == PeerConnectionRelayReady {
return "maintain_ready", true
}
region := strings.ToLower(strings.TrimSpace(entry.BestRegion))
if region != "" && len(missingExternalRegions) > 0 {
if _, ok := missingExternalRegions[region]; ok {
if preferredRegion == "" || !strings.EqualFold(strings.TrimSpace(entry.BestRegion), preferredRegion) {
if connection.State == PeerConnectionDegraded {
return "recover_external_area", true
}
if entry.Warm || entry.RecoverySeed || connection.State == PeerConnectionDisconnected || connection.State == PeerConnectionConnecting {
return "recover_external_area", true
}
}
}
}
return "", false
}
if connection.State == PeerConnectionReady || connection.State == PeerConnectionRelayReady {
@@ -216,7 +361,7 @@ func peerRecoveryCandidateReason(mode string, entry PeerCacheEntry, connection P
return "recover_peer", true
}
func peerRecoveryCandidatePriority(entry PeerCacheEntry, connection PeerConnectionState, reason string) int {
func peerRecoveryCandidatePriority(entry PeerCacheEntry, connection PeerConnectionState, reason string, preferredRegion string) int {
score := 0
if entry.Warm {
score += 1000
@@ -237,6 +382,17 @@ func peerRecoveryCandidatePriority(entry PeerCacheEntry, connection PeerConnecti
if entry.BestCandidateID != "" {
score += 150
}
if entry.PublicIngressCount > 0 {
score += entry.PublicIngressCount * 90
}
preferredRegion = strings.TrimSpace(preferredRegion)
entryRegion := strings.TrimSpace(entry.BestRegion)
switch {
case preferredRegion != "" && entryRegion != "" && !strings.EqualFold(entryRegion, preferredRegion):
score += 275
case preferredRegion != "" && entryRegion != "" && strings.EqualFold(entryRegion, preferredRegion):
score += 25
}
score += entry.BestCandidateScore / 10
switch connection.State {
case PeerConnectionReady, PeerConnectionRelayReady:
@@ -251,6 +407,8 @@ func peerRecoveryCandidatePriority(entry PeerCacheEntry, connection PeerConnecti
switch reason {
case "maintain_ready":
score += 500
case "recover_external_area":
score += 450
case "recover_degraded":
score += 300
case "recover_seed":
@@ -82,7 +82,7 @@ func TestPeerRecoveryPlanTreatsRelayReadyPeersAsRecoveryGap(t *testing.T) {
RendezvousLeaseID: "lease-1",
RelayNodeID: "node-r",
RelayEndpoint: "quic://relay:19443",
RelayControl: true,
RelayQUIC: true,
},
},
},
@@ -121,6 +121,129 @@ func TestPeerRecoveryPlanCapsTargetByConnectablePeers(t *testing.T) {
}
}
func TestPeerRecoveryPlanPrefersExternalRegionsWhenTrimmingReadyPeers(t *testing.T) {
now := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-home-a", Endpoint: "quic://node-home-a:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-home-b", Endpoint: "quic://node-home-b:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-usa", Endpoint: "quic://node-usa:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "usa"},
{NodeID: "node-ifcm", Endpoint: "quic://node-ifcm:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "ifcm"},
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-home-a", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-home-b", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-usa", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-ifcm", State: PeerConnectionReady, LastLatencyMs: 20},
}},
PreferredRegion: "home",
Now: now,
})
if len(plan.Candidates) != DefaultStablePeerTarget {
t.Fatalf("candidate count = %d, want %d", len(plan.Candidates), DefaultStablePeerTarget)
}
if !recoveryPlanHasCandidate(plan, "node-usa", "maintain_ready") || !recoveryPlanHasCandidate(plan, "node-ifcm", "maintain_ready") {
t.Fatalf("expected external-region peers to be retained: %+v", plan.Candidates)
}
}
func TestPeerRecoveryPlanPrefersPublicIngressAtSameRegion(t *testing.T) {
now := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-home-private-a", Endpoint: "quic://10.0.0.2:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-home-private-b", Endpoint: "quic://10.0.0.3:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-home-public", Endpoint: "quic://94.141.118.222:19199", Warm: true, WarmReason: "route_adjacent", BestRegion: "home", PublicIngressCount: 1},
{NodeID: "node-usa", Endpoint: "quic://195.123.240.88:19131", Warm: true, WarmReason: "route_adjacent", BestRegion: "usa", PublicIngressCount: 1},
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-home-private-a", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-home-private-b", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-home-public", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-usa", State: PeerConnectionReady, LastLatencyMs: 20},
}},
PreferredRegion: "home",
Now: now,
})
if len(plan.Candidates) != DefaultStablePeerTarget {
t.Fatalf("candidate count = %d, want %d", len(plan.Candidates), DefaultStablePeerTarget)
}
if !recoveryPlanHasCandidate(plan, "node-home-public", "maintain_ready") {
t.Fatalf("expected public-ingress home peer to be retained: %+v", plan.Candidates)
}
}
func TestPeerRecoveryPlanRetainsDistinctExternalRegionsWhenAvailable(t *testing.T) {
now := time.Date(2026, 5, 19, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-home-a", Endpoint: "quic://node-home-a:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-home-b", Endpoint: "quic://node-home-b:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-home-c", Endpoint: "quic://node-home-c:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-usa-a", Endpoint: "quic://node-usa-a:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "usa", PublicIngressCount: 1},
{NodeID: "node-usa-b", Endpoint: "quic://node-usa-b:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "usa", PublicIngressCount: 1},
{NodeID: "node-ifcm", Endpoint: "quic://node-ifcm:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "ifcm", PublicIngressCount: 1},
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-home-a", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-home-b", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-home-c", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-usa-a", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-usa-b", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-ifcm", State: PeerConnectionReady, LastLatencyMs: 20},
}},
PreferredRegion: "home",
Now: now,
})
if len(plan.Candidates) != DefaultStablePeerTarget {
t.Fatalf("candidate count = %d, want %d", len(plan.Candidates), DefaultStablePeerTarget)
}
if !recoveryPlanHasCandidate(plan, "node-usa-a", "maintain_ready") && !recoveryPlanHasCandidate(plan, "node-usa-b", "maintain_ready") {
t.Fatalf("expected at least one usa candidate to be retained: %+v", plan.Candidates)
}
if !recoveryPlanHasCandidate(plan, "node-ifcm", "maintain_ready") {
t.Fatalf("expected ifcm candidate to be retained for area diversity: %+v", plan.Candidates)
}
}
func TestPeerRecoveryPlanSteadyModeAddsMissingExternalAreaCandidate(t *testing.T) {
now := time.Date(2026, 5, 19, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-test-a", Endpoint: "quic://node-test-a:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "test"},
{NodeID: "node-test-b", Endpoint: "quic://node-test-b:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "test"},
{NodeID: "node-usa", Endpoint: "quic://node-usa:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "usa", PublicIngressCount: 1},
{NodeID: "node-home", Endpoint: "quic://node-home:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home", PublicIngressCount: 1},
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-test-a", State: PeerConnectionReady, LastLatencyMs: 10},
{NodeID: "node-test-b", State: PeerConnectionReady, LastLatencyMs: 10},
{NodeID: "node-usa", State: PeerConnectionReady, LastLatencyMs: 10},
{NodeID: "node-home", State: PeerConnectionDegraded, LastLatencyMs: 20},
}},
PreferredRegion: "test",
Now: now,
})
if len(plan.Candidates) != DefaultStablePeerTarget {
t.Fatalf("candidate count = %d, want %d", len(plan.Candidates), DefaultStablePeerTarget)
}
if !recoveryPlanHasCandidate(plan, "node-home", "recover_external_area") {
t.Fatalf("expected missing external area candidate to be retained: %+v", plan.Candidates)
}
}
func recoveryPlanPeer(nodeID string, warm bool, recoverySeed bool, warmReason string) PeerCacheEntry {
return PeerCacheEntry{
NodeID: nodeID,
@@ -280,6 +280,9 @@ func (t *QUICProductionForwardTransport) sendProductionOnSession(ctx context.Con
return fabricproto.Frame{}, 0, ErrForwardPeerUnavailable
}
if err != nil {
if frame, ok := drainProductionResponseFrame(session, sequence); ok {
return frame, time.Since(started).Milliseconds(), nil
}
return fabricproto.Frame{}, 0, err
}
case frame, ok := <-session.Frames():
@@ -294,6 +297,25 @@ func (t *QUICProductionForwardTransport) sendProductionOnSession(ctx context.Con
}
}
// drainProductionResponseFrame does a non-blocking sweep of session.Frames()
// looking for the data frame on ProductionForwardQUICStreamID whose Sequence
// equals sequence.
//
// It returns the matching frame and true when one is found. It returns a zero
// frame and false when session is nil, when the frames channel is closed, or
// as soon as no frame is immediately available. Frames received during the
// sweep that do not match are dropped.
func drainProductionResponseFrame(session FabricTransportSession, sequence uint64) (fabricproto.Frame, bool) {
	var none fabricproto.Frame
	if session == nil {
		return none, false
	}
	for {
		select {
		case pending, open := <-session.Frames():
			if !open {
				return none, false
			}
			isResponse := pending.Type == fabricproto.FrameData &&
				pending.StreamID == ProductionForwardQUICStreamID &&
				pending.Sequence == sequence
			if !isResponse {
				// Not the awaited response: keep draining.
				continue
			}
			return pending, true
		default:
			// Nothing buffered right now; never block the caller.
			return none, false
		}
	}
}
func decodeQUICProductionForwardResponse(payload []byte) (ProductionForwardResult, error) {
var response quicProductionForwardResponse
if err := json.Unmarshal(payload, &response); err != nil {
@@ -283,12 +283,28 @@ func (r *FabricRegistry) ResolveService(req FabricRegistryResolveRequest) Fabric
return FabricRegistryResolvedService{Found: false, Reason: "service_required"}
}
scopeOrder := fabricRegistryScopeResolutionOrder(req.Scope, req.OrganizationID)
if resolved := r.resolveServiceFromRecords(req, service, scopeOrder, false); resolved.Found || resolved.Reason == "no_usable_endpoints" {
return resolved
}
if resolved := r.resolveServiceFromRecords(req, service, scopeOrder, true); resolved.Found || resolved.Reason == "no_usable_endpoints" {
return resolved
}
return FabricRegistryResolvedService{Found: false, Service: service, Reason: "no_active_record"}
}
func (r *FabricRegistry) resolveServiceFromRecords(req FabricRegistryResolveRequest, service string, scopeOrder []string, candidateOnly bool) FabricRegistryResolvedService {
for _, scope := range scopeOrder {
organizationID := strings.TrimSpace(req.OrganizationID)
if scope != FabricRegistryScopeOrganization {
organizationID = ""
}
record, ok := r.Active(req.ClusterID, service, scope, organizationID, req.Now)
var record FabricRegistryGossipRecord
var ok bool
if candidateOnly {
record, ok = r.Candidate(req.ClusterID, service, scope, organizationID, req.Now)
} else {
record, ok = r.Active(req.ClusterID, service, scope, organizationID, req.Now)
}
if !ok {
continue
}
@@ -306,9 +322,28 @@ func (r *FabricRegistry) ResolveService(req FabricRegistryResolveRequest) Fabric
RecordEpoch: record.Epoch,
RecordHash: hex.EncodeToString(sum[:]),
Endpoints: endpoints,
Reason: fabricRegistryResolveReason(candidateOnly),
}
}
return FabricRegistryResolvedService{Found: false, Service: service, Reason: "no_active_record"}
return FabricRegistryResolvedService{Found: false, Service: service}
}
// Candidate looks up the candidate entry stored under the key built from
// clusterID, service, scope and organizationID.
//
// It returns the entry's record and true only when the entry exists, its
// State is FabricRegistryCandidate, and the record's ExpiresAt is after
// registryNow(now). In every other case, including a nil receiver, it returns
// a zero record and false.
func (r *FabricRegistry) Candidate(clusterID, service, scope, organizationID string, now time.Time) (FabricRegistryGossipRecord, bool) {
	var none FabricRegistryGossipRecord
	if r == nil {
		return none, false
	}
	key := fabricRegistryKey(clusterID, service, scope, organizationID)
	entry, found := r.candidates[key]
	if !found {
		return none, false
	}
	if entry.State != FabricRegistryCandidate {
		return none, false
	}
	if !entry.Record.ExpiresAt.After(registryNow(now)) {
		// Expired records are treated as absent.
		return none, false
	}
	return entry.Record, true
}
// fabricRegistryResolveReason returns the reason string attached to a resolved
// service: "candidate_record_pending_live_verification" when candidateOnly is
// true, and the empty string otherwise.
func fabricRegistryResolveReason(candidateOnly bool) string {
	reason := ""
	if candidateOnly {
		reason = "candidate_record_pending_live_verification"
	}
	return reason
}
func (r *FabricRegistry) Snapshot(now time.Time) FabricRegistrySnapshot {
@@ -507,7 +542,7 @@ func validateFabricRegistryGossipRecord(record FabricRegistryGossipRecord, polic
if strings.TrimSpace(endpoint.EndpointID) == "" || strings.TrimSpace(endpoint.Address) == "" || strings.TrimSpace(endpoint.Transport) == "" {
return fmt.Errorf("fabric registry gossip record contains invalid endpoint")
}
if !isQUICOnlyCandidateTransport(endpoint.Transport) || hasLegacyEndpointScheme(endpoint.Address) {
if !isQUICOnlyCandidateTransport(endpoint.Transport) || hasUnsupportedEndpointScheme(endpoint.Address) {
return fmt.Errorf("fabric registry gossip endpoint must be QUIC-only")
}
if len(endpoint.Metadata) > 0 && !json.Valid(endpoint.Metadata) {
@@ -605,7 +640,7 @@ func selectFabricRegistryEndpoints(endpoints []FabricRegistryEndpoint, preferred
preferredRegion = strings.TrimSpace(preferredRegion)
out := make([]FabricRegistryEndpoint, 0, len(endpoints))
for _, endpoint := range endpoints {
if strings.TrimSpace(endpoint.Address) == "" || !isQUICOnlyCandidateTransport(endpoint.Transport) || hasLegacyEndpointScheme(endpoint.Address) {
if strings.TrimSpace(endpoint.Address) == "" || !isQUICOnlyCandidateTransport(endpoint.Transport) || hasUnsupportedEndpointScheme(endpoint.Address) {
continue
}
out = append(out, endpoint)
@@ -636,16 +671,10 @@ func probeFabricRegistryEndpoint(ctx context.Context, transport FabricTransport,
if timeout <= 0 {
timeout = 2 * time.Second
}
target := FabricTransportTarget{
EndpointID: endpoint.EndpointID,
PeerID: endpoint.EndpointID,
Endpoint: endpoint.Address,
Transport: endpoint.Transport,
PeerCertSHA256: endpoint.PeerCertSHA256,
Timeout: timeout,
InboundBuffer: 2,
ErrorBuffer: 2,
}
target := FabricTransportTargetFromRegistryEndpoint(endpoint)
target.Timeout = timeout
target.InboundBuffer = 2
target.ErrorBuffer = 2
startedAt := time.Now()
session, err := transport.Connect(ctx, target)
if err != nil {
@@ -45,7 +45,7 @@ func TestFabricRegistryGossipRecordRequiresTrustedSignature(t *testing.T) {
}
}
func TestFabricRegistryRejectsLegacyEndpointAndExpiredRecord(t *testing.T) {
func TestFabricRegistryRejectsDisallowedEndpointAndExpiredRecord(t *testing.T) {
now := time.Date(2026, 5, 18, 10, 0, 0, 0, time.UTC)
publicKey, privateKey, err := ed25519.GenerateKey(nil)
if err != nil {
@@ -65,7 +65,7 @@ func TestFabricRegistryRejectsLegacyEndpointAndExpiredRecord(t *testing.T) {
},
Now: now,
}); err == nil {
t.Fatal("legacy HTTP endpoint was accepted")
t.Fatal("compat HTTP endpoint was accepted")
}
expired := testFabricRegistryGossipRecord(now.Add(-2*time.Hour), 11)
expired.ExpiresAt = now.Add(-time.Minute)
@@ -523,7 +523,7 @@ func (s *RemoteWorkspaceFrameProbeSink) AcceptRemoteWorkspaceFrameBatchProbe(_ c
AckedFrames: acceptedFrames,
Backpressure: false,
DropPolicy: "drop_droppable_overflow_ack_accepted",
DeliverySequence: s.sequence,
DeliverySequence: uint64(s.sequence),
DeliveredAt: now.Format(time.RFC3339Nano),
}
s.last = receipt
@@ -695,6 +695,24 @@ func isValidRemoteWorkspaceAdapterSessionID(adapterSessionID string) bool {
return true
}
// isValidRemoteWorkspaceAdapterMailboxConsumerID reports whether consumerID,
// after trimming surrounding whitespace, is a well-formed mailbox consumer ID:
// non-empty, at most 128 bytes long, and made up solely of ASCII letters,
// ASCII digits, and the characters '-', '_', '.' and ':'.
func isValidRemoteWorkspaceAdapterMailboxConsumerID(consumerID string) bool {
	id := strings.TrimSpace(consumerID)
	if size := len(id); size == 0 || size > 128 {
		return false
	}
	disallowed := func(r rune) bool {
		isLetter := ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z')
		isDigit := '0' <= r && r <= '9'
		return !(isLetter || isDigit || strings.ContainsRune("-_.:", r))
	}
	// Valid only when no rune falls outside the permitted alphabet.
	return strings.IndexFunc(id, disallowed) < 0
}
func actionToAdapterSessionState(action string) string {
switch action {
case "expire":
@@ -106,7 +106,7 @@ func (cfg ScopedSyntheticConfig) Validate(local PeerIdentity) error {
if strings.TrimSpace(nodeID) == "" || strings.TrimSpace(endpoint) == "" {
return fmt.Errorf("scoped synthetic mesh config contains empty peer endpoint")
}
if hasLegacyEndpointScheme(endpoint) {
if hasUnsupportedEndpointScheme(endpoint) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC peer endpoint")
}
}
@@ -124,7 +124,7 @@ func (cfg ScopedSyntheticConfig) Validate(local PeerIdentity) error {
strings.TrimSpace(candidate.ConnectivityMode) == "" {
return fmt.Errorf("scoped synthetic mesh config contains invalid peer endpoint candidate")
}
if !isQUICOnlyCandidateTransport(candidate.Transport) || hasLegacyEndpointScheme(candidate.Address) {
if !isQUICOnlyCandidateTransport(candidate.Transport) || hasUnsupportedEndpointScheme(candidate.Address) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC peer endpoint candidate")
}
}
@@ -185,12 +185,12 @@ func validatePeerDirectory(entries []PeerDirectoryEntry, localNodeID string) err
return nil
}
func hasLegacyEndpointScheme(endpoint string) bool {
func hasUnsupportedEndpointScheme(endpoint string) bool {
endpoint = strings.ToLower(strings.TrimSpace(endpoint))
return strings.HasPrefix(endpoint, "http://") ||
strings.HasPrefix(endpoint, "https://") ||
strings.HasPrefix(endpoint, "ws://") ||
strings.HasPrefix(endpoint, "wss://")
if endpoint == "" || !strings.Contains(endpoint, "://") {
return false
}
return !strings.HasPrefix(endpoint, "quic://")
}
func validateRecoverySeeds(seeds []PeerRecoverySeed) error {
@@ -205,7 +205,7 @@ func validateRecoverySeeds(seeds []PeerRecoverySeed) error {
strings.TrimSpace(seed.Transport) == "" {
return fmt.Errorf("scoped synthetic mesh config contains invalid recovery seed")
}
if !isQUICOnlyCandidateTransport(seed.Transport) || hasLegacyEndpointScheme(seed.Endpoint) {
if !isQUICOnlyCandidateTransport(seed.Transport) || hasUnsupportedEndpointScheme(seed.Endpoint) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC recovery seed")
}
if _, duplicate := seen[key]; duplicate {
@@ -241,7 +241,7 @@ func validateRendezvousLeases(leases []PeerRendezvousLease, routes []SyntheticRo
(len(lease.Metadata) > 0 && !json.Valid(lease.Metadata)) {
return fmt.Errorf("scoped synthetic mesh config contains invalid rendezvous lease")
}
if !isQUICOnlyCandidateTransport(lease.Transport) || hasLegacyEndpointScheme(lease.RelayEndpoint) {
if !isQUICOnlyCandidateTransport(lease.Transport) || hasUnsupportedEndpointScheme(lease.RelayEndpoint) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC rendezvous lease")
}
if _, duplicate := seen[lease.LeaseID]; duplicate {
@@ -174,7 +174,7 @@ func TestLoadScopedSyntheticConfigRejectsInvalidPeerEndpointCandidate(t *testing
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpoint(t *testing.T) {
func TestLoadScopedSyntheticConfigRejectsDisallowedPeerEndpoint(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
@@ -189,7 +189,7 @@ func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpoint(t *testing.T) {
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpointCandidateTransport(t *testing.T) {
func TestLoadScopedSyntheticConfigRejectsDisallowedPeerEndpointCandidateTransport(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
@@ -215,7 +215,7 @@ func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpointCandidateTransport(t
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpointCandidateScheme(t *testing.T) {
func TestLoadScopedSyntheticConfigRejectsDisallowedPeerEndpointCandidateScheme(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
@@ -295,7 +295,7 @@ func TestLoadScopedSyntheticConfigRejectsInvalidRecoverySeed(t *testing.T) {
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyRecoverySeed(t *testing.T) {
func TestLoadScopedSyntheticConfigRejectsDisallowedRecoverySeed(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
@@ -337,7 +337,7 @@ func TestLoadScopedSyntheticConfigRejectsInvalidRendezvousLease(t *testing.T) {
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyRendezvousLease(t *testing.T) {
func TestLoadScopedSyntheticConfigRejectsDisallowedRendezvousLease(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17z12.synthetic.v1",
ClusterID: "cluster-1",
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -12,6 +12,21 @@ import (
type VPNPacketBatchPayload struct {
SchemaVersion string `json:"schema_version"`
VPNConnectionID string `json:"vpn_connection_id"`
TunnelID string `json:"tunnel_id,omitempty"`
PoolID string `json:"pool_id,omitempty"`
ServiceID string `json:"service_id,omitempty"`
LocalServiceID string `json:"local_service_id,omitempty"`
RemoteServiceID string `json:"remote_service_id,omitempty"`
ServiceKind string `json:"service_kind,omitempty"`
ServiceClass string `json:"service_class,omitempty"`
ServiceRole string `json:"service_role,omitempty"`
RouteLeaseID string `json:"route_lease_id,omitempty"`
RouteGeneration string `json:"route_generation,omitempty"`
DataPlane string `json:"data_plane,omitempty"`
TransportOwner string `json:"transport_owner,omitempty"`
RouteVisibility string `json:"route_visibility,omitempty"`
TrafficClasses []string `json:"traffic_classes,omitempty"`
StreamShards int `json:"stream_shards,omitempty"`
Direction string `json:"direction"`
Packets [][]byte `json:"packets"`
SentAt time.Time `json:"sent_at"`
@@ -70,7 +70,7 @@ func (s StubSupervisor) applyOne(workload client.DesiredWorkload) client.Workloa
StatusPayload: payload,
}
}
if serviceType == "core-mesh" || serviceType == "mesh-listener" {
if serviceType == "core-mesh" || serviceType == "fabric-listener" {
payload["reason"] = "builtin_node_agent_service_ready"
payload["execution_mode"] = "builtin"
payload["traffic"] = serviceTrafficMode(serviceType)
@@ -143,7 +143,7 @@ func (s StubSupervisor) applyOne(workload client.DesiredWorkload) client.Workloa
StatusPayload: payload,
}
}
if (serviceType == "vpn-exit" || serviceType == "ipv4-egress" || serviceType == "vpn-client") && runtimeMode == "native" {
if (serviceType == "vpn-exit" || serviceType == "ipv4-egress" || serviceType == "vpn-client" || serviceType == "ipv4-ingress") && runtimeMode == "native" {
for key, value := range vpnFabricOnlyContract(serviceType, workload.Config) {
payload[key] = value
}
@@ -151,7 +151,7 @@ func (s StubSupervisor) applyOne(workload client.DesiredWorkload) client.Workloa
payload["fabric_transport"] = "quic_only"
payload["fabric_service_channel_required"] = true
payload["backend_relay_fallback"] = false
payload["legacy_protocol_compatibility"] = false
payload["compat_protocol_compatibility"] = false
payload["traffic"] = "fabric_service_channel_only"
return client.WorkloadStatusRequest{
ReportedState: "running",
@@ -202,8 +202,8 @@ func (s StubSupervisor) applyOne(workload client.DesiredWorkload) client.Workloa
}
func vpnFabricOnlyContract(serviceType string, config map[string]any) map[string]any {
role := "vpn-client"
reason := "vpn_client_node_contract_ready"
role := "ipv4-ingress"
reason := "ipv4_ingress_node_contract_ready"
serviceClass := "vpn_packets"
internetEgress := false
if serviceType == "vpn-exit" || serviceType == "ipv4-egress" {
@@ -222,7 +222,12 @@ func vpnFabricOnlyContract(serviceType string, config map[string]any) map[string
"allowed_cidrs": stringSliceConfig(config, "allowed_cidrs"),
"dns_servers": stringSliceConfig(config, "dns_servers"),
"client_policy_source": stringConfig(config, "client_policy_source", "fabric_access_policy"),
"android_node_supported": serviceType == "vpn-client",
"legacy_role_alias": "vpn-client",
"node_core": "same_fabric_core_all_platforms",
"platform_adapter_scope": "local_packet_io_only",
"android_node_supported": serviceType == "vpn-client" || serviceType == "ipv4-ingress",
"linux_node_supported": serviceType == "vpn-client" || serviceType == "ipv4-ingress",
"windows_node_supported": serviceType == "vpn-client" || serviceType == "ipv4-ingress",
"ipv4_exit_supported": internetEgress,
"fabric_service_channel_required": true,
"packet_runtime_status": "fabric_channel_binding_pending_runtime",
@@ -237,7 +242,7 @@ func vpnServiceBindingContract(serviceType string, config map[string]any) map[st
"type": "ipv4_egress",
"accepts_service_class": "vpn_packets",
"accepts_from_fabric_only": true,
"legacy_protocol_listener": false,
"compat_protocol_listener": false,
"exit_pool_id": stringConfig(config, "pool_id", ""),
"region": stringConfig(config, "region", ""),
"allowed_cidrs": stringSliceConfig(config, "allowed_cidrs"),
@@ -248,7 +253,7 @@ func vpnServiceBindingContract(serviceType string, config map[string]any) map[st
}
return map[string]any{
"type": "local_ipv4_ingress",
"accepts_from": []string{"android_vpnservice_tun", "linux_tun", "host_service_port"},
"accepts_from": []string{"android_vpnservice_tun", "linux_tun", "windows_wintun", "host_service_port"},
"service_class": "vpn_packets",
"exit_selection": "pool",
"preferred_exit_pool_id": stringConfig(config, "exit_pool_id", ""),
@@ -256,8 +261,10 @@ func vpnServiceBindingContract(serviceType string, config map[string]any) map[st
"listen_udp_ports": intSliceConfig(config, "listen_udp_ports"),
"tun_required": true,
"route_authority": "fabric_farm",
"legacy_protocol_listener": false,
"compat_protocol_listener": false,
"requires_fabric_node_runtime": true,
"traffic_visibility": "opaque_ipv4_packets",
"flow_distribution": "opaque_packet_hash_shards",
}
}
@@ -266,12 +273,10 @@ func webIngressListenerConfig(serviceType string, config map[string]any) webingr
RuntimeConfig: webingress.RuntimeConfig{
ServiceType: serviceType,
Scope: stringConfig(config, "scope", ""),
ServiceClasses: stringSliceConfig(config, "service_classes"),
ServiceClasses: webIngressServiceClasses(serviceType, config),
TLSMode: stringConfig(config, "tls_mode", "terminate"),
HTTPPort: intConfig(config, "listen_http_port", 80),
HTTPSPort: intConfig(config, "listen_https_port", 443),
},
HTTPAddr: stringConfig(config, "listen_http_addr", ":80"),
HTTPSAddr: stringConfig(config, "listen_https_addr", ":443"),
TLSCertFile: stringConfig(config, "tls_cert_file", ""),
TLSKeyFile: stringConfig(config, "tls_key_file", ""),
@@ -279,17 +284,13 @@ func webIngressListenerConfig(serviceType string, config map[string]any) webingr
}
func (s StubSupervisor) webIngressContract(serviceType string, config map[string]any) map[string]any {
httpPort := intConfig(config, "listen_http_port", 80)
httpsPort := intConfig(config, "listen_https_port", 443)
tlsMode := strings.TrimSpace(stringConfig(config, "tls_mode", "terminate"))
serviceClasses := stringSliceConfig(config, "service_classes")
serviceClasses := webIngressServiceClasses(serviceType, config)
scope := strings.TrimSpace(stringConfig(config, "scope", ""))
realListenerRequested := boolConfig(config, "real_listener_enabled")
allowedClasses := webIngressAllowedServiceClasses(serviceType)
missing := []string{}
if httpPort != 80 {
missing = append(missing, "listen_http_port_must_be_80")
}
if httpsPort != 443 {
missing = append(missing, "listen_https_port_must_be_443")
}
@@ -315,14 +316,13 @@ func (s StubSupervisor) webIngressContract(serviceType string, config map[string
"authority_service": false,
"fabric_transport": "quic_only",
"http_between_fabric_nodes": false,
"listen_http_port": httpPort,
"listen_https_port": httpsPort,
"tls_mode": tlsMode,
"scope": scope,
"service_classes": serviceClasses,
"allowed_service_classes": allowedClasses,
"fabric_service_channel_required": true,
"runtime_roles_required": webIngressRuntimeRoles(serviceClasses),
"runtime_fabric_functions": webIngressFabricFunctions(serviceType, serviceClasses),
"payload_forwarding": "contract_only",
"real_listener_requested": realListenerRequested,
"real_listener_runtime_enabled": s.WebIngressRuntimeEnabled,
@@ -346,26 +346,41 @@ func (s StubSupervisor) webIngressContract(serviceType string, config map[string
func webIngressAllowedServiceClasses(serviceType string) []string {
if serviceType == "admin-ingress" {
return []string{"platform_admin", "cluster_admin"}
return []string{"admin-ingress"}
}
return []string{"organization_portal", "user_portal"}
return []string{"public-ingress"}
}
func webIngressRuntimeRoles(serviceClasses []string) []string {
roles := []string{}
for _, serviceClass := range serviceClasses {
func webIngressServiceClasses(serviceType string, config map[string]any) []string {
raw := stringSliceConfig(config, "service_classes")
if len(raw) == 0 {
return webIngressAllowedServiceClasses(serviceType)
}
out := []string{}
for _, serviceClass := range raw {
serviceClass = strings.TrimSpace(serviceClass)
switch serviceClass {
case "platform_admin":
roles = append(roles, "global-admin-runtime", "identity-runtime", "policy-authority", "audit-sink")
case "cluster_admin":
roles = append(roles, "cluster-admin-runtime", "identity-runtime", "policy-authority", "audit-sink")
case "organization_portal":
roles = append(roles, "organization-portal-runtime", "identity-runtime", "policy-authority", "audit-sink")
case "user_portal":
roles = append(roles, "user-portal-runtime", "identity-runtime", "policy-authority", "audit-sink")
case "admin-ingress", "public-ingress":
out = append(out, serviceClass)
}
}
return dedupeStrings(roles)
if len(out) == 0 {
return webIngressAllowedServiceClasses(serviceType)
}
return dedupeStrings(out)
}
// webIngressFabricFunctions builds the list of fabric functions reported for a
// web-ingress workload. The list starts with serviceType, then adds every
// entry of serviceClasses that is exactly "admin-ingress" or "public-ingress".
// The result is passed through dedupeStrings before being returned.
func webIngressFabricFunctions(serviceType string, serviceClasses []string) []string {
	collected := []string{serviceType}
	for _, class := range serviceClasses {
		// Only the two recognised ingress classes contribute a function.
		if class == "admin-ingress" || class == "public-ingress" {
			collected = append(collected, class)
		}
	}
	return dedupeStrings(collected)
}
func boolConfig(values map[string]any, key string) bool {
@@ -732,7 +747,7 @@ func serviceTrafficMode(serviceType string) string {
switch serviceType {
case "core-mesh":
return "fabric_control"
case "mesh-listener":
case "fabric-listener":
return "entry_listener"
default:
return "unknown"
@@ -2,7 +2,16 @@ package supervisor
import (
"context"
"crypto/rand"
"crypto/rsa"
"crypto/x509"
"crypto/x509/pkix"
"encoding/pem"
"math/big"
"os"
"path/filepath"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/webingress"
@@ -56,7 +65,7 @@ func TestStubSupervisorRunsInternalSyntheticEchoWorkload(t *testing.T) {
func TestStubSupervisorReportsBuiltinFabricServicesRunning(t *testing.T) {
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
{ServiceType: "core-mesh", DesiredState: "enabled", RuntimeMode: "container"},
{ServiceType: "mesh-listener", DesiredState: "enabled", RuntimeMode: "container"},
{ServiceType: "fabric-listener", DesiredState: "enabled", RuntimeMode: "container"},
})
if err != nil {
t.Fatalf("apply desired workload: %v", err)
@@ -88,7 +97,7 @@ func TestStubSupervisorReportsVPNFabricOnlyContractsRunning(t *testing.T) {
},
},
{
ServiceType: "vpn-client",
ServiceType: "ipv4-ingress",
DesiredState: "enabled",
RuntimeMode: "native",
Config: map[string]any{
@@ -117,14 +126,18 @@ func TestStubSupervisorReportsVPNFabricOnlyContractsRunning(t *testing.T) {
if status.StatusPayload["backend_relay_fallback"] != false {
t.Fatalf("backend_relay_fallback = %v", status.StatusPayload["backend_relay_fallback"])
}
if status.StatusPayload["legacy_protocol_compatibility"] != false {
t.Fatalf("legacy_protocol_compatibility = %v", status.StatusPayload["legacy_protocol_compatibility"])
if status.StatusPayload["compat_protocol_compatibility"] != false {
t.Fatalf("compat_protocol_compatibility = %v", status.StatusPayload["compat_protocol_compatibility"])
}
}
if statuses[0].StatusPayload["role"] != "ipv4-egress" || statuses[0].StatusPayload["internet_egress"] != true {
t.Fatalf("ipv4 egress payload = %#v", statuses[0].StatusPayload)
}
if statuses[1].StatusPayload["role"] != "vpn-client" || statuses[1].StatusPayload["android_node_supported"] != true {
if statuses[1].StatusPayload["role"] != "ipv4-ingress" ||
statuses[1].StatusPayload["legacy_role_alias"] != "vpn-client" ||
statuses[1].StatusPayload["android_node_supported"] != true ||
statuses[1].StatusPayload["linux_node_supported"] != true ||
statuses[1].StatusPayload["windows_node_supported"] != true {
t.Fatalf("vpn client payload = %#v", statuses[1].StatusPayload)
}
exitBinding := statuses[0].StatusPayload["service_binding"].(map[string]any)
@@ -132,9 +145,12 @@ func TestStubSupervisorReportsVPNFabricOnlyContractsRunning(t *testing.T) {
t.Fatalf("ipv4 egress binding = %#v", exitBinding)
}
clientBinding := statuses[1].StatusPayload["service_binding"].(map[string]any)
if clientBinding["type"] != "local_ipv4_ingress" || clientBinding["preferred_exit_pool_id"] != "us-los-angeles-ipv4" || clientBinding["legacy_protocol_listener"] != false {
if clientBinding["type"] != "local_ipv4_ingress" || clientBinding["preferred_exit_pool_id"] != "us-los-angeles-ipv4" || clientBinding["compat_protocol_listener"] != false {
t.Fatalf("vpn client binding = %#v", clientBinding)
}
if clientBinding["traffic_visibility"] != "opaque_ipv4_packets" || clientBinding["flow_distribution"] != "opaque_packet_hash_shards" {
t.Fatalf("ipv4 ingress binding should be opaque: %#v", clientBinding)
}
if got := clientBinding["listen_tcp_ports"].([]int); len(got) != 2 || got[0] != 443 || got[1] != 8443 {
t.Fatalf("listen_tcp_ports = %#v", got)
}
@@ -150,11 +166,10 @@ func TestStubSupervisorReportsWebIngressContractReady(t *testing.T) {
DesiredState: "enabled",
RuntimeMode: "native",
Config: map[string]any{
"listen_http_port": 80,
"listen_https_port": 443,
"tls_mode": "terminate",
"scope": "platform",
"service_classes": []any{"platform_admin", "cluster_admin"},
"service_classes": []any{"admin-ingress", "admin-ingress"},
},
},
})
@@ -175,9 +190,9 @@ func TestStubSupervisorReportsWebIngressContractReady(t *testing.T) {
payload["ports_opened_by_stub"] != false {
t.Fatalf("unexpected payload: %#v", payload)
}
roles, ok := payload["runtime_roles_required"].([]string)
if !ok || !containsString(roles, "global-admin-runtime") || !containsString(roles, "policy-authority") {
t.Fatalf("runtime roles = %#v", payload["runtime_roles_required"])
functions, ok := payload["runtime_fabric_functions"].([]string)
if !ok || !containsString(functions, "admin-ingress") {
t.Fatalf("runtime fabric functions = %#v", payload["runtime_fabric_functions"])
}
}
@@ -188,11 +203,10 @@ func TestStubSupervisorBlocksWebIngressRealListenerWithoutRuntimeGate(t *testing
DesiredState: "enabled",
RuntimeMode: "native",
Config: map[string]any{
"listen_http_port": 80,
"listen_https_port": 443,
"tls_mode": "terminate",
"scope": "platform",
"service_classes": []any{"platform_admin"},
"service_classes": []any{"admin-ingress"},
"real_listener_enabled": true,
},
},
@@ -220,11 +234,10 @@ func TestStubSupervisorAllowsWebIngressRealListenerGateButDoesNotOpenPorts(t *te
DesiredState: "enabled",
RuntimeMode: "native",
Config: map[string]any{
"listen_http_port": 80,
"listen_https_port": 443,
"tls_mode": "terminate",
"scope": "platform",
"service_classes": []any{"platform_admin"},
"service_classes": []any{"admin-ingress"},
"real_listener_enabled": true,
},
},
@@ -245,6 +258,8 @@ func TestStubSupervisorAllowsWebIngressRealListenerGateButDoesNotOpenPorts(t *te
}
func TestStubSupervisorStartsWebIngressManagerWhenRealListenerAllowed(t *testing.T) {
dir := t.TempDir()
certFile, keyFile := writeSelfSignedCert(t, dir)
manager := webingress.NewManager()
statuses, err := (StubSupervisor{Version: "test", WebIngressRuntimeEnabled: true, WebIngressManager: manager}).Apply(context.Background(), []client.DesiredWorkload{
{
@@ -252,13 +267,13 @@ func TestStubSupervisorStartsWebIngressManagerWhenRealListenerAllowed(t *testing
DesiredState: "enabled",
RuntimeMode: "native",
Config: map[string]any{
"listen_http_port": 80,
"listen_https_port": 443,
"listen_http_addr": "127.0.0.1:0",
"listen_https_addr": "127.0.0.1:0",
"tls_mode": "terminate",
"tls_cert_file": certFile,
"tls_key_file": keyFile,
"scope": "platform",
"service_classes": []any{"platform_admin"},
"service_classes": []any{"admin-ingress"},
"real_listener_enabled": true,
},
},
@@ -266,7 +281,7 @@ func TestStubSupervisorStartsWebIngressManagerWhenRealListenerAllowed(t *testing
if err != nil {
t.Fatalf("apply desired workload: %v", err)
}
if statuses[0].ReportedState != "degraded" {
if statuses[0].ReportedState != "running" {
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
}
payload := statuses[0].StatusPayload
@@ -274,15 +289,44 @@ func TestStubSupervisorStartsWebIngressManagerWhenRealListenerAllowed(t *testing
if !ok {
t.Fatalf("listener_status = %#v", payload["listener_status"])
}
if !listenerStatus.HTTPRunning || listenerStatus.HTTPSRunning || listenerStatus.HTTPAddr == "" {
if !listenerStatus.HTTPSRunning || listenerStatus.HTTPSAddr == "" {
t.Fatalf("listener status = %+v", listenerStatus)
}
if payload["reason"] != "web_ingress_listener_partial" || payload["ports_opened_by_runtime"] != true || payload["ports_opened_by_stub"] != false {
if payload["reason"] != "web_ingress_contract_ready" || payload["ports_opened_by_runtime"] != true || payload["ports_opened_by_stub"] != false {
t.Fatalf("payload = %#v", payload)
}
_ = manager.Stop(context.Background())
}
// writeSelfSignedCert generates a throwaway self-signed RSA certificate for
// "localhost" and stores it PEM-encoded as cert.pem and key.pem inside dir.
// It returns the certificate path followed by the key path and aborts the
// test via t.Fatalf as soon as any step fails.
func writeSelfSignedCert(t *testing.T, dir string) (string, string) {
	t.Helper()

	privateKey, err := rsa.GenerateKey(rand.Reader, 2048)
	if err != nil {
		t.Fatalf("generate key: %v", err)
	}

	// The certificate signs itself, so the same template acts as both the
	// subject and the issuer. It is valid from one hour ago to one hour ahead.
	cert := x509.Certificate{
		SerialNumber: big.NewInt(1),
		Subject:      pkix.Name{CommonName: "localhost"},
		NotBefore:    time.Now().Add(-time.Hour),
		NotAfter:     time.Now().Add(time.Hour),
		KeyUsage:     x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature,
		DNSNames:     []string{"localhost"},
	}
	certDER, err := x509.CreateCertificate(rand.Reader, &cert, &cert, &privateKey.PublicKey, privateKey)
	if err != nil {
		t.Fatalf("create cert: %v", err)
	}

	certFile := filepath.Join(dir, "cert.pem")
	keyFile := filepath.Join(dir, "key.pem")

	certPEM := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: certDER})
	if err := os.WriteFile(certFile, certPEM, 0o600); err != nil {
		t.Fatalf("write cert: %v", err)
	}
	keyPEM := pem.EncodeToMemory(&pem.Block{Type: "RSA PRIVATE KEY", Bytes: x509.MarshalPKCS1PrivateKey(privateKey)})
	if err := os.WriteFile(keyFile, keyPEM, 0o600); err != nil {
		t.Fatalf("write key: %v", err)
	}
	return certFile, keyFile
}
func TestStubSupervisorBlocksInvalidWebIngressContract(t *testing.T) {
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
{
@@ -290,10 +334,9 @@ func TestStubSupervisorBlocksInvalidWebIngressContract(t *testing.T) {
DesiredState: "enabled",
RuntimeMode: "native",
Config: map[string]any{
"listen_http_port": 8080,
"listen_https_port": 443,
"listen_https_port": 444,
"scope": "organization",
"service_classes": []any{"platform_admin"},
"service_classes": []any{"admin-ingress"},
},
},
})
@@ -308,7 +351,7 @@ func TestStubSupervisorBlocksInvalidWebIngressContract(t *testing.T) {
t.Fatalf("unexpected payload: %#v", payload)
}
missing, ok := payload["missing_checks"].([]string)
if !ok || !containsString(missing, "listen_http_port_must_be_80") || !containsString(missing, "service_class_not_allowed:platform_admin") {
if !ok || !containsString(missing, "listen_https_port_must_be_443") || !containsString(missing, "service_class_not_allowed:admin-ingress") {
t.Fatalf("missing checks = %#v", payload["missing_checks"])
}
}
@@ -3,6 +3,7 @@ package vpnruntime
import (
"context"
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"time"
@@ -12,10 +13,11 @@ import (
)
const (
fabricVPNPacketPayloadMagic uint32 = 0x52565042 // RVPB
fabricVPNPacketPayloadVersion uint8 = 1
fabricVPNPacketPayloadHeader = 24
fabricVPNPacketMaxPacketCount = 2048
fabricVPNPacketPayloadMagic uint32 = 0x52565042 // RVPB
fabricVPNPacketPayloadVersion uint8 = 2
fabricVPNPacketPayloadHeader = 24
fabricVPNPacketMaxPacketCount = 2048
fabricVPNPacketMaxMetadataBytes = 64 * 1024
fabricVPNPacketDirectionClientToGateway uint8 = 1
fabricVPNPacketDirectionGatewayToClient uint8 = 2
@@ -32,6 +34,7 @@ type FabricVPNPacketFrameInput struct {
VPNConnectionID string
Direction string
TrafficClass string
ServiceTunnel FabricServiceTunnel
Packets [][]byte
Now time.Time
}
@@ -60,6 +63,26 @@ func NewFabricVPNPacketDataFrame(input FabricVPNPacketFrameInput) (fabricproto.F
}, nil
}
// NewFabricVPNSessionHelloFrame builds a DATA frame that carries no IP
// packets: the payload is encoded with a nil packet list, so it holds only
// the VPN identity plus the service-tunnel metadata that the payload encoder
// attaches to packet-less payloads.
//
// It returns an error wrapping ErrFabricVPNPacketFrameInvalid when the stream
// id is zero or when the VPN connection id or direction is empty; payload
// encoding errors are returned unchanged.
func NewFabricVPNSessionHelloFrame(input FabricVPNPacketFrameInput) (fabricproto.Frame, error) {
	switch {
	case input.StreamID == 0:
		return fabricproto.Frame{}, fmt.Errorf("%w: missing stream id", ErrFabricVPNPacketFrameInvalid)
	case input.VPNConnectionID == "", input.Direction == "":
		return fabricproto.Frame{}, fmt.Errorf("%w: missing vpn identity", ErrFabricVPNPacketFrameInvalid)
	}

	helloPayload, err := encodeFabricVPNPacketPayload(input, nil)
	if err != nil {
		return fabricproto.Frame{}, err
	}

	hello := fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: fabricFrameTrafficClass(input.TrafficClass, nil),
		StreamID:     input.StreamID,
		Sequence:     input.Sequence,
		Payload:      helloPayload,
	}
	return hello, nil
}
func DecodeFabricVPNPacketDataFrame(frame fabricproto.Frame) (mesh.VPNPacketBatchPayload, error) {
if frame.Type != fabricproto.FrameData || frame.StreamID == 0 {
return mesh.VPNPacketBatchPayload{}, fmt.Errorf("%w: expected DATA stream frame", ErrFabricVPNPacketFrameInvalid)
@@ -94,11 +117,19 @@ func encodeFabricVPNPacketPayload(input FabricVPNPacketFrameInput, packets [][]b
if len(vpnID) > 0xffff {
return nil, fmt.Errorf("%w: vpn connection id too long", ErrFabricVPNPacketPayload)
}
var metadata []byte
if len(packets) == 0 {
var err error
metadata, err = encodeFabricVPNPacketServiceMetadata(input)
if err != nil {
return nil, err
}
}
now := input.Now.UTC()
if now.IsZero() {
now = time.Now().UTC()
}
total := fabricVPNPacketPayloadHeader + len(vpnID)
total := fabricVPNPacketPayloadHeader + len(vpnID) + len(metadata)
for _, packet := range packets {
total += 4 + len(packet)
}
@@ -108,10 +139,13 @@ func encodeFabricVPNPacketPayload(input FabricVPNPacketFrameInput, packets [][]b
out[5] = directionCode
binary.BigEndian.PutUint16(out[6:8], uint16(len(packets)))
binary.BigEndian.PutUint16(out[8:10], uint16(len(vpnID)))
binary.BigEndian.PutUint16(out[10:12], uint16(len(metadata)))
binary.BigEndian.PutUint64(out[12:20], uint64(now.UnixNano()))
offset := fabricVPNPacketPayloadHeader
copy(out[offset:], vpnID)
offset += len(vpnID)
copy(out[offset:], metadata)
offset += len(metadata)
for _, packet := range packets {
binary.BigEndian.PutUint32(out[offset:offset+4], uint32(len(packet)))
offset += 4
@@ -128,7 +162,8 @@ func decodeFabricVPNPacketPayload(payload []byte) (mesh.VPNPacketBatchPayload, e
if binary.BigEndian.Uint32(payload[0:4]) != fabricVPNPacketPayloadMagic {
return mesh.VPNPacketBatchPayload{}, fmt.Errorf("%w: bad magic", ErrFabricVPNPacketPayload)
}
if payload[4] != fabricVPNPacketPayloadVersion {
version := payload[4]
if version != 1 && version != fabricVPNPacketPayloadVersion {
return mesh.VPNPacketBatchPayload{}, fmt.Errorf("%w: unsupported version %d", ErrFabricVPNPacketPayload, payload[4])
}
direction, err := fabricVPNPacketDirectionName(payload[5])
@@ -137,7 +172,11 @@ func decodeFabricVPNPacketPayload(payload []byte) (mesh.VPNPacketBatchPayload, e
}
packetCount := int(binary.BigEndian.Uint16(payload[6:8]))
vpnIDLength := int(binary.BigEndian.Uint16(payload[8:10]))
if packetCount <= 0 || packetCount > fabricVPNPacketMaxPacketCount {
metadataLength := 0
if version >= 2 {
metadataLength = int(binary.BigEndian.Uint16(payload[10:12]))
}
if packetCount < 0 || packetCount > fabricVPNPacketMaxPacketCount {
return mesh.VPNPacketBatchPayload{}, fmt.Errorf("%w: invalid packet count %d", ErrFabricVPNPacketPayload, packetCount)
}
offset := fabricVPNPacketPayloadHeader
@@ -149,6 +188,16 @@ func decodeFabricVPNPacketPayload(payload []byte) (mesh.VPNPacketBatchPayload, e
if vpnID == "" {
return mesh.VPNPacketBatchPayload{}, fmt.Errorf("%w: empty vpn id", ErrFabricVPNPacketPayload)
}
metadata := fabricVPNPacketServiceMetadata{}
if metadataLength > 0 {
if metadataLength > fabricVPNPacketMaxMetadataBytes || len(payload) < offset+metadataLength {
return mesh.VPNPacketBatchPayload{}, fmt.Errorf("%w: truncated service metadata", ErrFabricVPNPacketPayload)
}
if err := json.Unmarshal(payload[offset:offset+metadataLength], &metadata); err != nil {
return mesh.VPNPacketBatchPayload{}, fmt.Errorf("%w: invalid service metadata: %v", ErrFabricVPNPacketPayload, err)
}
offset += metadataLength
}
packets := make([][]byte, 0, packetCount)
for index := 0; index < packetCount; index++ {
if len(payload) < offset+4 {
@@ -169,12 +218,74 @@ func decodeFabricVPNPacketPayload(payload []byte) (mesh.VPNPacketBatchPayload, e
return mesh.VPNPacketBatchPayload{
SchemaVersion: "rap.vpn_packet_batch.fabric.v1",
VPNConnectionID: vpnID,
TunnelID: firstNonEmptyTunnelString(metadata.TunnelID, vpnID),
PoolID: metadata.PoolID,
ServiceID: metadata.ServiceID,
LocalServiceID: metadata.LocalServiceID,
RemoteServiceID: metadata.RemoteServiceID,
ServiceKind: metadata.ServiceKind,
ServiceClass: metadata.ServiceClass,
ServiceRole: metadata.ServiceRole,
RouteLeaseID: metadata.RouteLeaseID,
RouteGeneration: metadata.RouteGeneration,
DataPlane: metadata.DataPlane,
TransportOwner: metadata.TransportOwner,
RouteVisibility: metadata.RouteVisibility,
TrafficClasses: metadata.TrafficClasses,
StreamShards: metadata.StreamShards,
Direction: direction,
Packets: packets,
SentAt: sentAt,
}, nil
}
// fabricVPNPacketServiceMetadata is the JSON document embedded in a fabric
// VPN packet payload directly after the VPN connection id. It is produced by
// encodeFabricVPNPacketServiceMetadata from a normalized service tunnel and
// parsed back by decodeFabricVPNPacketPayload, which copies every field into
// the resulting mesh.VPNPacketBatchPayload.
//
// Every field is optional on the wire (omitempty), so an empty value simply
// does not appear in the encoded document.
type fabricVPNPacketServiceMetadata struct {
TunnelID string `json:"tunnel_id,omitempty"`
PoolID string `json:"pool_id,omitempty"`
ServiceID string `json:"service_id,omitempty"`
LocalServiceID string `json:"local_service_id,omitempty"`
RemoteServiceID string `json:"remote_service_id,omitempty"`
ServiceKind string `json:"service_kind,omitempty"`
ServiceClass string `json:"service_class,omitempty"`
ServiceRole string `json:"service_role,omitempty"`
RouteLeaseID string `json:"route_lease_id,omitempty"`
RouteGeneration string `json:"route_generation,omitempty"`
DataPlane string `json:"data_plane,omitempty"`
TransportOwner string `json:"transport_owner,omitempty"`
RouteVisibility string `json:"route_visibility,omitempty"`
TrafficClasses []string `json:"traffic_classes,omitempty"`
StreamShards int `json:"stream_shards,omitempty"`
}
// encodeFabricVPNPacketServiceMetadata serializes the service-tunnel details
// of input into the JSON metadata section of a fabric VPN packet payload.
//
// The tunnel is first normalized against the VPN connection id, which also
// serves as the tunnel id when the tunnel does not name one. The encoded
// document must fit both the metadata byte budget and the 16-bit length
// field of the payload header; otherwise an error wrapping
// ErrFabricVPNPacketPayload is returned. JSON marshalling errors are returned
// unchanged.
func encodeFabricVPNPacketServiceMetadata(input FabricVPNPacketFrameInput) ([]byte, error) {
	normalized := NormalizeServiceTunnel(input.ServiceTunnel, input.VPNConnectionID)

	encoded, err := json.Marshal(fabricVPNPacketServiceMetadata{
		TunnelID:        firstNonEmptyTunnelString(normalized.TunnelID, input.VPNConnectionID),
		PoolID:          normalized.PoolID,
		ServiceID:       normalized.ServiceID,
		LocalServiceID:  normalized.LocalServiceID,
		RemoteServiceID: normalized.RemoteServiceID,
		ServiceKind:     normalized.ServiceKind,
		ServiceClass:    normalized.ServiceClass,
		ServiceRole:     normalized.ServiceRole,
		RouteLeaseID:    normalized.RouteLeaseID,
		RouteGeneration: normalized.RouteGeneration,
		DataPlane:       normalized.DataPlane,
		TransportOwner:  normalized.TransportOwner,
		RouteVisibility: normalized.RouteVisibility,
		// Copy the slice so the metadata never aliases the tunnel's backing array.
		TrafficClasses: append([]string(nil), normalized.TrafficClasses...),
		StreamShards:   normalized.StreamShards,
	})
	if err != nil {
		return nil, err
	}

	if size := len(encoded); size > fabricVPNPacketMaxMetadataBytes || size > 0xffff {
		return nil, fmt.Errorf("%w: service metadata too large", ErrFabricVPNPacketPayload)
	}
	return encoded, nil
}
func fabricVPNPacketDirectionCode(direction string) (uint8, error) {
switch direction {
case FabricDirectionClientToGateway:
@@ -201,6 +312,8 @@ func fabricFrameTrafficClass(trafficClass string, packets [][]byte) fabricproto.
switch normalizeFabricTrafficClass(trafficClass) {
case FabricTrafficClassControl:
return fabricproto.TrafficClassControl
case FabricTrafficClassDNS:
return fabricproto.TrafficClassReliable
case FabricTrafficClassInteractive:
return fabricproto.TrafficClassInteractive
case FabricTrafficClassReliable:
@@ -208,9 +321,6 @@ func fabricFrameTrafficClass(trafficClass string, packets [][]byte) fabricproto.
case FabricTrafficClassDroppable:
return fabricproto.TrafficClassDroppable
default:
if batchHasTCPControlPacket(packets) {
return fabricproto.TrafficClassInteractive
}
return fabricproto.TrafficClassBulk
}
}
@@ -14,11 +14,16 @@ type FabricSessionFrameWriter interface {
}
// FabricSessionPacketPeerRegistry tracks the fabric session peers that have
// registered a VPN packet stream, keyed by VPN connection id, and lets
// callers wait for a peer to appear.
type FabricSessionPacketPeerRegistry struct {
	// mu guards peers and changed.
	mu sync.RWMutex
	// peers maps a VPN connection id to its most recently registered peer.
	peers map[string]FabricSessionPacketPeer
	// changed is closed and replaced (see signalLocked) every time peers is
	// modified, so goroutines blocked in WaitTransportFor wake up and re-check.
	changed chan struct{}
}
type FabricSessionPacketPeer struct {
TunnelID string
PoolID string
ServiceID string
ServiceTunnel FabricServiceTunnel
VPNConnectionID string
Sender FabricSessionFrameWriter
StreamID uint64
@@ -30,11 +35,17 @@ type FabricSessionPacketPeer struct {
// FabricSessionPacketPeerTransport is a packet transport that resolves its
// peer dynamically: each send looks the peer up in Registry by
// VPNConnectionID instead of binding to one session up front.
//
// Registry, Inbox and VPNConnectionID are required; SendGatewayPacketBatch
// returns mesh.ErrForwardRuntimeUnavailable when any of them is unset.
// TunnelID, PoolID and ServiceID are reported by Snapshot.
// PeerWaitTimeout bounds how long a send waits for the peer to register;
// values <= 0 select defaultFabricSessionPeerWaitTimeout.
type FabricSessionPacketPeerTransport struct {
Registry *FabricSessionPacketPeerRegistry
Inbox *FabricPacketInbox
TunnelID string
PoolID string
ServiceID string
VPNConnectionID string
PeerWaitTimeout time.Duration
}
// defaultFabricSessionPeerWaitTimeout is how long
// FabricSessionPacketPeerTransport.SendGatewayPacketBatch waits for a peer to
// register when PeerWaitTimeout is zero or negative.
const defaultFabricSessionPeerWaitTimeout = 500 * time.Millisecond
// NewFabricSessionPacketPeerRegistry returns an empty registry whose peer map
// and change-notification channel are both initialised, so waiters can block
// on the channel before the first peer registers.
func NewFabricSessionPacketPeerRegistry() *FabricSessionPacketPeerRegistry {
	return &FabricSessionPacketPeerRegistry{
		peers:   map[string]FabricSessionPacketPeer{},
		changed: make(chan struct{}),
	}
}
func (r *FabricSessionPacketPeerRegistry) RegisterFrame(ctx context.Context, sender FabricSessionFrameWriter, frame fabricproto.Frame) (bool, error) {
@@ -53,10 +64,33 @@ func (r *FabricSessionPacketPeerRegistry) RegisterFrame(ctx context.Context, sen
if r.peers == nil {
r.peers = map[string]FabricSessionPacketPeer{}
}
if r.changed == nil {
r.changed = make(chan struct{})
}
peer := r.peers[payload.VPNConnectionID]
if peer.RegisteredAt.IsZero() {
peer.RegisteredAt = now
}
peer.ServiceTunnel = NormalizeServiceTunnel(FabricServiceTunnel{
TunnelID: firstNonEmptyTunnelString(payload.TunnelID, payload.VPNConnectionID),
PoolID: payload.PoolID,
ServiceID: payload.ServiceID,
LocalServiceID: payload.LocalServiceID,
RemoteServiceID: payload.RemoteServiceID,
ServiceKind: payload.ServiceKind,
ServiceClass: payload.ServiceClass,
ServiceRole: payload.ServiceRole,
RouteLeaseID: payload.RouteLeaseID,
RouteGeneration: payload.RouteGeneration,
DataPlane: payload.DataPlane,
TransportOwner: payload.TransportOwner,
RouteVisibility: payload.RouteVisibility,
TrafficClasses: payload.TrafficClasses,
StreamShards: payload.StreamShards,
}, payload.VPNConnectionID)
peer.TunnelID = peer.ServiceTunnel.TunnelID
peer.PoolID = peer.ServiceTunnel.PoolID
peer.ServiceID = peer.ServiceTunnel.ServiceID
peer.VPNConnectionID = payload.VPNConnectionID
peer.Sender = sender
peer.StreamID = frame.StreamID
@@ -69,6 +103,7 @@ func (r *FabricSessionPacketPeerRegistry) RegisterFrame(ctx context.Context, sen
peer.StreamIDsByTrafficClass[trafficClass] = append(peer.StreamIDsByTrafficClass[trafficClass], frame.StreamID)
}
r.peers[payload.VPNConnectionID] = peer
r.signalLocked()
r.mu.Unlock()
return true, nil
}
@@ -84,25 +119,93 @@ func (r *FabricSessionPacketPeerRegistry) TransportFor(vpnConnectionID string, i
return nil
}
return &FabricSessionPacketTransport{
Sender: fabricSessionFrameWriterAdapter{writer: peer.Sender},
Inbox: inbox,
StreamID: peer.StreamID,
StreamIDsByTrafficClass: copyStreamIDsByClass(peer.StreamIDsByTrafficClass),
VPNConnectionID: vpnConnectionID,
SendDirection: FabricDirectionGatewayToClient,
ReceiveDirection: FabricDirectionClientToGateway,
Sender: fabricSessionFrameWriterAdapter{writer: peer.Sender},
Inbox: inbox,
StreamID: peer.StreamID,
ServiceTunnel: peer.ServiceTunnel,
TunnelID: vpnConnectionID,
PoolID: peer.PoolID,
ServiceID: peer.ServiceID,
VPNConnectionID: vpnConnectionID,
SendDirection: FabricDirectionGatewayToClient,
ReceiveDirection: FabricDirectionClientToGateway,
}
}
// WaitTransportFor returns a transport for vpnConnectionID, waiting up to
// timeout for the peer to register when it is not yet known.
//
// With timeout <= 0 it performs a single lookup via TransportFor. Otherwise
// it re-checks the registry every time the registry signals a change and
// returns nil when ctx is cancelled or the timeout elapses first.
func (r *FabricSessionPacketPeerRegistry) WaitTransportFor(ctx context.Context, vpnConnectionID string, inbox *FabricPacketInbox, timeout time.Duration) PacketTransport {
	if timeout <= 0 {
		return r.TransportFor(vpnConnectionID, inbox)
	}
	timer := time.NewTimer(timeout)
	defer timer.Stop()
	for {
		// Capture the change channel BEFORE probing the registry. A
		// registration that lands after the probe then closes the channel we
		// are about to wait on. Fetching it after the probe would hand us the
		// fresh, unsignalled channel and the wakeup would be lost until the
		// timeout fires.
		changed := r.changedChannel()
		if transport := r.TransportFor(vpnConnectionID, inbox); transport != nil {
			return transport
		}
		select {
		case <-ctx.Done():
			return nil
		case <-timer.C:
			return nil
		case <-changed:
			// Registry changed; loop and look the peer up again.
		}
	}
}
// Forget removes the peer registered for vpnConnectionID, if any, and wakes
// every goroutine waiting for a registry change. It is a no-op on a nil
// registry or an empty id.
func (r *FabricSessionPacketPeerRegistry) Forget(vpnConnectionID string) {
	if r == nil {
		return
	}
	if vpnConnectionID == "" {
		return
	}

	r.mu.Lock()
	defer r.mu.Unlock()

	delete(r.peers, vpnConnectionID)
	// signalLocked creates the change channel itself when it is still nil.
	r.signalLocked()
}
// changedChannel returns the channel that will be closed on the next registry
// change, creating it on first use. A nil registry yields a nil channel,
// which blocks forever when received from.
func (r *FabricSessionPacketPeerRegistry) changedChannel() <-chan struct{} {
	if r == nil {
		return nil
	}

	r.mu.Lock()
	current := r.changed
	if current == nil {
		current = make(chan struct{})
		r.changed = current
	}
	r.mu.Unlock()

	return current
}
// signalLocked wakes all current waiters by closing the change channel and
// installs a fresh one for the next round of waiters. The "Locked" suffix
// reflects that the visible callers invoke it while holding r.mu.
func (r *FabricSessionPacketPeerRegistry) signalLocked() {
	if r == nil {
		return
	}
	// Nothing to close when no channel was ever handed out.
	if r.changed != nil {
		close(r.changed)
	}
	r.changed = make(chan struct{})
}
// SendGatewayPacketBatch sends packets to the peer currently registered for
// t.VPNConnectionID.
//
// If the peer is not registered yet, it waits up to t.PeerWaitTimeout (or
// defaultFabricSessionPeerWaitTimeout when that is <= 0) for it to appear.
// It returns mesh.ErrForwardRuntimeUnavailable when the transport is not
// fully configured or no peer shows up in time. When the send itself fails,
// the peer is dropped from the registry so the next send waits for a fresh
// registration, and the send error is returned.
func (t *FabricSessionPacketPeerTransport) SendGatewayPacketBatch(ctx context.Context, packets [][]byte) error {
	if t == nil || t.Registry == nil || t.Inbox == nil || t.VPNConnectionID == "" {
		return mesh.ErrForwardRuntimeUnavailable
	}
	waitTimeout := t.PeerWaitTimeout
	if waitTimeout <= 0 {
		waitTimeout = defaultFabricSessionPeerWaitTimeout
	}
	transport := t.Registry.WaitTransportFor(ctx, t.VPNConnectionID, t.Inbox, waitTimeout)
	if transport == nil {
		return mesh.ErrForwardRuntimeUnavailable
	}
	if err := transport.SendGatewayPacketBatch(ctx, packets); err != nil {
		t.Registry.Forget(t.VPNConnectionID)
		return err
	}
	return nil
}
func (t *FabricSessionPacketPeerTransport) ReceiveGatewayPacketBatch(ctx context.Context, timeout time.Duration) ([][]byte, error) {
@@ -126,9 +229,12 @@ func (t *FabricSessionPacketPeerTransport) Snapshot() map[string]any {
}
}
return map[string]any{
"transport": "fabric_session_peer_dynamic",
"vpn_connection_id": t.VPNConnectionID,
"peer_ready": ready == 1,
"transport": "fabric_session_peer_dynamic",
"tunnel_id": firstNonEmptyTunnelString(t.TunnelID, t.VPNConnectionID),
"pool_id": t.PoolID,
"service_id": t.ServiceID,
"vpn_connection_id_alias": t.VPNConnectionID,
"peer_ready": ready == 1,
}
}
@@ -142,8 +248,12 @@ func (r *FabricSessionPacketPeerRegistry) Snapshot() map[string]any {
items := make([]map[string]any, 0, len(r.peers))
for _, peer := range r.peers {
item := map[string]any{
"vpn_connection_id": peer.VPNConnectionID,
"stream_id": peer.StreamID,
"tunnel_id": firstNonEmptyTunnelString(peer.TunnelID, peer.VPNConnectionID),
"pool_id": peer.PoolID,
"service_id": peer.ServiceID,
"vpn_connection_id_alias": peer.VPNConnectionID,
"service_tunnel": peer.ServiceTunnel.Snapshot(),
"stream_id": peer.StreamID,
}
if !peer.RegisteredAt.IsZero() {
item["registered_at"] = peer.RegisteredAt.Format(time.RFC3339Nano)
@@ -31,6 +31,11 @@ type FabricSessionPacketTransport struct {
Inbox *FabricPacketInbox
StreamID uint64
ServiceStreams *FabricServiceStreamRegistry
ServiceTunnel FabricServiceTunnel
TunnelID string
PoolID string
ServiceID string
VPNConnectionID string
SendDirection string
ReceiveDirection string
@@ -39,6 +44,12 @@ type FabricSessionPacketTransport struct {
StreamIDsByTrafficClass map[string][]uint64
StreamIDs []uint64
routeMu sync.Mutex
routeLeaseID string
routeGeneration string
routeTransitionCount uint64
routeUpdatedAt time.Time
sequence uint64
sequenceMu sync.Mutex
sequenceByStream map[uint64]uint64
@@ -68,7 +79,12 @@ func (t *FabricSessionPacketTransport) SendGatewayPacketBatch(ctx context.Contex
if t == nil || t.Sender == nil {
return mesh.ErrForwardRuntimeUnavailable
}
if !t.hasSendStream() || t.VPNConnectionID == "" {
t.normalizeServiceTunnel()
packetTunnelID := t.packetTunnelID()
if t.VPNConnectionID == "" {
t.VPNConnectionID = packetTunnelID
}
if !t.hasSendStream() || packetTunnelID == "" {
return errors.New("fabric session packet transport identity is incomplete")
}
direction := t.SendDirection
@@ -77,12 +93,14 @@ func (t *FabricSessionPacketTransport) SendGatewayPacketBatch(ctx context.Contex
}
groups := t.groupPacketsByStream(packets)
for _, group := range groups {
t.registerServiceStream(group.StreamID, group.TrafficClass, direction)
frame, err := NewFabricVPNPacketDataFrame(FabricVPNPacketFrameInput{
StreamID: group.StreamID,
Sequence: t.nextSequence(group.StreamID),
VPNConnectionID: t.VPNConnectionID,
VPNConnectionID: packetTunnelID,
Direction: direction,
TrafficClass: group.TrafficClass,
ServiceTunnel: t.ServiceTunnel,
Packets: group.Packets,
})
if err != nil {
@@ -101,15 +119,17 @@ func (t *FabricSessionPacketTransport) ReceiveGatewayPacketBatch(ctx context.Con
if t == nil || t.Inbox == nil {
return nil, mesh.ErrForwardRuntimeUnavailable
}
t.normalizeServiceTunnel()
packetTunnelID := t.packetTunnelID()
direction := t.ReceiveDirection
if direction == "" {
direction = FabricDirectionClientToGateway
}
if packets, err := t.Inbox.Receive(ctx, t.VPNConnectionID, direction, 5*time.Millisecond); err != nil || len(packets) > 0 {
if packets, err := t.Inbox.Receive(ctx, packetTunnelID, direction, 5*time.Millisecond); err != nil || len(packets) > 0 {
return packets, err
}
if t.Receiver == nil {
return t.Inbox.Receive(ctx, t.VPNConnectionID, direction, timeout)
return t.Inbox.Receive(ctx, packetTunnelID, direction, timeout)
}
if timeout <= 0 {
timeout = 25 * time.Second
@@ -130,14 +150,14 @@ func (t *FabricSessionPacketTransport) ReceiveGatewayPacketBatch(ctx context.Con
continue
}
if err != nil {
if packets, receiveErr := t.Inbox.Receive(ctx, t.VPNConnectionID, direction, 100*time.Millisecond); receiveErr != nil || len(packets) > 0 {
if packets, receiveErr := t.Inbox.Receive(ctx, packetTunnelID, direction, 100*time.Millisecond); receiveErr != nil || len(packets) > 0 {
return packets, receiveErr
}
return nil, err
}
case frame, ok := <-frames:
if !ok {
return t.Inbox.Receive(ctx, t.VPNConnectionID, direction, 100*time.Millisecond)
return t.Inbox.Receive(ctx, packetTunnelID, direction, 100*time.Millisecond)
}
if frame.Type != fabricproto.FrameData || !t.acceptsStream(frame.StreamID) {
continue
@@ -146,7 +166,7 @@ func (t *FabricSessionPacketTransport) ReceiveGatewayPacketBatch(ctx context.Con
if err != nil {
return nil, err
}
if payload.VPNConnectionID == t.VPNConnectionID && payload.Direction == direction {
if payload.VPNConnectionID == packetTunnelID && payload.Direction == direction {
t.recordReceive(frame.StreamID, fabricSessionTrafficClassName(frame.TrafficClass), len(payload.Packets))
return cleanPacketBatch(payload.Packets), nil
}
@@ -222,7 +242,8 @@ func (t *FabricSessionPacketTransport) Close() error {
if t.closeErr == nil {
t.closeErr = err
}
} else if err == nil {
} else {
t.markServiceStreamClosed(streamID)
t.recordCloseStream()
}
}
@@ -334,7 +355,13 @@ func (t *FabricSessionPacketTransport) streamIDsForTrafficClass(trafficClass str
if ids := t.StreamIDsByTrafficClass[normalizeFabricTrafficClass(trafficClass)]; len(ids) > 0 {
return ids
}
if normalizeFabricTrafficClass(trafficClass) == FabricTrafficClassReliable {
switch normalizeFabricTrafficClass(trafficClass) {
case FabricTrafficClassDNS:
if ids := t.StreamIDsByTrafficClass[FabricTrafficClassReliable]; len(ids) > 0 {
return ids
}
return t.StreamIDsByTrafficClass[FabricTrafficClassBulk]
case FabricTrafficClassReliable:
return t.StreamIDsByTrafficClass[FabricTrafficClassBulk]
}
return nil
@@ -444,6 +471,7 @@ func (t *FabricSessionPacketTransport) Snapshot() map[string]any {
if t == nil {
return nil
}
t.normalizeServiceTunnel()
t.statsMu.Lock()
sendFramesByClass := copyStringUint64Map(t.sendFramesByClass)
sendPacketsByClass := copyStringUint64Map(t.sendPacketsByClass)
@@ -471,9 +499,23 @@ func (t *FabricSessionPacketTransport) Snapshot() map[string]any {
receivePacketsByStream[fmt.Sprintf("%d", streamID)] = count
}
t.statsMu.Unlock()
t.routeMu.Lock()
routeLeaseID := firstNonEmptyTunnelString(t.routeLeaseID, t.ServiceTunnel.RouteLeaseID)
routeGeneration := firstNonEmptyTunnelString(t.routeGeneration, t.ServiceTunnel.RouteGeneration)
routeTransitionCount := t.routeTransitionCount
routeUpdatedAt := t.routeUpdatedAt
t.routeMu.Unlock()
streamIDsByClass := copyStreamIDsByTrafficClass(t.StreamIDsByTrafficClass)
return map[string]any{
out := map[string]any{
"schema_version": "rap.vpn_fabric_session_packet_transport.v1",
"tunnel_id": t.packetTunnelID(),
"pool_id": t.PoolID,
"service_id": t.ServiceID,
"route_lease_id": routeLeaseID,
"route_generation": routeGeneration,
"route_transition_count": routeTransitionCount,
"vpn_connection_id_alias": t.VPNConnectionID,
"service_tunnel": t.ServiceTunnel.Snapshot(),
"stream_id": t.StreamID,
"stream_ids_by_class": streamIDsByClass,
"stream_class_count": len(streamIDsByClass),
@@ -495,6 +537,92 @@ func (t *FabricSessionPacketTransport) Snapshot() map[string]any {
"receive_frames_by_stream_id": receiveFramesByStream,
"receive_packets_by_stream_id": receivePacketsByStream,
}
if t.ServiceStreams != nil {
out["service_stream_registry"] = t.ServiceStreams.Snapshot()
out["service_streams"] = serviceStreamsSnapshotItems(t.ServiceStreams.StreamsForTunnel(t.packetTunnelID()))
}
if !routeUpdatedAt.IsZero() {
out["route_updated_at"] = routeUpdatedAt.UTC().Format(time.RFC3339Nano)
}
return out
}
// UpdateServiceTunnel replaces the transport's service tunnel with tunnel
// (normalized against the current tunnel id) and records the new route lease
// id and generation.
//
// It reports whether the route lease id or route generation differs from the
// previous values; when it does, the route transition counter is incremented
// and the update time is stamped. An attempt to switch to a different,
// non-empty tunnel id is rejected with an error and leaves the transport
// untouched. A nil transport yields mesh.ErrForwardRuntimeUnavailable.
func (t *FabricSessionPacketTransport) UpdateServiceTunnel(tunnel FabricServiceTunnel) (bool, error) {
	if t == nil {
		return false, mesh.ErrForwardRuntimeUnavailable
	}

	// Take routeMu before reading the current tunnel id: t.ServiceTunnel is
	// written below under this mutex, so reading it outside the lock races
	// with a concurrent update.
	t.routeMu.Lock()
	defer t.routeMu.Unlock()

	currentID := t.packetTunnelID()
	tunnel = NormalizeServiceTunnel(tunnel, currentID)
	if currentID != "" && tunnel.TunnelID != "" && tunnel.TunnelID != currentID {
		return false, fmt.Errorf("service tunnel id changed from %q to %q", currentID, tunnel.TunnelID)
	}

	previousLeaseID := firstNonEmptyTunnelString(t.routeLeaseID, t.ServiceTunnel.RouteLeaseID)
	previousGeneration := firstNonEmptyTunnelString(t.routeGeneration, t.ServiceTunnel.RouteGeneration)
	changed := previousLeaseID != tunnel.RouteLeaseID || previousGeneration != tunnel.RouteGeneration

	t.ServiceTunnel = tunnel
	// An already-set TunnelID wins; pool and service ids prefer the new tunnel.
	t.TunnelID = firstNonEmptyTunnelString(t.TunnelID, tunnel.TunnelID)
	t.PoolID = firstNonEmptyTunnelString(tunnel.PoolID, t.PoolID)
	t.ServiceID = firstNonEmptyTunnelString(tunnel.ServiceID, t.ServiceID)
	t.routeLeaseID = tunnel.RouteLeaseID
	t.routeGeneration = tunnel.RouteGeneration
	if changed {
		t.routeTransitionCount++
		t.routeUpdatedAt = time.Now().UTC()
	}
	return changed, nil
}
// normalizeServiceTunnel brings the transport's identity fields into a
// consistent state: the service tunnel is normalized using the first
// non-empty of its own tunnel id, t.TunnelID and t.VPNConnectionID; empty
// TunnelID, PoolID and ServiceID fields are filled from the tunnel; and the
// cached route lease id and generation are seeded from the tunnel when they
// are still empty.
//
// The whole update runs under routeMu because UpdateServiceTunnel and
// Snapshot access the same fields under that mutex. The original only locked
// the route-field section, leaving the t.ServiceTunnel write unguarded.
// Callers must not hold routeMu (that was already required, since the method
// has always acquired it).
func (t *FabricSessionPacketTransport) normalizeServiceTunnel() {
	if t == nil {
		return
	}

	t.routeMu.Lock()
	defer t.routeMu.Unlock()

	fallbackID := firstNonEmptyTunnelString(t.ServiceTunnel.TunnelID, t.TunnelID, t.VPNConnectionID)
	t.ServiceTunnel = NormalizeServiceTunnel(t.ServiceTunnel, fallbackID)
	t.TunnelID = firstNonEmptyTunnelString(t.TunnelID, t.ServiceTunnel.TunnelID)
	t.PoolID = firstNonEmptyTunnelString(t.PoolID, t.ServiceTunnel.PoolID)
	t.ServiceID = firstNonEmptyTunnelString(t.ServiceID, t.ServiceTunnel.ServiceID)
	if t.routeLeaseID == "" {
		t.routeLeaseID = t.ServiceTunnel.RouteLeaseID
	}
	if t.routeGeneration == "" {
		t.routeGeneration = t.ServiceTunnel.RouteGeneration
	}
}
// packetTunnelID returns the identifier stamped on outgoing packet frames and
// matched on incoming ones: the first non-empty value among the service
// tunnel's id, t.TunnelID and t.VPNConnectionID. A nil transport yields "".
func (t *FabricSessionPacketTransport) packetTunnelID() string {
	if t != nil {
		return firstNonEmptyTunnelString(t.ServiceTunnel.TunnelID, t.TunnelID, t.VPNConnectionID)
	}
	return ""
}
// registerServiceStream records streamID in the optional service stream
// registry, tagged with the transport's tunnel, service id, traffic class and
// direction, and marked as belonging to the "vpn" adapter. It does nothing
// when the transport is nil, no registry is attached, or streamID is zero.
func (t *FabricSessionPacketTransport) registerServiceStream(streamID uint64, trafficClass string, direction string) {
	if t == nil {
		return
	}
	if t.ServiceStreams == nil || streamID == 0 {
		return
	}

	// Normalize first so the tunnel and service ids below are fully populated.
	t.normalizeServiceTunnel()

	stream := FabricServiceStream{
		TunnelID:      t.packetTunnelID(),
		ServiceID:     t.ServiceID,
		StreamID:      streamID,
		TrafficClass:  trafficClass,
		Direction:     direction,
		ServiceTunnel: t.ServiceTunnel,
		Metadata:      map[string]string{"adapter": "vpn"},
	}
	t.ServiceStreams.Register(stream)
}
// markServiceStreamClosed marks streamID as closed in the optional service
// stream registry under the transport's packet tunnel id. It does nothing
// when the transport is nil, no registry is attached, or streamID is zero.
func (t *FabricSessionPacketTransport) markServiceStreamClosed(streamID uint64) {
	if t == nil {
		return
	}
	if registry := t.ServiceStreams; registry != nil && streamID != 0 {
		registry.MarkClosed(t.packetTunnelID(), streamID)
	}
}
func (t *FabricSessionPacketTransport) recordCloseStream() {
@@ -516,12 +644,9 @@ func (t *FabricSessionPacketTransport) recordCloseError() {
}
func fabricSessionTrafficClassForPackets(fallback string, packets [][]byte) string {
if fallback = normalizeFabricTrafficClass(fallback); fallback != "" && fallback != FabricTrafficClassBulk {
if fallback = normalizeFabricTrafficClass(fallback); fallback != "" {
return fallback
}
if batchHasTCPControlPacket(packets) {
return FabricTrafficClassInteractive
}
return FabricTrafficClassBulk
}
@@ -35,6 +35,9 @@ type FabricPacketTransport struct {
Inbox *FabricPacketInbox
ClusterID string
TunnelID string
PoolID string
ServiceID string
VPNConnectionID string
RouteID string
LocalNodeID string
@@ -46,16 +49,16 @@ type FabricPacketTransport struct {
}
type FabricClientPacketIngress struct {
ForwardTransport mesh.ProductionForwardTransport
Inbox *FabricPacketInbox
Routes func() []mesh.SyntheticRoute
LocalGateway func(vpnConnectionID string) bool
AllowLegacyLocalGatewayFallback bool
FlowScheduler *FabricFlowScheduler
MaxParallelFlowSends int
RecoveryPolicyFingerprint string
AdaptivePolicyFingerprint string
PreventLastRouteWithdrawal bool
ForwardTransport mesh.ProductionForwardTransport
Inbox *FabricPacketInbox
Routes func() []mesh.SyntheticRoute
LocalGateway func(vpnConnectionID string) bool
AllowLocalGatewayBypass bool
FlowScheduler *FabricFlowScheduler
MaxParallelFlowSends int
RecoveryPolicyFingerprint string
AdaptivePolicyFingerprint string
PreventLastRouteWithdrawal bool
ClusterID string
LocalNodeID string
@@ -159,6 +162,7 @@ type FabricServiceChannelAdaptivePolicy struct {
const (
FabricTrafficClassControl = "control"
FabricTrafficClassDNS = "dns"
FabricTrafficClassInteractive = "interactive"
FabricTrafficClassReliable = "reliable"
FabricTrafficClassBulk = "bulk"
@@ -370,6 +374,7 @@ func defaultFabricServiceChannelAdaptivePolicy() FabricServiceChannelAdaptivePol
QueuePressureMaxInFlight: defaultFabricFlowParallelSendWindow * 4,
ClassWindows: map[string]int{
FabricTrafficClassControl: defaultFabricFlowParallelSendWindow,
FabricTrafficClassDNS: defaultFabricFlowParallelSendWindow,
FabricTrafficClassInteractive: defaultFabricFlowParallelSendWindow,
FabricTrafficClassReliable: 6,
FabricTrafficClassBulk: 4,
@@ -399,6 +404,7 @@ func normalizeFabricServiceChannelAdaptivePolicy(policy FabricServiceChannelAdap
}
defaults := map[string]int{
FabricTrafficClassControl: policy.MaxParallelWindow,
FabricTrafficClassDNS: policy.MaxParallelWindow,
FabricTrafficClassInteractive: policy.MaxParallelWindow,
FabricTrafficClassReliable: minPositive(policy.MaxParallelWindow, 6),
FabricTrafficClassBulk: minPositive(policy.MaxParallelWindow, 4),
@@ -466,7 +472,7 @@ func (s *FabricFlowScheduler) scheduleClientPackets(vpnConnectionID string, traf
FlowID: flowID,
Shard: shard,
TrafficClass: trafficClass,
Classifier: "ip_5tuple_or_packet_hash",
Classifier: "opaque_packet_hash",
ServiceMode: "application_protocol_agnostic",
}
grouped[channelID] = batch
@@ -1277,6 +1283,8 @@ func normalizeFabricTrafficClass(value string) string {
switch strings.TrimSpace(strings.ToLower(value)) {
case FabricTrafficClassControl:
return FabricTrafficClassControl
case FabricTrafficClassDNS:
return FabricTrafficClassDNS
case FabricTrafficClassInteractive:
return FabricTrafficClassInteractive
case FabricTrafficClassReliable:
@@ -1294,16 +1302,18 @@ func fabricTrafficClassPriority(value string) int {
switch normalizeFabricTrafficClass(value) {
case FabricTrafficClassControl:
return 0
case FabricTrafficClassInteractive:
case FabricTrafficClassDNS:
return 1
case FabricTrafficClassReliable:
case FabricTrafficClassInteractive:
return 2
case FabricTrafficClassReliable:
return 3
case FabricTrafficClassBulk:
return 3
case FabricTrafficClassDroppable:
return 4
case FabricTrafficClassDroppable:
return 5
default:
return 3
return 4
}
}
@@ -1932,7 +1942,7 @@ func (i *FabricClientPacketIngress) ReceiveClientPacketBatch(ctx context.Context
}
func (i *FabricClientPacketIngress) localGatewayReady(vpnConnectionID string) bool {
if i == nil || !i.AllowLegacyLocalGatewayFallback || i.inbox() == nil || vpnConnectionID == "" {
if i == nil || !i.AllowLocalGatewayBypass || i.inbox() == nil || vpnConnectionID == "" {
return false
}
localGateway := i.localGateway()
@@ -2224,9 +2234,6 @@ func (i *FabricPacketInbox) Receive(ctx context.Context, vpnConnectionID, direct
func (i *FabricPacketInbox) enqueue(payload mesh.VPNPacketBatchPayload) error {
queue := i.queue(payload.VPNConnectionID, payload.Direction)
target := queue.normal
if payload.Direction == FabricDirectionGatewayToClient && batchHasTCPControlPacket(payload.Packets) {
target = queue.priority
}
select {
case target <- payload:
default:
@@ -2256,15 +2263,6 @@ func (i *FabricPacketInbox) queue(vpnConnectionID, direction string) *fabricPack
return queue
}
func batchHasTCPControlPacket(packets [][]byte) bool {
for _, packet := range packets {
if isTCPControlPacket(packet) {
return true
}
}
return false
}
func maxInt(a, b int) int {
if a > b {
return a
@@ -2976,7 +2974,7 @@ func classifyPacketFlow(packet []byte, shardCount int) (string, int) {
if shardCount <= 0 {
shardCount = defaultFabricFlowShardCount
}
key := packetFlowKey(packet)
key := packetHashFlowKey("opaque", packet)
hash := fnv.New32a()
_, _ = hash.Write([]byte(key))
shard := int(hash.Sum32() % uint32(shardCount))
@@ -234,6 +234,7 @@ func TestFabricSessionPacketTransportSendsDataFrame(t *testing.T) {
}
func TestFabricSessionPacketTransportShardsStreamsByTrafficClass(t *testing.T) {
t.Skip("retired: base VPN fabric channel is opaque and no longer classifies TCP control packets")
sender := &captureFabricSessionSender{}
transport := &FabricSessionPacketTransport{
Sender: sender,
@@ -284,7 +285,245 @@ func TestFabricSessionPacketTransportShardsStreamsByTrafficClass(t *testing.T) {
}
}
// TestFabricSessionPacketTransportUsesTunnelIDAsServiceIdentity sends one packet
// through a transport whose TunnelID ("fabric-tunnel-1") differs from its
// VPNConnectionID ("legacy-vpn-1") and checks that:
//   - exactly one frame is sent and it decodes as a VPN packet data frame;
//   - the decoded payload identifies itself by the tunnel ID and carries no
//     pool or service ID;
//   - Snapshot exposes the tunnel ID, the VPN connection ID as an alias, and
//     the default transport owner / route visibility in "service_tunnel".
func TestFabricSessionPacketTransportUsesTunnelIDAsServiceIdentity(t *testing.T) {
sender := &captureFabricSessionSender{}
transport := &FabricSessionPacketTransport{
Sender: sender,
StreamID: 700,
TunnelID: "fabric-tunnel-1",
VPNConnectionID: "legacy-vpn-1",
SendDirection: FabricDirectionClientToGateway,
ServiceTunnel: FabricServiceTunnel{
TunnelID: "fabric-tunnel-1",
PoolID: "ipv4-egress",
ServiceID: "svc-vpn-1",
ServiceKind: "ipv4-tunnel",
ServiceClass: "vpn_packets",
},
}
if err := transport.SendGatewayPacketBatch(context.Background(), [][]byte{[]byte("packet")}); err != nil {
t.Fatalf("send packet: %v", err)
}
if len(sender.frames) != 1 {
t.Fatalf("sent frames = %d, want 1", len(sender.frames))
}
payload, err := DecodeFabricVPNPacketDataFrame(sender.frames[0])
if err != nil {
t.Fatalf("decode payload: %v", err)
}
// The payload's VPNConnectionID field is expected to hold the tunnel ID,
// not the "legacy-vpn-1" value configured on the transport.
if payload.VPNConnectionID != "fabric-tunnel-1" {
t.Fatalf("payload tunnel identity = %q, want fabric-tunnel-1", payload.VPNConnectionID)
}
// Pool and service IDs from ServiceTunnel must not appear in the data frame.
if payload.TunnelID != "fabric-tunnel-1" || payload.PoolID != "" || payload.ServiceID != "" {
t.Fatalf("hot data frame should carry only tunnel identity, got %+v", payload)
}
snapshot := transport.Snapshot()
if snapshot["tunnel_id"] != "fabric-tunnel-1" || snapshot["vpn_connection_id_alias"] != "legacy-vpn-1" {
t.Fatalf("snapshot should expose tunnel id and legacy alias: %+v", snapshot)
}
// The test did not set an owner or route visibility, so the snapshot is
// expected to report the package defaults.
serviceTunnel, ok := snapshot["service_tunnel"].(map[string]any)
if !ok || serviceTunnel["transport_owner"] != DefaultFabricTransportOwner || serviceTunnel["route_visibility"] != DefaultFabricRouteVisibility {
t.Fatalf("service tunnel snapshot missing fabric ownership: %+v", snapshot["service_tunnel"])
}
}
// TestFabricSessionPacketTransportUsesOpaqueBulkChannelForPacketContents sends a
// batch holding a UDP packet addressed to port 53 and a TCP packet with a flag
// byte set, through a transport that has separate stream IDs configured for the
// reliable, interactive and bulk classes. It expects a single frame on the bulk
// stream (901) containing both packets, i.e. the packet contents do not
// influence stream or class selection.
func TestFabricSessionPacketTransportUsesOpaqueBulkChannelForPacketContents(t *testing.T) {
sender := &captureFabricSessionSender{}
transport := &FabricSessionPacketTransport{
Sender: sender,
TunnelID: "fabric-tunnel-1",
VPNConnectionID: "legacy-vpn-1",
SendDirection: FabricDirectionClientToGateway,
StreamIDsByTrafficClass: map[string][]uint64{
FabricTrafficClassReliable: []uint64{701},
FabricTrafficClassInteractive: []uint64{801},
FabricTrafficClassBulk: []uint64{901},
},
}
dns := testDNSIPv4PacketForFabricRuntime()
tcpControl := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51001, 3389)
// Presumably byte 33 is the TCP flags byte (20-byte IPv4 header + offset 13)
// and 0x02 marks SYN; testIPv4TCPPacket is not shown here, so confirm.
tcpControl[33] = 0x02
if err := transport.SendGatewayPacketBatch(context.Background(), [][]byte{dns, tcpControl}); err != nil {
t.Fatalf("send opaque packets: %v", err)
}
if len(sender.frames) != 1 {
t.Fatalf("frames = %d, want one opaque bulk frame", len(sender.frames))
}
if sender.frames[0].TrafficClass != fabricproto.TrafficClassBulk || sender.frames[0].StreamID != 901 {
t.Fatalf("opaque packets should use bulk stream without protocol analysis: %+v", sender.frames[0])
}
payload, err := DecodeFabricVPNPacketDataFrame(sender.frames[0])
if err != nil {
t.Fatalf("decode opaque frame: %v", err)
}
// Both packets must travel together in the one frame; the batch is not split.
if len(payload.Packets) != 2 {
t.Fatalf("opaque frame packets = %d, want 2", len(payload.Packets))
}
}
// TestFabricSessionPacketPeerRegistryKeepsServiceTunnelFromHello registers a
// session hello frame that carries a fully populated FabricServiceTunnel and
// checks that the registry snapshot lists exactly one peer whose
// "service_tunnel" entry preserves the pool, service, route lease and route
// generation from the hello, and reports the default route visibility.
//
// Type assertions on snapshot values use the comma-ok form so that a snapshot
// shape change produces a normal test failure instead of a panic that aborts
// the whole test binary.
func TestFabricSessionPacketPeerRegistryKeepsServiceTunnelFromHello(t *testing.T) {
	registry := NewFabricSessionPacketPeerRegistry()
	sender := &recordingFrameSender{}
	frame, err := NewFabricVPNSessionHelloFrame(FabricVPNPacketFrameInput{
		StreamID:        711,
		VPNConnectionID: "fabric-tunnel-1",
		Direction:       FabricDirectionClientToGateway,
		TrafficClass:    FabricTrafficClassInteractive,
		ServiceTunnel: FabricServiceTunnel{
			TunnelID:        "fabric-tunnel-1",
			PoolID:          "home-ipv4",
			ServiceID:       "svc-vpn-1",
			ServiceKind:     "ipv4-tunnel",
			ServiceClass:    "vpn_packets",
			ServiceRole:     "ipv4-egress",
			RouteLeaseID:    "lease-1",
			RouteGeneration: "route-gen-1",
		},
	})
	if err != nil {
		t.Fatalf("hello frame: %v", err)
	}
	handled, err := registry.RegisterFrame(context.Background(), sender, frame)
	if err != nil || !handled {
		t.Fatalf("register hello handled=%v err=%v", handled, err)
	}
	snapshot := registry.Snapshot()
	peers, ok := snapshot["peers"].([]map[string]any)
	if !ok || len(peers) != 1 {
		t.Fatalf("peers = %+v", snapshot["peers"])
	}
	serviceTunnel, ok := peers[0]["service_tunnel"].(map[string]any)
	if !ok {
		t.Fatalf("peer service tunnel missing or of unexpected type: %+v", peers[0]["service_tunnel"])
	}
	if serviceTunnel["pool_id"] != "home-ipv4" ||
		serviceTunnel["service_id"] != "svc-vpn-1" ||
		serviceTunnel["route_visibility"] != DefaultFabricRouteVisibility ||
		serviceTunnel["route_lease_id"] != "lease-1" ||
		serviceTunnel["route_generation"] != "route-gen-1" {
		t.Fatalf("peer service tunnel not preserved: %+v", serviceTunnel)
	}
}
// TestFabricSessionPacketTransportRegistersServiceStreams is retired: it calls
// t.Skip unconditionally as its first statement, so nothing below that call
// runs. The remaining body is the former scenario, which sent one TCP packet
// and expected it to register an open interactive service stream (ID 801) that
// is reported in Snapshot and marked closed by transport.Close.
func TestFabricSessionPacketTransportRegistersServiceStreams(t *testing.T) {
t.Skip("retired: base VPN fabric channel is opaque and no longer derives service stream class from packet contents")
// Unreachable from here on; t.Skip stops the test.
sender := &captureFabricSessionSender{}
registry := NewFabricServiceStreamRegistry()
transport := &FabricSessionPacketTransport{
Sender: sender,
TunnelID: "fabric-tunnel-1",
VPNConnectionID: "legacy-vpn-1",
ServiceID: "svc-vpn-1",
SendDirection: FabricDirectionClientToGateway,
ServiceStreams: registry,
StreamIDsByTrafficClass: map[string][]uint64{
FabricTrafficClassInteractive: []uint64{801},
FabricTrafficClassBulk: []uint64{901},
},
ServiceTunnel: FabricServiceTunnel{
TunnelID: "fabric-tunnel-1",
PoolID: "ipv4-egress",
ServiceID: "svc-vpn-1",
ServiceKind: "ipv4-tunnel",
ServiceClass: "vpn_packets",
},
}
packet := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51001, 3389)
packet[33] = 0x02
if err := transport.SendGatewayPacketBatch(context.Background(), [][]byte{packet}); err != nil {
t.Fatalf("send packet: %v", err)
}
streams := registry.StreamsForTunnel("fabric-tunnel-1")
if len(streams) != 1 {
t.Fatalf("registered streams = %+v, want one", streams)
}
if streams[0].StreamID != 801 ||
streams[0].TrafficClass != FabricTrafficClassInteractive ||
streams[0].ServiceID != "svc-vpn-1" ||
streams[0].State != FabricServiceStreamStateOpen {
t.Fatalf("unexpected service stream: %+v", streams[0])
}
snapshot := transport.Snapshot()
serviceStreams, ok := snapshot["service_streams"].([]map[string]any)
if !ok || len(serviceStreams) != 1 || serviceStreams[0]["stream_id"] != uint64(801) {
t.Fatalf("transport snapshot missing service streams: %+v", snapshot["service_streams"])
}
if err := transport.Close(); err != nil {
t.Fatalf("close transport: %v", err)
}
streams = registry.StreamsForTunnel("fabric-tunnel-1")
if len(streams) != 1 || streams[0].State != FabricServiceStreamStateClosed {
t.Fatalf("service stream not closed with transport: %+v", streams)
}
}
// TestFabricSessionPacketTransportUpdatesRouteLeaseWithoutChangingTunnel checks
// UpdateServiceTunnel in two cases:
//   - same tunnel ID with a new route lease and generation: the call reports a
//     change, and Snapshot shows the new lease/generation, the unchanged
//     tunnel ID, and a route transition count of 1;
//   - a different tunnel ID: the call must return an error.
func TestFabricSessionPacketTransportUpdatesRouteLeaseWithoutChangingTunnel(t *testing.T) {
transport := &FabricSessionPacketTransport{
TunnelID: "fabric-tunnel-1",
ServiceTunnel: FabricServiceTunnel{
TunnelID: "fabric-tunnel-1",
PoolID: "home-ipv4",
ServiceID: "svc-vpn-1",
RouteLeaseID: "lease-1",
RouteGeneration: "route-gen-1",
},
}
// Only RouteLeaseID and RouteGeneration differ from the initial tunnel.
changed, err := transport.UpdateServiceTunnel(FabricServiceTunnel{
TunnelID: "fabric-tunnel-1",
PoolID: "home-ipv4",
ServiceID: "svc-vpn-1",
RouteLeaseID: "lease-2",
RouteGeneration: "route-gen-2",
})
if err != nil || !changed {
t.Fatalf("update service tunnel changed=%v err=%v", changed, err)
}
snapshot := transport.Snapshot()
if snapshot["tunnel_id"] != "fabric-tunnel-1" ||
snapshot["route_lease_id"] != "lease-2" ||
snapshot["route_generation"] != "route-gen-2" ||
snapshot["route_transition_count"] != uint64(1) {
t.Fatalf("route lease update not reflected without tunnel change: %+v", snapshot)
}
// Switching to another tunnel ID on an existing transport must be rejected.
if _, err := transport.UpdateServiceTunnel(FabricServiceTunnel{TunnelID: "other-tunnel"}); err == nil {
t.Fatal("expected changing tunnel id to be rejected")
}
}
// TestFabricSessionPacketTransportRoutesDNSOnReliableClass is retired: it calls
// t.Skip unconditionally as its first statement, so nothing below that call
// runs. The remaining body is the former scenario, which expected a DNS packet
// to be sent on the reliable stream (701) and tracked in the service stream
// registry under the DNS traffic class.
func TestFabricSessionPacketTransportRoutesDNSOnReliableClass(t *testing.T) {
t.Skip("retired: base VPN fabric channel is opaque and no longer detects DNS packets")
// Unreachable from here on; t.Skip stops the test.
sender := &captureFabricSessionSender{}
registry := NewFabricServiceStreamRegistry()
transport := &FabricSessionPacketTransport{
Sender: sender,
TunnelID: "fabric-tunnel-1",
VPNConnectionID: "legacy-vpn-1",
SendDirection: FabricDirectionClientToGateway,
ServiceStreams: registry,
StreamIDsByTrafficClass: map[string][]uint64{
FabricTrafficClassReliable: []uint64{701},
FabricTrafficClassBulk: []uint64{901},
},
}
if err := transport.SendGatewayPacketBatch(context.Background(), [][]byte{testDNSIPv4PacketForFabricRuntime()}); err != nil {
t.Fatalf("send dns packet: %v", err)
}
if len(sender.frames) != 1 {
t.Fatalf("frames = %d, want 1", len(sender.frames))
}
if sender.frames[0].StreamID != 701 || sender.frames[0].TrafficClass != fabricproto.TrafficClassReliable {
t.Fatalf("dns packet should use reliable stream: %+v", sender.frames[0])
}
streams := registry.StreamsForTunnel("fabric-tunnel-1")
if len(streams) != 1 || streams[0].TrafficClass != FabricTrafficClassDNS {
t.Fatalf("dns service stream not tracked separately: %+v", streams)
}
}
func TestFabricSessionPacketTransportSplitsMixedBatchByStream(t *testing.T) {
t.Skip("retired: base VPN fabric channel is opaque and no longer splits batches by packet protocol")
sender := &captureFabricSessionSender{}
transport := &FabricSessionPacketTransport{
Sender: sender,
@@ -470,15 +709,91 @@ func TestFabricSessionPacketPeerTransportSendsReplyToLatestRegisteredPeer(t *tes
}
}
// TestFabricSessionPacketPeerTransportForgetsClosedPeerAndRebinds checks that a
// peer whose sender fails is dropped from the registry, and that a later send
// succeeds once a new peer registers for the same VPN connection while the
// send is in progress.
//
// The replacement peer is registered from a background goroutine. That
// goroutine must not call t.Fatalf (FailNow is only valid on the goroutine
// running the test), so it builds and registers the hello frame inline and
// reports the outcome through a buffered channel that the test goroutine
// inspects.
func TestFabricSessionPacketPeerTransportForgetsClosedPeerAndRebinds(t *testing.T) {
	inbox := NewFabricPacketInbox(4)
	registry := NewFabricSessionPacketPeerRegistry()
	firstSender := &recordingFrameSender{err: errors.New("closed")}
	registerFabricSessionPeerForTest(t, registry, firstSender, "vpn-1", 7)
	transport := &FabricSessionPacketPeerTransport{
		Registry:        registry,
		Inbox:           inbox,
		VPNConnectionID: "vpn-1",
		PeerWaitTimeout: 250 * time.Millisecond,
	}
	if err := transport.SendGatewayPacketBatch(context.Background(), [][]byte{[]byte("reply-1")}); err == nil {
		t.Fatal("send through closed peer succeeded")
	}
	if ready := registry.TransportFor("vpn-1", inbox); ready != nil {
		t.Fatal("closed peer remained registered")
	}
	secondSender := &recordingFrameSender{}
	// Buffered so the goroutine can always deliver its result and exit, even
	// if the test has already failed for another reason.
	rebindErr := make(chan error, 1)
	go func() {
		time.Sleep(25 * time.Millisecond)
		frame, err := NewFabricVPNSessionHelloFrame(FabricVPNPacketFrameInput{
			StreamID:        11,
			VPNConnectionID: "vpn-1",
			Direction:       FabricDirectionClientToGateway,
		})
		if err != nil {
			rebindErr <- err
			return
		}
		handled, err := registry.RegisterFrame(context.Background(), secondSender, frame)
		if err == nil && !handled {
			err = errors.New("hello frame was not handled by the registry")
		}
		rebindErr <- err
	}()
	sendErr := transport.SendGatewayPacketBatch(context.Background(), [][]byte{[]byte("reply-2")})
	// Report a registration failure first: it is the root cause of any
	// subsequent send failure.
	if err := <-rebindErr; err != nil {
		t.Fatalf("register rebound peer: %v", err)
	}
	if sendErr != nil {
		t.Fatalf("send after peer rebind: %v", sendErr)
	}
	if len(secondSender.frames) != 1 {
		t.Fatalf("second sender frames = %d, want 1", len(secondSender.frames))
	}
	payload, err := DecodeFabricVPNPacketDataFrame(secondSender.frames[0])
	if err != nil {
		t.Fatalf("decode rebound reply: %v", err)
	}
	if string(payload.Packets[0]) != "reply-2" {
		t.Fatalf("rebound payload = %+v", payload)
	}
}
// TestFabricSessionPacketPeerTransportFailsFastWithoutPeer sends through a peer
// transport whose registry has no peer for the connection. With a 20ms peer
// wait timeout the send must return an error, and must do so within 250ms.
func TestFabricSessionPacketPeerTransportFailsFastWithoutPeer(t *testing.T) {
	transport := &FabricSessionPacketPeerTransport{
		Registry:        NewFabricSessionPacketPeerRegistry(),
		Inbox:           NewFabricPacketInbox(4),
		VPNConnectionID: "vpn-1",
		PeerWaitTimeout: 20 * time.Millisecond,
	}
	batch := [][]byte{[]byte("reply")}
	began := time.Now()
	if sendErr := transport.SendGatewayPacketBatch(context.Background(), batch); sendErr == nil {
		t.Fatal("send without peer succeeded")
	}
	// Upper bound well above the configured wait, to tolerate scheduler jitter.
	elapsed := time.Since(began)
	if elapsed > 250*time.Millisecond {
		t.Fatalf("send without peer took %s, want fast failure", elapsed)
	}
}
// recordingFrameSender is a test double for a fabric frame sender: it either
// fails every send with a preset error or records the frames it is given.
type recordingFrameSender struct {
// err, when non-nil, is returned by every SendFrame call and no frame is recorded.
err error
// frames holds the frames accepted by SendFrame, in call order.
frames []fabricproto.Frame
}
// SendFrame records frame unless the sender was configured with an error, in
// which case that error is returned and nothing is recorded. The context is
// ignored.
func (s *recordingFrameSender) SendFrame(_ context.Context, frame fabricproto.Frame) error {
	if s.err == nil {
		s.frames = append(s.frames, frame)
	}
	return s.err
}
// registerFabricSessionPeerForTest builds a client-to-gateway session hello
// frame for the given VPN connection and stream, and registers it with the
// registry on behalf of sender. The test fails immediately if the frame cannot
// be built, if registration returns an error, or if the registry reports the
// frame as not handled.
func registerFabricSessionPeerForTest(t *testing.T, registry *FabricSessionPacketPeerRegistry, sender FabricSessionFrameWriter, vpnConnectionID string, streamID uint64) {
	t.Helper()
	helloInput := FabricVPNPacketFrameInput{
		StreamID:        streamID,
		VPNConnectionID: vpnConnectionID,
		Direction:       FabricDirectionClientToGateway,
	}
	hello, buildErr := NewFabricVPNSessionHelloFrame(helloInput)
	if buildErr != nil {
		t.Fatalf("hello frame: %v", buildErr)
	}
	handled, registerErr := registry.RegisterFrame(context.Background(), sender, hello)
	if handled && registerErr == nil {
		return
	}
	t.Fatalf("register peer handled=%v err=%v", handled, registerErr)
}
func TestFabricSessionPacketTransportReceiveReadsPumpFrames(t *testing.T) {
inbox := NewFabricPacketInbox(4)
receiver := memoryFabricSessionReceiver{
@@ -684,7 +999,7 @@ func TestFabricPacketInboxReceivesFabricSessionFrame(t *testing.T) {
}
}
func TestFabricVPNPacketDataFrameInfersInteractiveTCPControl(t *testing.T) {
func TestFabricVPNPacketDataFrameKeepsExplicitBulkForTCPControlContents(t *testing.T) {
packet := testIPv4TCPPacket([4]byte{192, 168, 200, 95}, [4]byte{10, 77, 0, 2}, 3389, 57032)
packet[33] = 0x12
frame, err := NewFabricVPNPacketDataFrame(FabricVPNPacketFrameInput{
@@ -698,12 +1013,13 @@ func TestFabricVPNPacketDataFrameInfersInteractiveTCPControl(t *testing.T) {
if err != nil {
t.Fatalf("new fabric vpn frame: %v", err)
}
if frame.TrafficClass != fabricproto.TrafficClassInteractive {
t.Fatalf("traffic class = %v, want interactive", frame.TrafficClass)
if frame.TrafficClass != fabricproto.TrafficClassBulk {
t.Fatalf("traffic class = %v, want opaque bulk", frame.TrafficClass)
}
}
func TestFabricPacketInboxPrioritizesGatewayTCPControlPackets(t *testing.T) {
t.Skip("retired: base VPN fabric channel preserves arrival order and no longer prioritizes TCP control packets")
inbox := NewFabricPacketInbox(4)
normal := testIPv4TCPPacket([4]byte{185, 16, 148, 89}, [4]byte{10, 77, 0, 2}, 443, 56000)
priority := testIPv4TCPPacket([4]byte{192, 168, 200, 95}, [4]byte{10, 77, 0, 2}, 3389, 57032)
@@ -726,6 +1042,7 @@ func TestFabricPacketInboxPrioritizesGatewayTCPControlPackets(t *testing.T) {
}
func TestFabricPacketInboxWaitsBrieflyForGatewayTCPControlPackets(t *testing.T) {
t.Skip("retired: base VPN fabric channel preserves arrival order and no longer waits for TCP control packets")
inbox := NewFabricPacketInbox(4)
normal := testIPv4TCPPacket([4]byte{185, 16, 148, 89}, [4]byte{10, 77, 0, 2}, 443, 56000)
priority := testIPv4TCPPacket([4]byte{192, 168, 200, 95}, [4]byte{10, 77, 0, 2}, 3389, 57032)
@@ -774,6 +1091,7 @@ func TestLocalPacketTransportUsesFabricInboxDirections(t *testing.T) {
}
func TestFabricFlowSchedulerKeepsReverseFiveTupleTogether(t *testing.T) {
t.Skip("retired: base VPN fabric channel uses opaque packet sharding instead of inspecting 5-tuples")
scheduler := NewFabricFlowScheduler(8, 8)
forward := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 3389)
reverse := testIPv4TCPPacket([4]byte{192, 168, 200, 95}, [4]byte{10, 77, 0, 2}, 3389, 51000)
@@ -826,6 +1144,18 @@ func TestFabricFlowSchedulerPrioritizesExplicitTrafficClass(t *testing.T) {
}
}
// TestFabricFlowSchedulerUsesOpaquePacketHashClassifier schedules a single TCP
// packet and checks that the one resulting batch is labelled with the
// "opaque_packet_hash" classifier and a flow ID starting with "opaque:".
func TestFabricFlowSchedulerUsesOpaquePacketHashClassifier(t *testing.T) {
	scheduler := NewFabricFlowScheduler(8, 0)
	tcpPacket := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 3389)
	batches := scheduler.ScheduleClientPacketsForConnection("vpn-1", [][]byte{tcpPacket})
	if count := len(batches); count != 1 {
		t.Fatalf("batches = %d, want 1", count)
	}
	opaqueClassifier := batches[0].Classifier == "opaque_packet_hash"
	opaqueFlowID := strings.HasPrefix(batches[0].FlowID, "opaque:")
	if opaqueClassifier && opaqueFlowID {
		return
	}
	t.Fatalf("scheduler should not expose protocol-derived flow keys: %+v", batches[0])
}
func TestFabricFlowSchedulerDropsWhenChannelQueueIsFull(t *testing.T) {
scheduler := NewFabricFlowScheduler(1, 1)
packetA := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 3389)
@@ -1032,7 +1362,7 @@ func TestFabricClientPacketIngressUsesLeasePreferredRouteBeforeConfigOrder(t *te
}
}
func TestFabricClientPacketIngressTriesAlternateRouteBeforeBackendFallback(t *testing.T) {
func TestFabricClientPacketIngressTriesAlternateRouteBeforeCompatFallback(t *testing.T) {
transport := &failoverProductionTransport{failNextHop: "relay-bad"}
ingress := &FabricClientPacketIngress{
ForwardTransport: transport,
@@ -2617,10 +2947,10 @@ func TestFabricClientPacketIngressBoundedLoadReportsPerChannelDrops(t *testing.T
func TestFabricClientPacketIngressUsesLocalGatewayShortcutWithoutRoute(t *testing.T) {
inbox := NewFabricPacketInbox(4)
ingress := &FabricClientPacketIngress{
Inbox: inbox,
ClusterID: "cluster-1",
LocalNodeID: "entry-1",
AllowLegacyLocalGatewayFallback: true,
Inbox: inbox,
ClusterID: "cluster-1",
LocalNodeID: "entry-1",
AllowLocalGatewayBypass: true,
LocalGateway: func(vpnConnectionID string) bool {
return vpnConnectionID == "vpn-1"
},
@@ -2642,10 +2972,10 @@ func TestFabricClientPacketIngressUsesLocalGatewayShortcutWithoutRoute(t *testin
func TestFabricClientPacketIngressReceivesLocalGatewayReplyWithoutRoute(t *testing.T) {
inbox := NewFabricPacketInbox(4)
ingress := &FabricClientPacketIngress{
Inbox: inbox,
ClusterID: "cluster-1",
LocalNodeID: "entry-1",
AllowLegacyLocalGatewayFallback: true,
Inbox: inbox,
ClusterID: "cluster-1",
LocalNodeID: "entry-1",
AllowLocalGatewayBypass: true,
LocalGateway: func(vpnConnectionID string) bool {
return vpnConnectionID == "vpn-1"
},
@@ -2705,6 +3035,24 @@ func packetSourcePort(packet []byte) uint16 {
return uint16(packet[20])<<8 | uint16(packet[21])
}
// testDNSIPv4PacketForFabricRuntime returns a 28-byte test packet: a 20-byte
// IPv4 header (protocol 17/UDP, TTL 64, 10.77.0.2 -> 1.1.1.1, total length 28)
// followed by an 8-byte UDP header (source port 49152, destination port 53,
// length 8). Identification, fragment and checksum fields are left zero and
// there is no UDP payload.
func testDNSIPv4PacketForFabricRuntime() []byte {
	return []byte{
		0x45, 0x00, 0x00, 28, // version 4 / IHL 5, TOS, total length
		0x00, 0x00, 0x00, 0x00, // identification, flags / fragment offset
		64, 17, 0x00, 0x00, // TTL, protocol (UDP), header checksum (unset)
		10, 77, 0, 2, // source address
		1, 1, 1, 1, // destination address
		0xc0, 0x00, 0x00, 0x35, // UDP source port 49152, destination port 53
		0x00, 8, 0x00, 0x00, // UDP length, UDP checksum (unset)
	}
}
func testFlowChannelID(vpnConnectionID string, packet []byte, shardCount int) string {
return fabricFlowChannelID(vpnConnectionID, packetShard(packet, shardCount))
}
@@ -18,6 +18,7 @@ type Gateway struct {
Transport PacketTransport
ClusterID string
VPNConnectionID string
ServiceTunnel FabricServiceTunnel
InterfaceName string
AddressCIDR string
RouteCIDR string
@@ -73,20 +74,6 @@ type packetTransportCloser interface {
Close() error
}
type BackendPacketTransport struct {
API *client.Client
ClusterID string
VPNConnectionID string
}
func (t BackendPacketTransport) SendGatewayPacketBatch(ctx context.Context, packets [][]byte) error {
return t.API.SendVPNGatewayPacketBatch(ctx, t.ClusterID, t.VPNConnectionID, packets)
}
func (t BackendPacketTransport) ReceiveGatewayPacketBatch(ctx context.Context, timeout time.Duration) ([][]byte, error) {
return t.API.ReceiveVPNGatewayPacketBatch(ctx, t.ClusterID, t.VPNConnectionID, timeout)
}
func (g *Gateway) EnsureStarted(ctx context.Context) error {
g.mu.Lock()
if g.running {
@@ -120,7 +107,7 @@ func (g *Gateway) EnsureStarted(ctx context.Context) error {
go func() {
if err := g.run(runCtx, tun); err != nil && runCtx.Err() == nil {
log.Printf("vpn gateway runtime stopped: vpn_connection_id=%s error=%v", g.VPNConnectionID, err)
log.Printf("vpn gateway runtime stopped: tunnel_id=%s error=%v", g.tunnelID(), err)
g.setStopped(err)
return
}
@@ -152,7 +139,8 @@ func (g *Gateway) Status() (bool, string) {
func (g *Gateway) IsReadyForConnection(vpnConnectionID string) bool {
g.mu.Lock()
defer g.mu.Unlock()
return g.running && g.VPNConnectionID == vpnConnectionID && vpnConnectionID != ""
tunnelID := g.tunnelIDLocked()
return g.running && (g.VPNConnectionID == vpnConnectionID || tunnelID == vpnConnectionID) && vpnConnectionID != ""
}
func (g *Gateway) Snapshot() map[string]any {
@@ -169,8 +157,14 @@ func (g *Gateway) Snapshot() map[string]any {
out := map[string]any{
"running": running,
"service_role": "ipv4-egress",
"service_class": "vpn_packets",
"tunnel_id": g.ServiceTunnel.TunnelID,
"pool_id": g.ServiceTunnel.PoolID,
"service_id": g.ServiceTunnel.ServiceID,
"local_service_id": g.ServiceTunnel.LocalServiceID,
"remote_service_id": g.ServiceTunnel.RemoteServiceID,
"service_kind": g.ServiceTunnel.ServiceKind,
"service_role": firstNonEmptyTunnelString(g.ServiceTunnel.ServiceRole, DefaultFabricTunnelRole),
"service_class": firstNonEmptyTunnelString(g.ServiceTunnel.ServiceClass, DefaultFabricTunnelClass),
"adapter_contract": "fabric_channel_to_ipv4_nat",
"transport": g.transportName(),
"poll_timeout_ms": g.PollTimeout.Milliseconds(),
@@ -196,6 +190,7 @@ func (g *Gateway) Snapshot() map[string]any {
if !lastRuntimeActivityAt.IsZero() {
out["last_runtime_activity_at"] = lastRuntimeActivityAt.UTC().Format(time.RFC3339Nano)
}
out["service_tunnel"] = g.ServiceTunnel.Snapshot()
if platform := gatewayPlatformSnapshot(g.InterfaceName, g.RouteCIDR); len(platform) > 0 {
out["platform"] = platform
}
@@ -216,9 +211,7 @@ func (g *Gateway) transportName() string {
case *LocalPacketTransport:
return "local_fabric_inbox"
case *AdaptivePacketTransport:
return "adaptive_fabric_backend"
case BackendPacketTransport:
return "backend_http_packet_relay"
return "adaptive_fabric"
default:
if g.Transport == nil {
return "none"
@@ -237,10 +230,14 @@ func (g *Gateway) setStopped(err error) {
func (g *Gateway) normalize() error {
if g.Transport == nil {
return fmt.Errorf("fabric packet transport is required; backend packet relay fallback is disabled")
return fmt.Errorf("fabric packet transport is required")
}
g.ServiceTunnel = NormalizeServiceTunnel(g.ServiceTunnel, g.VPNConnectionID)
if g.VPNConnectionID == "" {
g.VPNConnectionID = g.ServiceTunnel.TunnelID
}
if g.ClusterID == "" || g.VPNConnectionID == "" {
return fmt.Errorf("cluster id and vpn connection id are required")
return fmt.Errorf("cluster id and tunnel id are required")
}
if g.InterfaceName == "" {
g.InterfaceName = "rapvpn0"
@@ -257,6 +254,19 @@ func (g *Gateway) normalize() error {
return nil
}
// tunnelIDLocked resolves the gateway's tunnel identifier from the service
// tunnel's TunnelID and the VPN connection ID via firstNonEmptyTunnelString
// (presumably the first non-empty of the two; helper not shown here).
// It takes no lock itself; tunnelID calls it while holding g.mu.
func (g *Gateway) tunnelIDLocked() string {
	preferred, fallback := g.ServiceTunnel.TunnelID, g.VPNConnectionID
	return firstNonEmptyTunnelString(preferred, fallback)
}
// tunnelID returns the gateway's tunnel identifier, acquiring g.mu for the
// duration of the lookup. A nil gateway yields the empty string.
func (g *Gateway) tunnelID() string {
	if g != nil {
		g.mu.Lock()
		defer g.mu.Unlock()
		return g.tunnelIDLocked()
	}
	return ""
}
func (g *Gateway) run(ctx context.Context, tun readWriteCloser) error {
defer tun.Close()
if closer, ok := g.Transport.(packetTransportCloser); ok {
@@ -279,11 +289,10 @@ func (g *Gateway) run(ctx context.Context, tun readWriteCloser) error {
}
func (g *Gateway) copyGatewayToClient(ctx context.Context, tun io.Reader) error {
priorityPackets := make(chan []byte, 1024)
packets := make(chan []byte, 32768)
errCh := make(chan error, 1)
go func() {
errCh <- g.uploadGatewayPackets(ctx, priorityPackets, packets)
errCh <- g.uploadGatewayPackets(ctx, nil, packets)
}()
buffer := make([]byte, 65535)
@@ -307,25 +316,16 @@ func (g *Gateway) copyGatewayToClient(ctx context.Context, tun io.Reader) error
packet := append([]byte(nil), buffer[:n]...)
normalizeIPv4PacketChecksums(packet)
g.recordTunRead(packet)
if isTCPControlPacket(packet) {
select {
case priorityPackets <- packet:
default:
g.uploadQueueDrops.Add(1)
log.Printf("vpn gateway priority packet upload queue full; dropping packet: vpn_connection_id=%s", g.VPNConnectionID)
}
continue
}
select {
case packets <- packet:
default:
g.uploadQueueDrops.Add(1)
log.Printf("vpn gateway packet upload queue full; dropping packet: vpn_connection_id=%s", g.VPNConnectionID)
log.Printf("vpn gateway packet upload queue full; dropping packet: tunnel_id=%s", g.tunnelID())
}
}
}
func (g *Gateway) uploadGatewayPackets(ctx context.Context, priorityPackets <-chan []byte, packets <-chan []byte) error {
func (g *Gateway) uploadGatewayPackets(ctx context.Context, _ <-chan []byte, packets <-chan []byte) error {
batch := make([][]byte, 0, vpnGatewayBatchMaxPackets)
batchBytes := 0
timer := time.NewTimer(time.Hour)
@@ -341,7 +341,7 @@ func (g *Gateway) uploadGatewayPackets(ctx context.Context, priorityPackets <-ch
byteCount := packetBytesTotal(batch)
if err := g.Transport.SendGatewayPacketBatch(ctx, batch); err != nil {
g.uploadErrors.Add(1)
log.Printf("vpn gateway packet batch upload failed: vpn_connection_id=%s packets=%d error=%v", g.VPNConnectionID, len(batch), err)
log.Printf("vpn gateway packet batch upload failed: tunnel_id=%s packets=%d error=%v", g.tunnelID(), len(batch), err)
} else {
g.recordGatewayToClientBatch(packetCount, byteCount, batch[0])
}
@@ -366,50 +366,6 @@ func (g *Gateway) uploadGatewayPackets(ctx context.Context, priorityPackets <-ch
batchBytes += packetFrameSize
return true
}
flushPriority := func(packet []byte) {
pendingBatch := batch
pendingBatchBytes := batchBytes
batch = make([][]byte, 0, vpnGatewayBatchMaxPackets)
batchBytes = 0
if !addPacket(packet) {
batch = pendingBatch
batchBytes = pendingBatchBytes
return
}
deadline := time.Now().Add(vpnGatewayPriorityBatchWait)
for len(batch) < vpnGatewayBatchMaxPackets && batchBytes < vpnGatewayBatchMaxBytes {
wait := time.Until(deadline)
if wait <= 0 {
break
}
timer := time.NewTimer(wait)
select {
case next := <-priorityPackets:
if !timer.Stop() {
select {
case <-timer.C:
default:
}
}
if !addPacket(next) {
flush()
_ = addPacket(next)
}
case <-timer.C:
flush()
return
}
}
flush()
if len(pendingBatch) > 0 {
batch = pendingBatch
batchBytes = pendingBatchBytes
if !timerActive {
timer.Reset(vpnGatewayBatchFlushTimeout)
timerActive = true
}
}
}
for {
if len(batch) == 0 && timerActive {
if !timer.Stop() {
@@ -421,17 +377,9 @@ func (g *Gateway) uploadGatewayPackets(ctx context.Context, priorityPackets <-ch
timerActive = false
}
select {
case packet := <-priorityPackets:
flushPriority(packet)
continue
default:
}
select {
case <-ctx.Done():
flush()
return ctx.Err()
case packet := <-priorityPackets:
flushPriority(packet)
case packet := <-packets:
if !addPacket(packet) {
continue
@@ -451,23 +399,11 @@ func (g *Gateway) uploadGatewayPackets(ctx context.Context, priorityPackets <-ch
}
}
func isTCPControlPacket(packet []byte) bool {
if len(packet) < 20 || packet[0]>>4 != 4 {
return false
}
ihl := int(packet[0]&0x0f) * 4
if ihl < 20 || len(packet) < ihl+20 || packet[9] != 6 {
return false
}
flags := packet[ihl+13]
return flags&0x17 != 0
}
func (g *Gateway) copyClientToGateway(ctx context.Context, tun io.Writer) error {
for {
packets, err := g.Transport.ReceiveGatewayPacketBatch(ctx, g.PollTimeout)
if err != nil {
log.Printf("vpn gateway packet download failed: vpn_connection_id=%s error=%v", g.VPNConnectionID, err)
log.Printf("vpn gateway packet download failed: tunnel_id=%s error=%v", g.tunnelID(), err)
select {
case <-ctx.Done():
return ctx.Err()
@@ -501,8 +437,8 @@ func (g *Gateway) recordClientToGatewayBatch(packetCount int, byteCount int, fir
g.mu.Unlock()
if next <= 5 {
log.Printf(
"vpn gateway client_to_gateway batch received: vpn_connection_id=%s batch=%d packets=%d bytes=%d first=%s",
g.VPNConnectionID,
"vpn gateway client_to_gateway batch received: tunnel_id=%s batch=%d packets=%d bytes=%d first=%s",
g.tunnelID(),
next,
packetCount,
byteCount,
@@ -522,8 +458,8 @@ func (g *Gateway) recordGatewayToClientBatch(packetCount int, byteCount int, fir
g.mu.Unlock()
if next <= 5 {
log.Printf(
"vpn gateway gateway_to_client batch uploaded: vpn_connection_id=%s batch=%d packets=%d bytes=%d first=%s",
g.VPNConnectionID,
"vpn gateway gateway_to_client batch uploaded: tunnel_id=%s batch=%d packets=%d bytes=%d first=%s",
g.tunnelID(),
next,
packetCount,
byteCount,
@@ -536,7 +472,7 @@ func (g *Gateway) recordTunWrite(packet []byte) {
next := g.tunWritePackets.Add(1)
g.tunWriteBytes.Add(uint64(len(packet)))
if next <= 5 {
log.Printf("vpn gateway packet written to tun: vpn_connection_id=%s packet=%d bytes=%d summary=%s", g.VPNConnectionID, next, len(packet), summarizePacket(packet))
log.Printf("vpn gateway packet written to tun: tunnel_id=%s packet=%d bytes=%d summary=%s", g.tunnelID(), next, len(packet), summarizePacket(packet))
}
}
@@ -544,7 +480,7 @@ func (g *Gateway) recordTunRead(packet []byte) {
next := g.tunReadPackets.Add(1)
g.tunReadBytes.Add(uint64(len(packet)))
if next <= 5 {
log.Printf("vpn gateway packet read from tun: vpn_connection_id=%s packet=%d bytes=%d summary=%s", g.VPNConnectionID, next, len(packet), summarizePacket(packet))
log.Printf("vpn gateway packet read from tun: tunnel_id=%s packet=%d bytes=%d summary=%s", g.tunnelID(), next, len(packet), summarizePacket(packet))
}
}
@@ -95,7 +95,7 @@ func TestGatewayRunClosesPacketTransportOnRuntimeError(t *testing.T) {
}
}
func TestGatewayNormalizeRejectsBackendPacketRelayFallback(t *testing.T) {
func TestGatewayNormalizeRequiresFabricPacketTransport(t *testing.T) {
gateway := &Gateway{
API: nil,
ClusterID: "cluster-1",
@@ -106,7 +106,7 @@ func TestGatewayNormalizeRejectsBackendPacketRelayFallback(t *testing.T) {
if err == nil {
t.Fatal("normalize succeeded without a fabric packet transport")
}
if got, want := err.Error(), "fabric packet transport is required; backend packet relay fallback is disabled"; got != want {
if got, want := err.Error(), "fabric packet transport is required"; got != want {
t.Fatalf("normalize error = %q, want %q", got, want)
}
}
@@ -120,6 +120,7 @@ func TestGatewaySnapshotReportsIPv4EgressServiceAdapter(t *testing.T) {
}
func TestGatewayUploadPrioritizesTCPControlPackets(t *testing.T) {
t.Skip("retired: base VPN gateway uploads opaque packet batches without TCP control prioritization")
transport := &recordingGatewayTransport{}
gateway := &Gateway{Transport: transport, VPNConnectionID: "vpn-1"}
priorityPackets := make(chan []byte, 1)
@@ -160,6 +161,7 @@ func TestGatewayUploadPrioritizesTCPControlPackets(t *testing.T) {
}
func TestGatewayUploadPreemptsPendingNormalBatchForTCPControlPackets(t *testing.T) {
t.Skip("retired: base VPN gateway preserves packet batch order instead of preempting by TCP flags")
transport := &recordingGatewayTransport{}
gateway := &Gateway{Transport: transport, VPNConnectionID: "vpn-1"}
priorityPackets := make(chan []byte, 1)
@@ -201,6 +203,7 @@ func TestGatewayUploadPreemptsPendingNormalBatchForTCPControlPackets(t *testing.
}
func TestGatewayUploadMicroBatchesTCPControlPackets(t *testing.T) {
t.Skip("retired: base VPN gateway no longer creates protocol-specific TCP control microbatches")
transport := &recordingGatewayTransport{}
gateway := &Gateway{Transport: transport, VPNConnectionID: "vpn-1"}
priorityPackets := make(chan []byte, 2)
@@ -239,18 +242,3 @@ func TestGatewayUploadMicroBatchesTCPControlPackets(t *testing.T) {
}
}
}
func TestIsTCPControlPacket(t *testing.T) {
packet := testIPv4TCPPacket([4]byte{192, 168, 200, 95}, [4]byte{10, 77, 0, 2}, 3389, 51000)
if isTCPControlPacket(packet) {
t.Fatal("packet without control flags was classified as control")
}
packet[33] = 0x12
if !isTCPControlPacket(packet) {
t.Fatal("tcp syn-ack was not classified as control")
}
packet[9] = 17
if isTCPControlPacket(packet) {
t.Fatal("udp packet was classified as tcp control")
}
}
@@ -0,0 +1,208 @@
package vpnruntime
import (
"fmt"
"sort"
"sync"
"time"
)
// Constants used by the fabric service stream registry.
const (
// FabricServiceStreamRegistrySchemaVersion is reported under the
// "schema_version" key of FabricServiceStreamRegistry.Snapshot.
FabricServiceStreamRegistrySchemaVersion = "rap.fabric_service_stream_registry.v1"
// FabricServiceStreamStateOpen is the state Register assigns when the
// caller leaves FabricServiceStream.State empty.
FabricServiceStreamStateOpen = "open"
// FabricServiceStreamStateClosed is the state set by MarkClosed.
FabricServiceStreamStateClosed = "closed"
// FabricServiceStreamStateReset is the state set by MarkReset.
FabricServiceStreamStateReset = "reset"
)
// FabricServiceStream is one registry entry describing a stream that belongs
// to a service tunnel.
//
// The registry keys entries by the (TunnelID, StreamID) pair. On Register,
// empty TunnelID/ServiceID values are filled from ServiceTunnel, TrafficClass
// is normalized, an empty State becomes FabricServiceStreamStateOpen, OpenedAt
// is set once (and preserved across re-registration), and UpdatedAt is
// refreshed. Direction and Metadata are stored as supplied.
type FabricServiceStream struct {
TunnelID string `json:"tunnel_id"`
ServiceID string `json:"service_id"`
StreamID uint64 `json:"stream_id"`
TrafficClass string `json:"traffic_class"`
Direction string `json:"direction,omitempty"`
State string `json:"state"`
ServiceTunnel FabricServiceTunnel `json:"service_tunnel"`
OpenedAt time.Time `json:"opened_at"`
UpdatedAt time.Time `json:"updated_at"`
Metadata map[string]string `json:"metadata,omitempty"`
}
// FabricServiceStreamRegistry is an in-memory table of FabricServiceStream
// entries guarded by a read/write mutex. Its exported methods tolerate a nil
// receiver. Because it embeds a mutex it must be used through a pointer.
type FabricServiceStreamRegistry struct {
// mu guards streams.
mu sync.RWMutex
// streams maps serviceStreamKey(tunnelID, streamID) to the stored entry.
streams map[string]FabricServiceStream
}
// NewFabricServiceStreamRegistry returns an empty registry whose stream table
// is already allocated and ready for Register calls.
func NewFabricServiceStreamRegistry() *FabricServiceStreamRegistry {
	registry := new(FabricServiceStreamRegistry)
	registry.streams = make(map[string]FabricServiceStream)
	return registry
}
// Register normalizes stream and stores it under its (tunnel ID, stream ID)
// key, replacing any earlier entry for that key.
//
// Normalization: the service tunnel is normalized with the stream's tunnel ID
// as fallback, empty TunnelID/ServiceID are filled from the normalized tunnel,
// the traffic class is normalized, an empty State becomes "open", and
// UpdatedAt is set to the current UTC time. OpenedAt is set to now when zero,
// and is overridden by the OpenedAt of an existing entry for the same key so
// re-registration keeps the original open time.
//
// The registry keeps its own copy of stream.Metadata, so later mutation of the
// caller's map cannot race with readers that hold the registry lock. The
// returned value is the normalized stream and still carries the caller's map.
// A nil receiver returns the zero FabricServiceStream.
func (r *FabricServiceStreamRegistry) Register(stream FabricServiceStream) FabricServiceStream {
	if r == nil {
		return FabricServiceStream{}
	}
	now := time.Now().UTC()
	stream.ServiceTunnel = NormalizeServiceTunnel(stream.ServiceTunnel, stream.TunnelID)
	stream.TunnelID = firstNonEmptyTunnelString(stream.TunnelID, stream.ServiceTunnel.TunnelID)
	stream.ServiceID = firstNonEmptyTunnelString(stream.ServiceID, stream.ServiceTunnel.ServiceID)
	stream.TrafficClass = normalizeFabricTrafficClass(stream.TrafficClass)
	if stream.State == "" {
		stream.State = FabricServiceStreamStateOpen
	}
	if stream.OpenedAt.IsZero() {
		stream.OpenedAt = now
	}
	stream.UpdatedAt = now
	// The key only depends on normalized fields, so build it once outside the lock.
	key := serviceStreamKey(stream.TunnelID, stream.StreamID)

	r.mu.Lock()
	defer r.mu.Unlock()
	if r.streams == nil {
		r.streams = map[string]FabricServiceStream{}
	}
	if existing, ok := r.streams[key]; ok && !existing.OpenedAt.IsZero() {
		stream.OpenedAt = existing.OpenedAt
	}
	// Store a private copy of the metadata map; the caller keeps ownership of
	// the map it passed in.
	stored := stream
	stored.Metadata = cloneStringMap(stream.Metadata)
	r.streams[key] = stored
	return stream
}
// MarkClosed flips the registered stream identified by tunnelID and streamID
// to the "closed" state. Unknown streams are left untouched.
func (r *FabricServiceStreamRegistry) MarkClosed(tunnelID string, streamID uint64) {
	const target = FabricServiceStreamStateClosed
	r.markState(tunnelID, streamID, target)
}
// MarkReset flips the registered stream identified by tunnelID and streamID
// to the "reset" state. Unknown streams are left untouched.
func (r *FabricServiceStreamRegistry) MarkReset(tunnelID string, streamID uint64) {
	const target = FabricServiceStreamStateReset
	r.markState(tunnelID, streamID, target)
}
// StreamsForTunnel returns copies of every registered stream whose TunnelID
// equals tunnelID, ordered by ascending StreamID. It returns nil for a nil
// registry or an empty tunnelID, and an empty non-nil slice when nothing
// matches.
func (r *FabricServiceStreamRegistry) StreamsForTunnel(tunnelID string) []FabricServiceStream {
	if r == nil || tunnelID == "" {
		return nil
	}
	r.mu.RLock()
	defer r.mu.RUnlock()
	matches := make([]FabricServiceStream, 0)
	for key := range r.streams {
		candidate := r.streams[key]
		if candidate.TunnelID != tunnelID {
			continue
		}
		matches = append(matches, cloneFabricServiceStream(candidate))
	}
	sort.Slice(matches, func(a, b int) bool {
		return matches[a].StreamID < matches[b].StreamID
	})
	return matches
}
// Snapshot returns a JSON-friendly view of the registry: the schema version,
// the total stream count, the number of streams in the "open" state and one
// map per stream.
//
// Streams are ordered by ascending stream ID and, for equal stream IDs, by
// tunnel ID. The tie-break matters because the registry is keyed by
// (tunnel ID, stream ID): the same stream ID may exist in several tunnels, and
// ordering by stream ID alone over a randomly iterated map would make the
// output differ between calls.
//
// A nil receiver yields only the schema version and a zero stream count.
func (r *FabricServiceStreamRegistry) Snapshot() map[string]any {
	if r == nil {
		return map[string]any{"schema_version": FabricServiceStreamRegistrySchemaVersion, "stream_count": 0}
	}
	r.mu.RLock()
	defer r.mu.RUnlock()

	// Collect typed entries first so ordering does not need type assertions.
	ordered := make([]FabricServiceStream, 0, len(r.streams))
	openCount := 0
	for _, stream := range r.streams {
		if stream.State == FabricServiceStreamStateOpen {
			openCount++
		}
		ordered = append(ordered, stream)
	}
	sort.Slice(ordered, func(i, j int) bool {
		if ordered[i].StreamID != ordered[j].StreamID {
			return ordered[i].StreamID < ordered[j].StreamID
		}
		return ordered[i].TunnelID < ordered[j].TunnelID
	})

	items := make([]map[string]any, 0, len(ordered))
	for _, stream := range ordered {
		item := map[string]any{
			"tunnel_id":      stream.TunnelID,
			"service_id":     stream.ServiceID,
			"stream_id":      stream.StreamID,
			"traffic_class":  stream.TrafficClass,
			"direction":      stream.Direction,
			"state":          stream.State,
			"service_tunnel": stream.ServiceTunnel.Snapshot(),
		}
		// Timestamps and metadata are optional keys, emitted only when set.
		if !stream.OpenedAt.IsZero() {
			item["opened_at"] = stream.OpenedAt.Format(time.RFC3339Nano)
		}
		if !stream.UpdatedAt.IsZero() {
			item["updated_at"] = stream.UpdatedAt.Format(time.RFC3339Nano)
		}
		if len(stream.Metadata) > 0 {
			// Copy so the snapshot does not alias registry-owned state.
			item["metadata"] = cloneStringMap(stream.Metadata)
		}
		items = append(items, item)
	}
	return map[string]any{
		"schema_version": FabricServiceStreamRegistrySchemaVersion,
		"stream_count":   len(items),
		"open_count":     openCount,
		"streams":        items,
	}
}
// markState overwrites the State of an already registered stream and bumps
// its UpdatedAt to the current UTC time. It does nothing for a nil registry,
// an empty tunnelID, a zero streamID, or a stream that is not registered.
func (r *FabricServiceStreamRegistry) markState(tunnelID string, streamID uint64, state string) {
	switch {
	case r == nil, tunnelID == "", streamID == 0:
		return
	}
	r.mu.Lock()
	defer r.mu.Unlock()
	key := serviceStreamKey(tunnelID, streamID)
	if entry, found := r.streams[key]; found {
		entry.State = state
		entry.UpdatedAt = time.Now().UTC()
		r.streams[key] = entry
	}
}
// serviceStreamKey builds the registry map key for a stream: the tunnel ID,
// a NUL byte separator, then the stream ID in decimal.
func serviceStreamKey(tunnelID string, streamID uint64) string {
	return tunnelID + "\x00" + fmt.Sprint(streamID)
}
// cloneFabricServiceStream returns a copy of stream that shares no mutable
// state with the argument: the Metadata map and the service tunnel's
// TrafficClasses slice are both duplicated. Without the slice copy a caller
// that edits the returned TrafficClasses elements would write through to the
// value held by the registry. Empty Metadata and empty TrafficClasses come
// back as nil.
func cloneFabricServiceStream(stream FabricServiceStream) FabricServiceStream {
	stream.Metadata = cloneStringMap(stream.Metadata)
	stream.ServiceTunnel.TrafficClasses = append([]string(nil), stream.ServiceTunnel.TrafficClasses...)
	return stream
}
// serviceStreamsSnapshotItems converts streams into snapshot maps sorted by
// ascending stream ID. It returns nil when streams is empty. The optional
// keys "opened_at", "updated_at" and "metadata" are only present when the
// corresponding field is set.
func serviceStreamsSnapshotItems(streams []FabricServiceStream) []map[string]any {
	if len(streams) == 0 {
		return nil
	}
	entries := make([]map[string]any, len(streams))
	for idx := range streams {
		current := streams[idx]
		entry := map[string]any{
			"tunnel_id":      current.TunnelID,
			"service_id":     current.ServiceID,
			"stream_id":      current.StreamID,
			"traffic_class":  current.TrafficClass,
			"direction":      current.Direction,
			"state":          current.State,
			"service_tunnel": current.ServiceTunnel.Snapshot(),
		}
		if opened := current.OpenedAt; !opened.IsZero() {
			entry["opened_at"] = opened.Format(time.RFC3339Nano)
		}
		if updated := current.UpdatedAt; !updated.IsZero() {
			entry["updated_at"] = updated.Format(time.RFC3339Nano)
		}
		if len(current.Metadata) > 0 {
			entry["metadata"] = cloneStringMap(current.Metadata)
		}
		entries[idx] = entry
	}
	sort.Slice(entries, func(a, b int) bool {
		lhs, _ := entries[a]["stream_id"].(uint64)
		rhs, _ := entries[b]["stream_id"].(uint64)
		return lhs < rhs
	})
	return entries
}
// cloneStringMap returns an independent copy of values. A nil or empty input
// yields nil rather than an empty map.
func cloneStringMap(values map[string]string) map[string]string {
	size := len(values)
	if size == 0 {
		return nil
	}
	duplicate := make(map[string]string, size)
	for key := range values {
		duplicate[key] = values[key]
	}
	return duplicate
}
@@ -0,0 +1,44 @@
package vpnruntime
import "testing"
// TestFabricServiceStreamRegistryTracksTunnelScopedStreams walks one stream
// through its lifecycle: register, look up by tunnel, mark closed, and then
// check the registry snapshot counters. Each step depends on the state left
// by the previous one.
func TestFabricServiceStreamRegistryTracksTunnelScopedStreams(t *testing.T) {
registry := NewFabricServiceStreamRegistry()
stream := registry.Register(FabricServiceStream{
TunnelID: "tunnel-1",
ServiceID: "svc-1",
StreamID: 42,
TrafficClass: FabricServiceTrafficInteractive,
Direction: FabricDirectionClientToGateway,
ServiceTunnel: FabricServiceTunnel{
TunnelID: "tunnel-1",
PoolID: "pool-vpn",
ServiceID: "svc-1",
ServiceKind: "ipv4-tunnel",
},
Metadata: map[string]string{"adapter": "vpn"},
})
// State was left empty above, so Register must default it to open.
if stream.State != FabricServiceStreamStateOpen {
t.Fatalf("stream state = %q, want open", stream.State)
}
// TransportOwner was not supplied; normalization must fill the default.
if stream.ServiceTunnel.TransportOwner != DefaultFabricTransportOwner {
t.Fatalf("service tunnel should remain fabric-owned: %+v", stream.ServiceTunnel)
}
streams := registry.StreamsForTunnel("tunnel-1")
if len(streams) != 1 || streams[0].StreamID != 42 || streams[0].ServiceID != "svc-1" {
t.Fatalf("streams for tunnel = %+v", streams)
}
// Closing keeps the entry in the registry but changes its state.
registry.MarkClosed("tunnel-1", 42)
streams = registry.StreamsForTunnel("tunnel-1")
if len(streams) != 1 || streams[0].State != FabricServiceStreamStateClosed {
t.Fatalf("closed stream not tracked: %+v", streams)
}
// The closed stream still counts toward stream_count but not open_count.
snapshot := registry.Snapshot()
if snapshot["schema_version"] != FabricServiceStreamRegistrySchemaVersion ||
snapshot["stream_count"] != 1 ||
snapshot["open_count"] != 0 {
t.Fatalf("unexpected registry snapshot: %+v", snapshot)
}
}
@@ -0,0 +1,179 @@
package vpnruntime
import "strings"
// Service tunnel constants.
//
// The DefaultFabric* values are the ones DefaultVPNServiceTunnelDefaults
// returns, and so are what NormalizeServiceTunnel fills into empty fields.
// The FabricServiceTraffic* values are the traffic class names listed, in this
// order, in the default TrafficClasses.
const (
DefaultFabricTunnelPoolID = "ipv4-egress"
DefaultFabricTunnelServiceKind = "ipv4-tunnel"
DefaultFabricTunnelClass = "vpn_packets"
DefaultFabricTunnelRole = "ipv4-egress"
DefaultFabricTunnelDataPlane = "fabric_quic_streams"
DefaultFabricTransportOwner = "fabric_farm"
DefaultFabricRouteVisibility = "opaque_to_service"
// FabricServiceTunnelSchemaVersion is reported under "schema_version" by
// FabricServiceTunnel.Snapshot.
FabricServiceTunnelSchemaVersion = "rap.fabric_service_tunnel.v1"
FabricServiceTrafficControl = "control"
FabricServiceTrafficDNS = "dns"
FabricServiceTrafficInteractive = "interactive"
FabricServiceTrafficReliable = "reliable"
FabricServiceTrafficBulk = "bulk"
FabricServiceTrafficDroppable = "droppable"
// DefaultFabricServiceStreamShards is used when StreamShards is zero or
// negative.
DefaultFabricServiceStreamShards = 8
)
// FabricServiceTunnel describes a service tunnel and its stream policy.
//
// NormalizeServiceTunnelWithDefaults fills empty fields: TunnelID from the
// fallback ID; ServiceID, LocalServiceID and RemoteServiceID from the tunnel
// ID with "svc-", "svc-local-" and "svc-remote-" prefixes; the remaining
// string fields, TrafficClasses and StreamShards from the supplied defaults.
// RouteLeaseID and RouteGeneration are never defaulted.
type FabricServiceTunnel struct {
TunnelID string `json:"tunnel_id"`
PoolID string `json:"pool_id"`
ServiceID string `json:"service_id"`
LocalServiceID string `json:"local_service_id"`
RemoteServiceID string `json:"remote_service_id"`
ServiceKind string `json:"service_kind"`
ServiceClass string `json:"service_class"`
ServiceRole string `json:"service_role"`
RouteLeaseID string `json:"route_lease_id,omitempty"`
RouteGeneration string `json:"route_generation,omitempty"`
DataPlane string `json:"data_plane,omitempty"`
TransportOwner string `json:"transport_owner,omitempty"`
RouteVisibility string `json:"route_visibility,omitempty"`
TrafficClasses []string `json:"traffic_classes,omitempty"`
StreamShards int `json:"stream_shards,omitempty"`
}
// FabricServiceTunnelDefaults is the per-service profile that
// NormalizeServiceTunnelWithDefaults applies to empty tunnel fields. Any field
// left empty here (or a non-positive StreamShards) is itself replaced by the
// matching value from DefaultVPNServiceTunnelDefaults before use.
type FabricServiceTunnelDefaults struct {
PoolID string
ServiceKind string
ServiceClass string
ServiceRole string
DataPlane string
TransportOwner string
RouteVisibility string
TrafficClasses []string
StreamShards int
}
// NormalizeServiceTunnel fills the empty fields of tunnel using the VPN
// service profile, taking fallbackID as the tunnel ID when none is set.
func NormalizeServiceTunnel(tunnel FabricServiceTunnel, fallbackID string) FabricServiceTunnel {
	vpnDefaults := DefaultVPNServiceTunnelDefaults()
	return NormalizeServiceTunnelWithDefaults(tunnel, fallbackID, vpnDefaults)
}
// NormalizeServiceTunnelWithDefaults returns tunnel with every empty field
// filled in and every string field trimmed.
//
// The tunnel ID falls back to fallbackID. The three service IDs fall back to
// "svc-", "svc-local-" and "svc-remote-" followed by the resolved tunnel ID.
// All other string fields, the traffic classes and a non-positive StreamShards
// fall back to defaults, which is first completed from the VPN profile.
func NormalizeServiceTunnelWithDefaults(tunnel FabricServiceTunnel, fallbackID string, defaults FabricServiceTunnelDefaults) FabricServiceTunnel {
	defaults = normalizeServiceTunnelDefaults(defaults)
	resolve := func(current, fallback string) string {
		return strings.TrimSpace(firstNonEmptyTunnelString(current, fallback))
	}

	// The tunnel ID is resolved first because the derived service IDs embed it.
	tunnel.TunnelID = resolve(tunnel.TunnelID, fallbackID)
	id := tunnel.TunnelID

	tunnel.PoolID = resolve(tunnel.PoolID, defaults.PoolID)
	tunnel.ServiceID = resolve(tunnel.ServiceID, "svc-"+id)
	tunnel.LocalServiceID = resolve(tunnel.LocalServiceID, "svc-local-"+id)
	tunnel.RemoteServiceID = resolve(tunnel.RemoteServiceID, "svc-remote-"+id)
	tunnel.ServiceKind = resolve(tunnel.ServiceKind, defaults.ServiceKind)
	tunnel.ServiceClass = resolve(tunnel.ServiceClass, defaults.ServiceClass)
	tunnel.ServiceRole = resolve(tunnel.ServiceRole, defaults.ServiceRole)
	tunnel.DataPlane = resolve(tunnel.DataPlane, defaults.DataPlane)
	tunnel.TransportOwner = resolve(tunnel.TransportOwner, defaults.TransportOwner)
	tunnel.RouteVisibility = resolve(tunnel.RouteVisibility, defaults.RouteVisibility)
	tunnel.TrafficClasses = normalizeTunnelTrafficClasses(tunnel.TrafficClasses, defaults.TrafficClasses)
	if tunnel.StreamShards < 1 {
		tunnel.StreamShards = defaults.StreamShards
	}
	return tunnel
}
// Snapshot returns a JSON-friendly map describing the tunnel.
//
// The tunnel is normalized first, using its own field values as the defaults
// profile, so empty fields are still completed from the VPN profile. The
// traffic classes are copied, and "selected_node_known" is always false.
func (t FabricServiceTunnel) Snapshot() map[string]any {
	ownProfile := FabricServiceTunnelDefaults{
		PoolID:          t.PoolID,
		ServiceKind:     t.ServiceKind,
		ServiceClass:    t.ServiceClass,
		ServiceRole:     t.ServiceRole,
		DataPlane:       t.DataPlane,
		TransportOwner:  t.TransportOwner,
		RouteVisibility: t.RouteVisibility,
		TrafficClasses:  t.TrafficClasses,
		StreamShards:    t.StreamShards,
	}
	normalized := NormalizeServiceTunnelWithDefaults(t, t.TunnelID, ownProfile)
	classes := append([]string(nil), normalized.TrafficClasses...)

	view := make(map[string]any, 17)
	view["schema_version"] = FabricServiceTunnelSchemaVersion
	view["tunnel_id"] = normalized.TunnelID
	view["pool_id"] = normalized.PoolID
	view["service_id"] = normalized.ServiceID
	view["local_service_id"] = normalized.LocalServiceID
	view["remote_service_id"] = normalized.RemoteServiceID
	view["service_kind"] = normalized.ServiceKind
	view["service_class"] = normalized.ServiceClass
	view["service_role"] = normalized.ServiceRole
	view["route_lease_id"] = normalized.RouteLeaseID
	view["route_generation"] = normalized.RouteGeneration
	view["data_plane"] = normalized.DataPlane
	view["transport_owner"] = normalized.TransportOwner
	view["route_visibility"] = normalized.RouteVisibility
	view["traffic_classes"] = classes
	view["stream_shards"] = normalized.StreamShards
	view["selected_node_known"] = false
	return view
}
// DefaultVPNServiceTunnelDefaults returns the VPN service profile: the
// DefaultFabric* identity and transport values, all six traffic classes and
// the default stream shard count. Each call builds a fresh TrafficClasses
// slice.
func DefaultVPNServiceTunnelDefaults() FabricServiceTunnelDefaults {
	var profile FabricServiceTunnelDefaults
	profile.PoolID = DefaultFabricTunnelPoolID
	profile.ServiceKind = DefaultFabricTunnelServiceKind
	profile.ServiceClass = DefaultFabricTunnelClass
	profile.ServiceRole = DefaultFabricTunnelRole
	profile.DataPlane = DefaultFabricTunnelDataPlane
	profile.TransportOwner = DefaultFabricTransportOwner
	profile.RouteVisibility = DefaultFabricRouteVisibility
	profile.TrafficClasses = []string{
		FabricServiceTrafficControl,
		FabricServiceTrafficDNS,
		FabricServiceTrafficInteractive,
		FabricServiceTrafficReliable,
		FabricServiceTrafficBulk,
		FabricServiceTrafficDroppable,
	}
	profile.StreamShards = DefaultFabricServiceStreamShards
	return profile
}
// normalizeServiceTunnelDefaults completes a defaults profile: every empty
// string field, an empty traffic class list and a non-positive StreamShards
// are replaced by the corresponding VPN profile value.
func normalizeServiceTunnelDefaults(defaults FabricServiceTunnelDefaults) FabricServiceTunnelDefaults {
	vpn := DefaultVPNServiceTunnelDefaults()
	// Pair each string field of the local copy with its VPN fallback.
	stringFields := []struct {
		target   *string
		fallback string
	}{
		{&defaults.PoolID, vpn.PoolID},
		{&defaults.ServiceKind, vpn.ServiceKind},
		{&defaults.ServiceClass, vpn.ServiceClass},
		{&defaults.ServiceRole, vpn.ServiceRole},
		{&defaults.DataPlane, vpn.DataPlane},
		{&defaults.TransportOwner, vpn.TransportOwner},
		{&defaults.RouteVisibility, vpn.RouteVisibility},
	}
	for _, field := range stringFields {
		*field.target = firstNonEmptyTunnelString(*field.target, field.fallback)
	}
	defaults.TrafficClasses = normalizeTunnelTrafficClasses(defaults.TrafficClasses, vpn.TrafficClasses)
	if defaults.StreamShards < 1 {
		defaults.StreamShards = vpn.StreamShards
	}
	return defaults
}
// normalizeTunnelTrafficClasses trims each entry of values, drops blanks and
// repeats (keeping first occurrences in order), and returns the result. When
// nothing usable remains it returns a copy of fallback instead.
func normalizeTunnelTrafficClasses(values []string, fallback []string) []string {
	cleaned := make([]string, 0, len(values))
	known := make(map[string]struct{}, len(values))
	for _, raw := range values {
		name := strings.TrimSpace(raw)
		if name == "" {
			continue
		}
		if _, duplicate := known[name]; duplicate {
			continue
		}
		known[name] = struct{}{}
		cleaned = append(cleaned, name)
	}
	if len(cleaned) > 0 {
		return cleaned
	}
	return append([]string(nil), fallback...)
}
// firstNonEmptyTunnelString returns the first argument that is non-empty
// after trimming surrounding whitespace, in its trimmed form, or "" when
// every argument is blank.
func firstNonEmptyTunnelString(values ...string) string {
	for idx := range values {
		candidate := strings.TrimSpace(values[idx])
		if candidate == "" {
			continue
		}
		return candidate
	}
	return ""
}
@@ -0,0 +1,46 @@
package vpnruntime
import "testing"
// TestNormalizeServiceTunnelKeepsVPNAsProfileNotTransportRule normalizes an
// empty tunnel and checks that the fallback ID and the VPN profile defaults
// (service identity, transport ownership, data plane, shard count and traffic
// classes) are all applied.
func TestNormalizeServiceTunnelKeepsVPNAsProfileNotTransportRule(t *testing.T) {
tunnel := NormalizeServiceTunnel(FabricServiceTunnel{}, "vpn-tunnel-1")
if tunnel.TunnelID != "vpn-tunnel-1" {
t.Fatalf("tunnel id = %q", tunnel.TunnelID)
}
if tunnel.ServiceKind != DefaultFabricTunnelServiceKind || tunnel.ServiceClass != DefaultFabricTunnelClass {
t.Fatalf("vpn defaults not applied: %+v", tunnel)
}
if tunnel.TransportOwner != DefaultFabricTransportOwner || tunnel.RouteVisibility != DefaultFabricRouteVisibility {
t.Fatalf("transport ownership defaults not applied: %+v", tunnel)
}
if tunnel.DataPlane != DefaultFabricTunnelDataPlane || tunnel.StreamShards != DefaultFabricServiceStreamShards {
t.Fatalf("data plane defaults not applied: %+v", tunnel)
}
// Only a lower bound is asserted; the VPN profile currently lists six classes.
if len(tunnel.TrafficClasses) < 5 {
t.Fatalf("traffic classes too small: %+v", tunnel.TrafficClasses)
}
}
// TestNormalizeServiceTunnelSupportsNonVPNService normalizes an empty tunnel
// against a custom (non-VPN) defaults profile and checks that the custom pool,
// service identity, traffic classes and shard count win over the VPN profile.
func TestNormalizeServiceTunnelSupportsNonVPNService(t *testing.T) {
tunnel := NormalizeServiceTunnelWithDefaults(FabricServiceTunnel{}, "rdp-tunnel-1", FabricServiceTunnelDefaults{
PoolID: "desktop-exit",
ServiceKind: "rdp-client",
ServiceClass: "remote_desktop",
ServiceRole: "desktop-egress",
TrafficClasses: []string{
FabricServiceTrafficControl,
FabricServiceTrafficInteractive,
FabricServiceTrafficBulk,
},
StreamShards: 8,
})
if tunnel.TunnelID != "rdp-tunnel-1" || tunnel.PoolID != "desktop-exit" || tunnel.ServiceKind != "rdp-client" {
t.Fatalf("non-vpn tunnel defaults not applied: %+v", tunnel)
}
if tunnel.ServiceClass != "remote_desktop" || tunnel.ServiceRole != "desktop-egress" {
t.Fatalf("non-vpn service identity not applied: %+v", tunnel)
}
// Three classes were supplied, so the six-entry VPN list must not be used.
if tunnel.StreamShards != 8 || len(tunnel.TrafficClasses) != 3 {
t.Fatalf("non-vpn stream policy not applied: %+v", tunnel)
}
}
@@ -19,8 +19,8 @@ const (
iffNoPI = 0x1000
tunSetIFF = 0x400454ca
ifNameSize = 16
gatewayTunMTU = "1000"
gatewayTCPMSS = "900"
gatewayTunMTU = "1280"
gatewayTCPMSS = "1240"
)
type tunDevice struct {
@@ -77,14 +77,10 @@ func (d AdminRuntimeDispatcher) HandleFabricRequest(ctx context.Context, request
func allowedAdminRuntimeScope(scope string, serviceClass string) bool {
switch serviceClass {
case "platform_admin":
return scope == "platform"
case "cluster_admin":
return scope == "cluster"
case "organization_portal":
return scope == "organization"
case "user_portal":
return scope == "user" || scope == "organization"
case "admin-ingress":
return scope == "platform" || scope == "cluster"
case "public-ingress":
return scope == "organization" || scope == "user"
default:
return false
}
@@ -143,18 +139,22 @@ func (d AdminRuntimeDispatcher) manifest(request FabricRequest) map[string]any {
sections := []string{}
actions := []string{}
switch serviceClass {
case "platform_admin":
case "admin-ingress":
sections = []string{"clusters", "nodes", "roles", "fabric", "workloads", "audit"}
actions = []string{"read_platform_summary", "read_cluster_summaries", "read_node_status"}
case "cluster_admin":
sections = []string{"cluster", "nodes", "fabric", "workloads", "audit"}
actions = []string{"read_cluster_summary", "read_node_status"}
case "organization_portal":
if request.Scope == "cluster" {
sections = []string{"cluster", "nodes", "fabric", "workloads", "audit"}
actions = []string{"read_cluster_summary", "read_node_status"}
} else {
actions = []string{"read_platform_summary", "read_cluster_summaries", "read_node_status"}
}
case "public-ingress":
sections = []string{"organization", "sessions", "resources", "audit"}
actions = []string{"read_organization_summary", "read_sessions"}
case "user_portal":
sections = []string{"profile", "sessions", "resources"}
actions = []string{"read_profile", "read_sessions"}
if request.Scope == "user" {
sections = []string{"profile", "sessions", "resources"}
actions = []string{"read_profile", "read_sessions"}
} else {
actions = []string{"read_organization_summary", "read_sessions"}
}
default:
sections = []string{"status"}
actions = []string{"read_status"}
@@ -14,7 +14,7 @@ func TestAdminRuntimeDispatcherReturnsHealthAndManifest(t *testing.T) {
Method: http.MethodGet,
Path: "/readyz",
Scope: "platform",
ServiceClass: "platform_admin",
ServiceClass: "admin-ingress",
})
if err != nil {
t.Fatalf("health: %v", err)
@@ -25,9 +25,9 @@ func TestAdminRuntimeDispatcherReturnsHealthAndManifest(t *testing.T) {
manifest, err := dispatcher.HandleFabricRequest(context.Background(), FabricRequest{
Method: http.MethodGet,
Path: "/platform-admin/ui-manifest",
Path: "/admin/ui-manifest",
Scope: "platform",
ServiceClass: "platform_admin",
ServiceClass: "admin-ingress",
})
if err != nil {
t.Fatalf("manifest: %v", err)
@@ -51,9 +51,9 @@ func TestAdminRuntimeDispatcherBlocksMutationsAndUnknownProjection(t *testing.T)
mutation, err := dispatcher.HandleFabricRequest(context.Background(), FabricRequest{
Method: http.MethodPost,
Path: "/platform-admin/nodes",
Path: "/admin/nodes",
Scope: "platform",
ServiceClass: "platform_admin",
ServiceClass: "admin-ingress",
})
if err != nil {
t.Fatalf("mutation: %v", err)
@@ -68,9 +68,9 @@ func TestAdminRuntimeDispatcherBlocksMutationsAndUnknownProjection(t *testing.T)
projection, err := dispatcher.HandleFabricRequest(context.Background(), FabricRequest{
Method: http.MethodGet,
Path: "/platform-admin/nodes",
Path: "/admin/nodes",
Scope: "platform",
ServiceClass: "platform_admin",
ServiceClass: "admin-ingress",
})
if err != nil {
t.Fatalf("projection: %v", err)
@@ -88,9 +88,9 @@ func TestAdminRuntimeDispatcherRejectsInvalidScopeClassPair(t *testing.T) {
dispatcher := AdminRuntimeDispatcher{ProjectionClient: &recordingProjectionClient{}, Now: fixedEnvelopeNow}
response, err := dispatcher.HandleFabricRequest(context.Background(), FabricRequest{
Method: http.MethodGet,
Path: "/platform-admin/ui-manifest",
Path: "/admin/ui-manifest",
Scope: "organization",
ServiceClass: "platform_admin",
ServiceClass: "admin-ingress",
})
if err != nil {
t.Fatalf("projection: %v", err)
@@ -118,11 +118,11 @@ func TestAdminRuntimeDispatcherUsesControlAPIProjectionClientForReadRequests(t *
response, err := dispatcher.HandleFabricRequest(context.Background(), FabricRequest{
Method: http.MethodGet,
Path: "/platform-admin/nodes",
Path: "/admin/nodes",
Query: "limit=10",
Host: "admin.example.test",
Scope: "platform",
ServiceClass: "platform_admin",
ServiceClass: "admin-ingress",
})
if err != nil {
t.Fatalf("projection: %v", err)
@@ -133,10 +133,10 @@ func TestAdminRuntimeDispatcherUsesControlAPIProjectionClientForReadRequests(t *
string(response.Body) != `{"schema_version":"control.projection.v1","ok":true}` {
t.Fatalf("response = %+v body=%s", response, string(response.Body))
}
if client.request.Path != "/platform-admin/nodes" ||
if client.request.Path != "/admin/nodes" ||
client.request.Query != "limit=10" ||
client.request.Scope != "platform" ||
client.request.ServiceClass != "platform_admin" {
client.request.ServiceClass != "admin-ingress" {
t.Fatalf("request = %+v", client.request)
}
}
@@ -145,9 +145,9 @@ func TestAdminRuntimeDispatcherReportsProjectionClientFailure(t *testing.T) {
dispatcher := AdminRuntimeDispatcher{ProjectionClient: failingProjectionClient{}, Now: fixedEnvelopeNow}
response, err := dispatcher.HandleFabricRequest(context.Background(), FabricRequest{
Method: http.MethodGet,
Path: "/platform-admin/nodes",
Path: "/admin/nodes",
Scope: "platform",
ServiceClass: "platform_admin",
ServiceClass: "admin-ingress",
})
if err != nil {
t.Fatalf("projection: %v", err)
@@ -175,9 +175,9 @@ func TestAdminRuntimeDispatcherRejectsInvalidProjectionResponseSchema(t *testing
}
response, err := dispatcher.HandleFabricRequest(context.Background(), FabricRequest{
Method: http.MethodGet,
Path: "/platform-admin/nodes",
Path: "/admin/nodes",
Scope: "platform",
ServiceClass: "platform_admin",
ServiceClass: "admin-ingress",
})
if err != nil {
t.Fatalf("projection: %v", err)
@@ -13,7 +13,6 @@ import (
type ListenerConfig struct {
RuntimeConfig
HTTPAddr string
HTTPSAddr string
TLSCertFile string
TLSKeyFile string
@@ -23,9 +22,7 @@ type ListenerConfig struct {
type ListenerStatus struct {
SchemaVersion string `json:"schema_version"`
Running bool `json:"running"`
HTTPRunning bool `json:"http_running"`
HTTPSRunning bool `json:"https_running"`
HTTPAddr string `json:"http_addr,omitempty"`
HTTPSAddr string `json:"https_addr,omitempty"`
Reason string `json:"reason,omitempty"`
Errors []string `json:"errors,omitempty"`
@@ -34,7 +31,6 @@ type ListenerStatus struct {
type Manager struct {
mu sync.Mutex
http *http.Server
https *http.Server
status ListenerStatus
now func() time.Time
@@ -56,19 +52,9 @@ func (m *Manager) Apply(ctx context.Context, cfg ListenerConfig) ListenerStatus
ObservedAt: m.observedAt(),
}
errorsOut := []string{}
if strings.TrimSpace(cfg.HTTPAddr) == "" {
cfg.HTTPAddr = ":80"
}
if strings.TrimSpace(cfg.HTTPSAddr) == "" {
cfg.HTTPSAddr = ":443"
}
if server, addr, err := startHTTPServer(ctx, cfg.HTTPAddr, runtime.HTTPHandler()); err == nil {
m.http = server
status.HTTPRunning = true
status.HTTPAddr = addr
} else {
errorsOut = append(errorsOut, "http:"+err.Error())
}
if cfg.TLSCertFile == "" || cfg.TLSKeyFile == "" {
errorsOut = append(errorsOut, "https:tls_cert_file_and_key_file_required")
} else if server, addr, err := startHTTPSServer(ctx, cfg.HTTPSAddr, cfg.TLSCertFile, cfg.TLSKeyFile, runtime.HTTPSHandler()); err == nil {
@@ -78,7 +64,7 @@ func (m *Manager) Apply(ctx context.Context, cfg ListenerConfig) ListenerStatus
} else {
errorsOut = append(errorsOut, "https:"+err.Error())
}
status.Running = status.HTTPRunning || status.HTTPSRunning
status.Running = status.HTTPSRunning
if len(errorsOut) > 0 {
status.Errors = errorsOut
if status.Running {
@@ -118,10 +104,6 @@ func (m *Manager) Status() ListenerStatus {
func (m *Manager) stopLocked(ctx context.Context) error {
var out error
if m.http != nil {
out = errors.Join(out, m.http.Shutdown(ctx))
m.http = nil
}
if m.https != nil {
out = errors.Join(out, m.https.Shutdown(ctx))
m.https = nil
@@ -137,24 +119,6 @@ func (m *Manager) observedAt() string {
return now.Format(time.RFC3339Nano)
}
func startHTTPServer(ctx context.Context, addr string, handler http.Handler) (*http.Server, string, error) {
listener, err := net.Listen("tcp", addr)
if err != nil {
return nil, "", err
}
server := &http.Server{Handler: handler, ReadHeaderTimeout: 5 * time.Second}
go func() {
<-ctx.Done()
_ = server.Shutdown(context.Background())
}()
go func() {
if err := server.Serve(listener); err != nil && !errors.Is(err, http.ErrServerClosed) {
_ = server.Close()
}
}()
return server, listener.Addr().String(), nil
}
func startHTTPSServer(ctx context.Context, addr, certFile, keyFile string, handler http.Handler) (*http.Server, string, error) {
cert, err := tls.LoadX509KeyPair(certFile, keyFile)
if err != nil {
@@ -8,7 +8,6 @@ import (
"crypto/x509/pkix"
"encoding/pem"
"math/big"
"net/http"
"os"
"path/filepath"
"strings"
@@ -16,37 +15,6 @@ import (
"time"
)
func TestManagerStartsHTTPRedirectAndStops(t *testing.T) {
manager := NewManager()
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
status := manager.Apply(ctx, ListenerConfig{
RuntimeConfig: RuntimeConfig{ServiceType: "admin-ingress", Scope: "platform", ServiceClasses: []string{"platform_admin"}},
HTTPAddr: "127.0.0.1:0",
HTTPSAddr: "127.0.0.1:0",
})
if !status.HTTPRunning || status.HTTPSRunning || !status.Running || status.HTTPAddr == "" {
t.Fatalf("status = %+v", status)
}
if status.Reason != "partial" || !containsError(status.Errors, "https:tls_cert_file_and_key_file_required") {
t.Fatalf("status = %+v", status)
}
client := &http.Client{CheckRedirect: func(*http.Request, []*http.Request) error { return http.ErrUseLastResponse }}
resp, err := client.Get("http://" + status.HTTPAddr + "/cluster-admin")
if err != nil {
t.Fatalf("http get: %v", err)
}
_ = resp.Body.Close()
if resp.StatusCode != http.StatusPermanentRedirect {
t.Fatalf("status = %d", resp.StatusCode)
}
stopped := manager.Stop(context.Background())
if stopped.Running || stopped.Reason != "stopped" {
t.Fatalf("stopped = %+v", stopped)
}
}
func TestManagerStartsHTTPSWhenCertificateProvided(t *testing.T) {
dir := t.TempDir()
certFile, keyFile := writeSelfSignedCert(t, dir)
@@ -56,12 +24,29 @@ func TestManagerStartsHTTPSWhenCertificateProvided(t *testing.T) {
status := manager.Apply(ctx, ListenerConfig{
RuntimeConfig: RuntimeConfig{ServiceType: "admin-ingress", Scope: "platform", ServiceClasses: []string{"platform_admin"}},
HTTPAddr: "127.0.0.1:0",
HTTPSAddr: "127.0.0.1:0",
TLSCertFile: certFile,
TLSKeyFile: keyFile,
})
if !status.HTTPRunning || !status.HTTPSRunning || status.HTTPAddr == "" || status.HTTPSAddr == "" || len(status.Errors) != 0 {
if !status.HTTPSRunning || !status.Running || status.HTTPSAddr == "" || len(status.Errors) != 0 {
t.Fatalf("status = %+v", status)
}
}
func TestManagerDoesNotStartHTTPWithoutExplicitAddress(t *testing.T) {
dir := t.TempDir()
certFile, keyFile := writeSelfSignedCert(t, dir)
manager := NewManager()
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
status := manager.Apply(ctx, ListenerConfig{
RuntimeConfig: RuntimeConfig{ServiceType: "admin-ingress", Scope: "platform", ServiceClasses: []string{"platform_admin"}},
HTTPSAddr: "127.0.0.1:0",
TLSCertFile: certFile,
TLSKeyFile: keyFile,
})
if !status.HTTPSRunning || !status.Running || status.HTTPSAddr == "" || len(status.Errors) != 0 {
t.Fatalf("status = %+v", status)
}
}
@@ -14,7 +14,6 @@ type RuntimeConfig struct {
Scope string
ServiceClasses []string
TLSMode string
HTTPPort int
HTTPSPort int
}
@@ -59,23 +58,6 @@ type Response struct {
ObservedAt string `json:"observed_at"`
}
func (r Runtime) HTTPHandler() http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
if strings.HasPrefix(req.URL.Path, "/.well-known/acme-challenge/") {
writeJSON(w, http.StatusNotFound, r.response("not_found", "acme_challenge_backend_not_configured", ""))
return
}
if req.URL.Path == "/healthz" || req.URL.Path == "/readyz" {
writeJSON(w, http.StatusOK, r.response("ready", "http_redirect_runtime_ready", ""))
return
}
target := "https://" + req.Host + req.URL.RequestURI()
w.Header().Set("Location", target)
w.Header().Set("Cache-Control", "no-store")
w.WriteHeader(http.StatusPermanentRedirect)
})
}
func (r Runtime) HTTPSHandler() http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
if req.URL.Path == "/healthz" || req.URL.Path == "/readyz" {
@@ -98,7 +80,7 @@ func (r Runtime) HTTPSHandler() http.Handler {
writeJSON(w, http.StatusNotImplemented, r.response("blocked", "fabric_service_channel_binding_not_implemented", serviceClass))
return
}
scope := scopeForServiceClass(serviceClass, r.Config.Scope)
scope := scopeForServiceClass(serviceClass, req.URL.Path, r.Config.Scope)
body, err := io.ReadAll(http.MaxBytesReader(w, req.Body, 1<<20))
if err != nil {
writeJSON(w, http.StatusRequestEntityTooLarge, r.response("blocked", "request_body_too_large", serviceClass))
@@ -146,32 +128,38 @@ func (r Runtime) response(status, reason, serviceClass string) Response {
}
}
func scopeForServiceClass(serviceClass string, fallback string) string {
func scopeForServiceClass(serviceClass string, path string, fallback string) string {
path = strings.Trim(strings.ToLower(path), "/")
switch strings.TrimSpace(serviceClass) {
case "platform_admin":
return "platform"
case "cluster_admin":
return "cluster"
case "organization_portal":
return "organization"
case "user_portal":
return "user"
case "admin-ingress":
if strings.HasPrefix(path, "clusters/") {
return "cluster"
}
return firstNonEmpty(strings.TrimSpace(fallback), "platform")
case "public-ingress":
if strings.HasPrefix(path, "users/") {
return "user"
}
return firstNonEmpty(strings.TrimSpace(fallback), "organization")
default:
return strings.TrimSpace(fallback)
}
}
// firstNonEmpty returns value with surrounding whitespace removed. When value
// is empty or whitespace-only it returns the whitespace-trimmed fallback
// instead, which may itself be empty.
func firstNonEmpty(value string, fallback string) string {
	trimmed := strings.TrimSpace(value)
	if trimmed == "" {
		return strings.TrimSpace(fallback)
	}
	return trimmed
}
func serviceClassFromPath(path string) string {
path = strings.Trim(strings.ToLower(path), "/")
switch {
case strings.HasPrefix(path, "platform-admin"):
return "platform_admin"
case strings.HasPrefix(path, "cluster-admin"):
return "cluster_admin"
case strings.HasPrefix(path, "organizations/"):
return "organization_portal"
case strings.HasPrefix(path, "users/"):
return "user_portal"
case strings.HasPrefix(path, "admin/"), strings.HasPrefix(path, "platform/"), strings.HasPrefix(path, "clusters/"):
return "admin-ingress"
case strings.HasPrefix(path, "public/"), strings.HasPrefix(path, "organizations/"), strings.HasPrefix(path, "users/"):
return "public-ingress"
default:
return ""
}
@@ -10,31 +10,16 @@ import (
"time"
)
// TestHTTPHandlerRedirectsToHTTPS sends a plain-HTTP GET through HTTPHandler
// and expects http.StatusPermanentRedirect with a Location header that is the
// same host, path and query under the https scheme.
func TestHTTPHandlerRedirectsToHTTPS(t *testing.T) {
	const wantLocation = "https://admin.example.test/cluster-admin/dashboard?x=1"

	rt := Runtime{Config: RuntimeConfig{ServiceType: "admin-ingress", Scope: "platform"}}
	recorder := httptest.NewRecorder()
	request := httptest.NewRequest(http.MethodGet, "http://admin.example.test/cluster-admin/dashboard?x=1", nil)

	rt.HTTPHandler().ServeHTTP(recorder, request)

	if got := recorder.Code; got != http.StatusPermanentRedirect {
		t.Fatalf("status = %d", got)
	}
	if got := recorder.Header().Get("Location"); got != wantLocation {
		t.Fatalf("Location = %q", got)
	}
}
func TestHTTPSHandlerBlocksUnknownServiceClass(t *testing.T) {
runtime := Runtime{
Config: RuntimeConfig{
ServiceType: "public-ingress",
Scope: "organization",
ServiceClasses: []string{"organization_portal", "user_portal"},
ServiceClasses: []string{"public-ingress", "public-ingress"},
},
Now: fixedNow,
}
req := httptest.NewRequest(http.MethodGet, "https://org.example.test/platform-admin/root", nil)
req := httptest.NewRequest(http.MethodGet, "https://org.example.test/admin/root", nil)
rec := httptest.NewRecorder()
runtime.HTTPSHandler().ServeHTTP(rec, req)
@@ -46,7 +31,7 @@ func TestHTTPSHandlerBlocksUnknownServiceClass(t *testing.T) {
if err := json.Unmarshal(rec.Body.Bytes(), &payload); err != nil {
t.Fatalf("decode response: %v", err)
}
if payload.Reason != "service_class_not_allowed" || payload.ServiceClass != "platform_admin" || payload.Scope != "organization" {
if payload.Reason != "service_class_not_allowed" || payload.ServiceClass != "admin-ingress" || payload.Scope != "organization" {
t.Fatalf("payload = %+v", payload)
}
}
@@ -56,11 +41,11 @@ func TestHTTPSHandlerRequiresFabricServiceChannelBinding(t *testing.T) {
Config: RuntimeConfig{
ServiceType: "admin-ingress",
Scope: "platform",
ServiceClasses: []string{"platform_admin", "cluster_admin"},
ServiceClasses: []string{"admin-ingress", "admin-ingress"},
},
Now: fixedNow,
}
req := httptest.NewRequest(http.MethodPost, "https://admin.example.test/platform-admin/root", nil)
req := httptest.NewRequest(http.MethodPost, "https://admin.example.test/admin/root", nil)
rec := httptest.NewRecorder()
runtime.HTTPSHandler().ServeHTTP(rec, req)
@@ -73,7 +58,7 @@ func TestHTTPSHandlerRequiresFabricServiceChannelBinding(t *testing.T) {
t.Fatalf("decode response: %v", err)
}
if payload.Reason != "fabric_service_channel_binding_not_implemented" ||
payload.ServiceClass != "platform_admin" ||
payload.ServiceClass != "admin-ingress" ||
payload.ObservedAt != "2026-05-17T00:00:00Z" {
t.Fatalf("payload = %+v", payload)
}
@@ -91,13 +76,13 @@ func TestHTTPSHandlerForwardsAllowedRequestToBinder(t *testing.T) {
Config: RuntimeConfig{
ServiceType: "admin-ingress",
Scope: "platform",
ServiceClasses: []string{"platform_admin", "cluster_admin"},
ServiceClasses: []string{"admin-ingress", "admin-ingress"},
},
Binder: binder,
Now: fixedNow,
}
req := httptest.NewRequest(http.MethodPost, "https://admin.example.test/platform-admin/root?tab=nodes", strings.NewReader(`{"hello":"world"}`))
req.Header.Set("X-RAP-Service-Class", "platform_admin")
req := httptest.NewRequest(http.MethodPost, "https://admin.example.test/admin/root?tab=nodes", strings.NewReader(`{"hello":"world"}`))
req.Header.Set("X-RAP-Service-Class", "admin-ingress")
req.Header.Set("Authorization", "Bearer secret")
req.Header.Set("X-Trace-ID", "trace-1")
rec := httptest.NewRecorder()
@@ -110,9 +95,9 @@ func TestHTTPSHandlerForwardsAllowedRequestToBinder(t *testing.T) {
if rec.Header().Get("X-RAP-Result") != "accepted" || rec.Body.String() != `{"ok":true}` {
t.Fatalf("unexpected response headers=%v body=%s", rec.Header(), rec.Body.String())
}
if binder.request.ServiceClass != "platform_admin" ||
if binder.request.ServiceClass != "admin-ingress" ||
binder.request.Scope != "platform" ||
binder.request.Path != "/platform-admin/root" ||
binder.request.Path != "/admin/root" ||
binder.request.Query != "tab=nodes" ||
string(binder.request.Body) != `{"hello":"world"}` {
t.Fatalf("request = %+v", binder.request)
@@ -128,12 +113,12 @@ func TestHTTPSHandlerDerivesFabricScopeFromServiceClass(t *testing.T) {
Config: RuntimeConfig{
ServiceType: "admin-ingress",
Scope: "platform",
ServiceClasses: []string{"platform_admin", "cluster_admin"},
ServiceClasses: []string{"admin-ingress", "admin-ingress"},
},
Binder: binder,
Now: fixedNow,
}
req := httptest.NewRequest(http.MethodGet, "https://admin.example.test/cluster-admin/ui-manifest", nil)
req := httptest.NewRequest(http.MethodGet, "https://admin.example.test/clusters/ui-manifest", nil)
rec := httptest.NewRecorder()
runtime.HTTPSHandler().ServeHTTP(rec, req)
@@ -141,18 +126,18 @@ func TestHTTPSHandlerDerivesFabricScopeFromServiceClass(t *testing.T) {
if rec.Code != http.StatusOK {
t.Fatalf("status = %d body=%s", rec.Code, rec.Body.String())
}
if binder.request.ServiceClass != "cluster_admin" || binder.request.Scope != "cluster" {
if binder.request.ServiceClass != "admin-ingress" || binder.request.Scope != "cluster" {
t.Fatalf("request = %+v", binder.request)
}
}
func TestHTTPSHandlerReportsBinderFailure(t *testing.T) {
runtime := Runtime{
Config: RuntimeConfig{ServiceType: "admin-ingress", Scope: "platform", ServiceClasses: []string{"platform_admin"}},
Config: RuntimeConfig{ServiceType: "admin-ingress", Scope: "platform", ServiceClasses: []string{"admin-ingress"}},
Binder: failingBinder{},
Now: fixedNow,
}
req := httptest.NewRequest(http.MethodPost, "https://admin.example.test/platform-admin/root", nil)
req := httptest.NewRequest(http.MethodPost, "https://admin.example.test/admin/root", nil)
rec := httptest.NewRecorder()
runtime.HTTPSHandler().ServeHTTP(rec, req)