Record project continuation changes
This commit is contained in:
@@ -7,7 +7,7 @@ import (
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
|
||||
)
|
||||
|
||||
const Version = "0.1.0-c3"
|
||||
const Version = "0.2.256-c18z82"
|
||||
|
||||
func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) client.EnrollRequest {
|
||||
return client.EnrollRequest{
|
||||
@@ -17,18 +17,26 @@ func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) cli
|
||||
NodeFingerprint: identity.NodeFingerprint,
|
||||
PublicKey: identity.PublicKey,
|
||||
ReportedCapabilities: map[string]any{
|
||||
"can_accept_client_ingress": false,
|
||||
"can_accept_node_ingress": false,
|
||||
"can_route_mesh": false,
|
||||
"can_run_rdp_worker": true,
|
||||
"can_run_vnc_worker": false,
|
||||
"can_run_vpn_exit": false,
|
||||
"can_run_vpn_connector": false,
|
||||
"can_run_file_cache": false,
|
||||
"can_run_update_cache": false,
|
||||
"can_run_video_relay": false,
|
||||
"native_node_agent_version": Version,
|
||||
"service_supervision_enabled": false,
|
||||
"can_accept_client_ingress": false,
|
||||
"can_accept_node_ingress": false,
|
||||
"can_route_mesh": false,
|
||||
"can_run_rdp_worker": true,
|
||||
"can_run_vnc_worker": false,
|
||||
"can_run_vpn_exit": true,
|
||||
"can_run_vpn_connector": true,
|
||||
"can_run_file_cache": false,
|
||||
"can_run_update_cache": false,
|
||||
"can_run_video_relay": false,
|
||||
"native_node_agent_version": Version,
|
||||
"node_update_plan_contract": "rap.node_update_plan.v1",
|
||||
"node_update_status_report": true,
|
||||
"host_agent_update_required": true,
|
||||
"service_supervision_enabled": false,
|
||||
"vpn_assignment_status": true,
|
||||
"vpn_packet_forwarding": true,
|
||||
"vpn_fabric_packet_transport": true,
|
||||
"vpn_local_gateway_shortcut": true,
|
||||
"external_backend_entry_proxy": true,
|
||||
},
|
||||
ReportedFacts: map[string]any{
|
||||
"os": runtime.GOOS,
|
||||
@@ -45,13 +53,28 @@ func HeartbeatPayload() client.HeartbeatRequest {
|
||||
HealthStatus: "healthy",
|
||||
ReportedVersion: Version,
|
||||
Capabilities: map[string]any{
|
||||
"native_node_agent": true,
|
||||
"native_node_agent": true,
|
||||
"node_update_plan_contract": "rap.node_update_plan.v1",
|
||||
"node_update_status_report": true,
|
||||
"vpn_assignment_status": true,
|
||||
"vpn_packet_forwarding": true,
|
||||
"vpn_fabric_packet_transport": true,
|
||||
"vpn_local_gateway_shortcut": true,
|
||||
"external_backend_entry_proxy": true,
|
||||
},
|
||||
ServiceStates: map[string]any{
|
||||
"workload_supervision": "not_implemented_c3",
|
||||
},
|
||||
Metadata: map[string]any{
|
||||
"stage": "c3",
|
||||
"update_runtime": map[string]any{
|
||||
"product": "rap-node-agent",
|
||||
"current_version": Version,
|
||||
"host_agent_present": true,
|
||||
"self_update_enabled": true,
|
||||
"rollback_executor_ready": true,
|
||||
"reason": "host-agent updater active",
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -260,6 +260,7 @@ type SyntheticMeshRouteConfig struct {
|
||||
}
|
||||
|
||||
type SyntheticMeshConfig struct {
|
||||
Raw json.RawMessage `json:"-"`
|
||||
Enabled bool `json:"enabled"`
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
@@ -286,6 +287,17 @@ type SyntheticMeshConfig struct {
|
||||
ProductionForwarding bool `json:"production_forwarding"`
|
||||
}
|
||||
|
||||
func (c *SyntheticMeshConfig) UnmarshalJSON(data []byte) error {
|
||||
type syntheticMeshConfigAlias SyntheticMeshConfig
|
||||
var decoded syntheticMeshConfigAlias
|
||||
if err := json.Unmarshal(data, &decoded); err != nil {
|
||||
return err
|
||||
}
|
||||
*c = SyntheticMeshConfig(decoded)
|
||||
c.Raw = append(c.Raw[:0], data...)
|
||||
return nil
|
||||
}
|
||||
|
||||
type FabricServiceChannelRemediationCommand struct {
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
CommandID string `json:"command_id"`
|
||||
|
||||
@@ -28,6 +28,9 @@ type Config struct {
|
||||
MeshProductionForwardingEnabled bool
|
||||
MeshProductionObservationSinkCapacity int
|
||||
MeshListenAddr string
|
||||
MeshListenPortMode string
|
||||
MeshListenAutoPortStart int
|
||||
MeshListenAutoPortEnd int
|
||||
MeshAdvertiseEndpoint string
|
||||
MeshAdvertiseEndpointsJSON string
|
||||
MeshAdvertiseTransport string
|
||||
@@ -58,6 +61,9 @@ func Load(args []string, env map[string]string) (Config, error) {
|
||||
fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getEnvBool(env, "RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production fabric-control direct next-hop forwarding gate. Disabled by default.")
|
||||
fs.IntVar(&cfg.MeshProductionObservationSinkCapacity, "mesh-production-observation-sink-capacity", getEnvSignedInt(env, "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Bounded local metadata-only production envelope observation sink capacity. Disabled when 0.")
|
||||
fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getEnv(env, "RAP_MESH_LISTEN_ADDR", ""), "Listen address for disabled-by-default C17E synthetic mesh HTTP endpoint.")
|
||||
fs.StringVar(&cfg.MeshListenPortMode, "mesh-listen-port-mode", getEnv(env, "RAP_MESH_LISTEN_PORT_MODE", "manual"), "Mesh listen port behavior: manual, auto, or disabled.")
|
||||
fs.IntVar(&cfg.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
|
||||
fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
|
||||
fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint reported to the Control Plane. Empty disables endpoint reporting.")
|
||||
fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "JSON array of advertised mesh endpoint candidates, including private/corporate endpoints.")
|
||||
fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getEnv(env, "RAP_MESH_ADVERTISE_TRANSPORT", "direct_tcp_tls"), "Transport label for the advertised mesh endpoint.")
|
||||
@@ -70,7 +76,7 @@ func Load(args []string, env map[string]string) (Config, error) {
|
||||
heartbeatSeconds := getEnvInt(env, "RAP_HEARTBEAT_INTERVAL_SECONDS", 15)
|
||||
fs.DurationVar(&cfg.HeartbeatInterval, "heartbeat-interval", time.Duration(heartbeatSeconds)*time.Second, "Heartbeat interval.")
|
||||
enrollmentPollIntervalSeconds := getEnvInt(env, "RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5)
|
||||
enrollmentPollTimeoutSeconds := getEnvInt(env, "RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 600)
|
||||
enrollmentPollTimeoutSeconds := getEnvSignedInt(env, "RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 0)
|
||||
fs.DurationVar(&cfg.EnrollmentPollInterval, "enrollment-poll-interval", time.Duration(enrollmentPollIntervalSeconds)*time.Second, "Enrollment approval polling interval.")
|
||||
fs.DurationVar(&cfg.EnrollmentPollTimeout, "enrollment-poll-timeout", time.Duration(enrollmentPollTimeoutSeconds)*time.Second, "Enrollment approval polling timeout.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
@@ -84,6 +90,7 @@ func Load(args []string, env map[string]string) (Config, error) {
|
||||
cfg.NodeName = strings.TrimSpace(cfg.NodeName)
|
||||
cfg.StateDir = strings.TrimSpace(cfg.StateDir)
|
||||
cfg.MeshListenAddr = strings.TrimSpace(cfg.MeshListenAddr)
|
||||
cfg.MeshListenPortMode = strings.ToLower(strings.TrimSpace(cfg.MeshListenPortMode))
|
||||
cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/")
|
||||
cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON)
|
||||
cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport)
|
||||
@@ -117,6 +124,20 @@ func Load(args []string, env map[string]string) (Config, error) {
|
||||
if cfg.MeshProductionObservationSinkCapacity > MaxMeshProductionObservationSinkCapacity {
|
||||
return Config{}, errors.New("mesh production observation sink capacity exceeds maximum")
|
||||
}
|
||||
switch cfg.MeshListenPortMode {
|
||||
case "", "manual", "auto", "disabled":
|
||||
if cfg.MeshListenPortMode == "" {
|
||||
cfg.MeshListenPortMode = "manual"
|
||||
}
|
||||
default:
|
||||
return Config{}, errors.New("mesh listen port mode must be manual, auto, or disabled")
|
||||
}
|
||||
if cfg.MeshListenAutoPortStart <= 0 || cfg.MeshListenAutoPortEnd <= 0 {
|
||||
return Config{}, errors.New("mesh listen auto port range must be positive")
|
||||
}
|
||||
if cfg.MeshListenAutoPortStart > cfg.MeshListenAutoPortEnd {
|
||||
return Config{}, errors.New("mesh listen auto port start must be less than or equal to end")
|
||||
}
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -22,6 +22,9 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
|
||||
"RAP_MESH_PRODUCTION_FORWARDING_ENABLED": "true",
|
||||
"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "5",
|
||||
"RAP_MESH_LISTEN_ADDR": "127.0.0.1:19001",
|
||||
"RAP_MESH_LISTEN_PORT_MODE": "auto",
|
||||
"RAP_MESH_LISTEN_AUTO_PORT_START": "19010",
|
||||
"RAP_MESH_LISTEN_AUTO_PORT_END": "19020",
|
||||
"RAP_MESH_ADVERTISE_ENDPOINT": "https://node-a.example.test:443/",
|
||||
"RAP_MESH_ADVERTISE_ENDPOINTS_JSON": `[{"endpoint_id":"node-a-lan","address":"10.10.0.20:19001"}]`,
|
||||
"RAP_MESH_ADVERTISE_TRANSPORT": "wss",
|
||||
@@ -65,6 +68,9 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
|
||||
if cfg.MeshListenAddr != "127.0.0.1:19001" {
|
||||
t.Fatalf("MeshListenAddr = %q", cfg.MeshListenAddr)
|
||||
}
|
||||
if cfg.MeshListenPortMode != "auto" || cfg.MeshListenAutoPortStart != 19010 || cfg.MeshListenAutoPortEnd != 19020 {
|
||||
t.Fatalf("unexpected mesh listen port config: %+v", cfg)
|
||||
}
|
||||
if cfg.MeshAdvertiseEndpoint != "https://node-a.example.test:443" ||
|
||||
cfg.MeshAdvertiseEndpointsJSON == "" ||
|
||||
cfg.MeshAdvertiseTransport != "wss" ||
|
||||
@@ -81,6 +87,19 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadConfigDefaultsEnrollmentPollingToNoTimeout(t *testing.T) {
|
||||
cfg, err := Load(nil, map[string]string{
|
||||
"RAP_BACKEND_URL": "http://backend/api/v1",
|
||||
"RAP_NODE_NAME": "node-a",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("load config: %v", err)
|
||||
}
|
||||
if cfg.EnrollmentPollTimeout != 0 {
|
||||
t.Fatalf("EnrollmentPollTimeout = %s, want no timeout", cfg.EnrollmentPollTimeout)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadConfigRejectsNegativeProductionObservationSinkCapacity(t *testing.T) {
|
||||
_, err := Load(nil, map[string]string{
|
||||
"RAP_BACKEND_URL": "http://backend/api/v1",
|
||||
|
||||
@@ -0,0 +1,135 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Fallback values used for RuntimeConfig fields that are left blank.
const (
	// DefaultContainerName is the docker container name used when none is set.
	DefaultContainerName = "rap-node-agent"
	// DefaultImage is the docker image reference used when none is set.
	DefaultImage = "rap-node-agent:latest"
	// DefaultStateDir is the state directory used when none is set.
	DefaultStateDir = "/var/lib/rap-node-agent"
	// DefaultNetwork is the docker network used when none is set.
	DefaultNetwork = "host"
)
|
||||
|
||||
// RuntimeConfig describes how the node-agent container is installed and which
// settings are handed to it through environment variables.
//
// Field order is unchanged from the original declaration; fields of the same
// type are only grouped onto shared lines.
type RuntimeConfig struct {
	// Control-plane identity and docker placement.
	BackendURL, ClusterID, JoinToken, NodeName string
	Image, ContainerName, StateDir             string
	Network, RestartPolicy                     string

	// Install behaviour and feature switches.
	PullImage, Replace              bool
	DockerVPNGatewayEnabled         bool
	WorkloadSupervisionEnabled      bool
	MeshSyntheticRuntimeEnabled     bool
	MeshProductionForwardingEnabled bool

	// Mesh listener settings.
	MeshListenAddr, MeshListenPortMode             string
	MeshListenAutoPortStart, MeshListenAutoPortEnd int

	// Mesh advertisement and topology hints.
	MeshAdvertiseEndpoint, MeshAdvertiseEndpointsJSON string
	MeshAdvertiseTransport, MeshConnectivityMode      string
	MeshNATType, MeshRegion                           string

	// Timing, expressed in whole seconds.
	HeartbeatIntervalSeconds      int
	EnrollmentPollIntervalSeconds int
	EnrollmentPollTimeoutSeconds  int

	// ExtraEnv holds additional KEY=VALUE entries; AdditionalDockerRunArgs
	// holds extra raw arguments for docker run.
	ExtraEnv, AdditionalDockerRunArgs []string

	ProductionObservationSinkCap int

	// Optional image tarball source used when the image is not pulled.
	ImageArtifactURLs      []string
	ImageArtifactSHA256    string
	ImageArtifactSizeBytes int64
}
|
||||
|
||||
func (cfg RuntimeConfig) Normalize() RuntimeConfig {
|
||||
cfg.BackendURL = strings.TrimRight(strings.TrimSpace(cfg.BackendURL), "/")
|
||||
cfg.ClusterID = strings.TrimSpace(cfg.ClusterID)
|
||||
cfg.JoinToken = strings.TrimSpace(cfg.JoinToken)
|
||||
cfg.NodeName = strings.TrimSpace(cfg.NodeName)
|
||||
cfg.Image = firstNonEmpty(cfg.Image, DefaultImage)
|
||||
cfg.ContainerName = firstNonEmpty(cfg.ContainerName, DefaultContainerName)
|
||||
cfg.StateDir = firstNonEmpty(cfg.StateDir, DefaultStateDir)
|
||||
cfg.Network = firstNonEmpty(cfg.Network, DefaultNetwork)
|
||||
cfg.RestartPolicy = firstNonEmpty(cfg.RestartPolicy, "unless-stopped")
|
||||
cfg.MeshListenAddr = strings.TrimSpace(cfg.MeshListenAddr)
|
||||
cfg.MeshListenPortMode = strings.ToLower(strings.TrimSpace(cfg.MeshListenPortMode))
|
||||
cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/")
|
||||
cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON)
|
||||
cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport)
|
||||
cfg.MeshConnectivityMode = strings.TrimSpace(cfg.MeshConnectivityMode)
|
||||
cfg.MeshNATType = strings.TrimSpace(cfg.MeshNATType)
|
||||
cfg.MeshRegion = strings.TrimSpace(cfg.MeshRegion)
|
||||
cfg.ImageArtifactSHA256 = strings.TrimSpace(cfg.ImageArtifactSHA256)
|
||||
if cfg.HeartbeatIntervalSeconds == 0 {
|
||||
cfg.HeartbeatIntervalSeconds = 15
|
||||
}
|
||||
if cfg.EnrollmentPollIntervalSeconds == 0 {
|
||||
cfg.EnrollmentPollIntervalSeconds = 5
|
||||
}
|
||||
return cfg
|
||||
}
|
||||
|
||||
func (cfg RuntimeConfig) ValidateInstall() error {
|
||||
cfg = cfg.Normalize()
|
||||
var missing []string
|
||||
if cfg.BackendURL == "" {
|
||||
missing = append(missing, "backend-url")
|
||||
}
|
||||
if cfg.ClusterID == "" {
|
||||
missing = append(missing, "cluster-id")
|
||||
}
|
||||
if cfg.NodeName == "" {
|
||||
missing = append(missing, "node-name")
|
||||
}
|
||||
if len(missing) > 0 {
|
||||
return fmt.Errorf("missing required install settings: %s", strings.Join(missing, ", "))
|
||||
}
|
||||
if cfg.JoinToken == "" && !cfg.Replace {
|
||||
return errors.New("join-token is required for first install; pass -replace only when updating an already enrolled local state")
|
||||
}
|
||||
if cfg.HeartbeatIntervalSeconds <= 0 {
|
||||
return errors.New("heartbeat interval must be positive")
|
||||
}
|
||||
if cfg.EnrollmentPollIntervalSeconds <= 0 {
|
||||
return errors.New("enrollment poll interval must be positive")
|
||||
}
|
||||
if cfg.EnrollmentPollTimeoutSeconds < 0 {
|
||||
return errors.New("enrollment poll timeout must not be negative")
|
||||
}
|
||||
switch cfg.MeshListenPortMode {
|
||||
case "", "manual", "auto", "disabled":
|
||||
default:
|
||||
return errors.New("mesh listen port mode must be manual, auto, or disabled")
|
||||
}
|
||||
if cfg.MeshListenAutoPortStart < 0 || cfg.MeshListenAutoPortEnd < 0 {
|
||||
return errors.New("mesh listen auto port range must not be negative")
|
||||
}
|
||||
if cfg.MeshListenAutoPortStart > 0 && cfg.MeshListenAutoPortEnd > 0 && cfg.MeshListenAutoPortStart > cfg.MeshListenAutoPortEnd {
|
||||
return errors.New("mesh listen auto port start must be less than or equal to end")
|
||||
}
|
||||
if cfg.ProductionObservationSinkCap < 0 {
|
||||
return errors.New("production observation sink capacity must not be negative")
|
||||
}
|
||||
for _, item := range cfg.ExtraEnv {
|
||||
if !strings.Contains(item, "=") {
|
||||
return fmt.Errorf("extra env %q must be KEY=VALUE", item)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// firstNonEmpty returns value with surrounding whitespace removed, or
// fallback (unmodified) when value is empty or whitespace-only.
func firstNonEmpty(value, fallback string) string {
	if trimmed := strings.TrimSpace(value); trimmed != "" {
		return trimmed
	}
	return fallback
}
|
||||
@@ -0,0 +1,335 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// CommandRunner abstracts running an external command so that callers can
// substitute their own implementation for ExecRunner.
type CommandRunner interface {
	// Run executes name with args and returns the command's output together
	// with any execution error.
	Run(ctx context.Context, name string, args ...string) (string, error)
}
|
||||
|
||||
// ExecRunner is the CommandRunner that runs real processes through os/exec.
type ExecRunner struct{}

// Run executes name with args under ctx and returns the combined stdout and
// stderr of the process.
//
// On failure the captured output is still returned, and the error carries the
// command line, the underlying error (wrapped) and the trimmed output.
func (ExecRunner) Run(ctx context.Context, name string, args ...string) (string, error) {
	raw, runErr := exec.CommandContext(ctx, name, args...).CombinedOutput()
	output := string(raw)
	if runErr == nil {
		return output, nil
	}
	commandLine := strings.Join(args, " ")
	return output, fmt.Errorf("%s %s: %w\n%s", name, commandLine, runErr, strings.TrimSpace(output))
}
|
||||
|
||||
type DockerManager struct {
|
||||
Runner CommandRunner
|
||||
Binary string
|
||||
}
|
||||
|
||||
// statHostPath is the function used to stat paths on the host. It defaults to
// os.Stat and is kept in a package variable so it can be replaced
// (presumably by tests — confirm).
var statHostPath = os.Stat
|
||||
|
||||
// InstallResult reports what DockerManager.Install did.
//
// Field order is unchanged from the original declaration; fields of the same
// type are only grouped onto shared lines.
type InstallResult struct {
	// ContainerName and Image are the normalized values the install used.
	ContainerName, Image string
	// Replaced is set after the existing container was force-removed, Pulled
	// after a docker pull, and Loaded after an image artifact was loaded.
	Replaced, Pulled, Loaded bool
	// ContainerID is the trimmed output of docker run.
	ContainerID string
}
|
||||
|
||||
func (m DockerManager) Install(ctx context.Context, cfg RuntimeConfig) (InstallResult, error) {
|
||||
if err := cfg.ValidateInstall(); err != nil {
|
||||
return InstallResult{}, err
|
||||
}
|
||||
cfg = cfg.Normalize()
|
||||
runner := m.Runner
|
||||
if runner == nil {
|
||||
runner = ExecRunner{}
|
||||
}
|
||||
docker := firstNonEmpty(m.Binary, "docker")
|
||||
result := InstallResult{ContainerName: cfg.ContainerName, Image: cfg.Image}
|
||||
|
||||
if err := PrepareStateDir(cfg.StateDir); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if cfg.DockerVPNGatewayEnabled {
|
||||
if err := ensureHostTunDevice(ctx, runner); err != nil {
|
||||
return result, err
|
||||
}
|
||||
}
|
||||
|
||||
if cfg.PullImage {
|
||||
if _, err := runner.Run(ctx, docker, "pull", cfg.Image); err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.Pulled = true
|
||||
} else if len(cfg.ImageArtifactURLs) > 0 {
|
||||
loaded, err := m.ensureImageFromArtifact(ctx, runner, docker, cfg)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.Loaded = loaded
|
||||
}
|
||||
|
||||
if cfg.Replace {
|
||||
if _, err := runner.Run(ctx, docker, "rm", "-f", cfg.ContainerName); err != nil && !isNoSuchContainerError(err) {
|
||||
return result, err
|
||||
}
|
||||
result.Replaced = true
|
||||
}
|
||||
|
||||
args := DockerRunArgs(cfg)
|
||||
out, err := runner.Run(ctx, docker, args...)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.ContainerID = strings.TrimSpace(out)
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func ensureHostTunDevice(ctx context.Context, runner CommandRunner) error {
|
||||
if _, err := statHostPath("/dev/net/tun"); err == nil {
|
||||
return nil
|
||||
}
|
||||
if _, err := runner.Run(ctx, "modprobe", "tun"); err != nil {
|
||||
return fmt.Errorf("docker vpn gateway requires host /dev/net/tun; modprobe tun failed: %w", err)
|
||||
}
|
||||
if _, err := statHostPath("/dev/net/tun"); err != nil {
|
||||
return fmt.Errorf("docker vpn gateway requires host /dev/net/tun after modprobe tun: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m DockerManager) ensureImageFromArtifact(ctx context.Context, runner CommandRunner, docker string, cfg RuntimeConfig) (bool, error) {
|
||||
if _, err := runner.Run(ctx, docker, "image", "inspect", cfg.Image); err == nil && !cfg.Replace {
|
||||
return false, nil
|
||||
}
|
||||
path, err := downloadFirstArtifact(ctx, cfg.ImageArtifactURLs, cfg.ImageArtifactSHA256, cfg.ImageArtifactSizeBytes)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
defer os.Remove(path)
|
||||
if _, err := runner.Run(ctx, docker, "load", "-i", path); err != nil {
|
||||
return false, err
|
||||
}
|
||||
if _, err := runner.Run(ctx, docker, "image", "inspect", cfg.Image); err != nil {
|
||||
return true, fmt.Errorf("loaded artifact but image %q is not available: %w", cfg.Image, err)
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
func downloadFirstArtifact(ctx context.Context, urls []string, expectedSHA256 string, expectedSizeBytes int64) (string, error) {
|
||||
var lastErr error
|
||||
for _, rawURL := range urls {
|
||||
rawURL = strings.TrimSpace(rawURL)
|
||||
if rawURL == "" {
|
||||
continue
|
||||
}
|
||||
for attempt := 1; attempt <= 3; attempt++ {
|
||||
path, err := downloadArtifact(ctx, rawURL, expectedSHA256, expectedSizeBytes)
|
||||
if err == nil {
|
||||
return path, nil
|
||||
}
|
||||
lastErr = err
|
||||
}
|
||||
}
|
||||
if lastErr != nil {
|
||||
return "", lastErr
|
||||
}
|
||||
return "", fmt.Errorf("no artifact URLs configured")
|
||||
}
|
||||
|
||||
// downloadArtifact fetches rawURL into a new temporary file and returns the
// file's path.
//
// The download is rejected, and the temporary file removed, when the response
// status is not 2xx, when fewer or more bytes arrive than the response's
// Content-Length, or when expectedSHA256 is set and the SHA-256 of the body
// differs (compared case-insensitively). A mismatch with a positive
// expectedSizeBytes is fatal only when a checksum was supplied; without one
// it is printed to standard output and the download is accepted.
//
// On success the caller owns the returned file and must remove it.
func downloadArtifact(ctx context.Context, rawURL, expectedSHA256 string, expectedSizeBytes int64) (string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
	if err != nil {
		return "", err
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("download artifact %s: %w", rawURL, err)
	}
	defer resp.Body.Close()
	if code := resp.StatusCode; code < 200 || code >= 300 {
		return "", fmt.Errorf("download artifact %s: %s", rawURL, resp.Status)
	}

	tmp, err := os.CreateTemp("", "rap-docker-image-*.tar")
	if err != nil {
		return "", err
	}
	path := tmp.Name()

	// Every failure after this point discards the partial file; only the
	// success path flips keep.
	keep := false
	defer func() {
		if !keep {
			os.Remove(path)
		}
	}()

	digest := sha256.New()
	written, copyErr := io.Copy(io.MultiWriter(tmp, digest), resp.Body)
	closeErr := tmp.Close()
	switch {
	case copyErr != nil:
		return "", copyErr
	case closeErr != nil:
		return "", closeErr
	}

	if resp.ContentLength >= 0 && written != resp.ContentLength {
		return "", fmt.Errorf("artifact download truncated for %s: got %d bytes want content-length %d", rawURL, written, resp.ContentLength)
	}

	wantSHA := strings.TrimSpace(expectedSHA256)
	if expectedSizeBytes > 0 && written != expectedSizeBytes {
		if wantSHA != "" {
			return "", fmt.Errorf("artifact size mismatch for %s: got %d bytes want %d", rawURL, written, expectedSizeBytes)
		}
		fmt.Printf("artifact size mismatch for %s: got %d bytes want %d; proceeding without checksum for backward-compatible installs\n", rawURL, written, expectedSizeBytes)
	}

	gotSHA := hex.EncodeToString(digest.Sum(nil))
	if wantSHA != "" && !strings.EqualFold(gotSHA, wantSHA) {
		return "", fmt.Errorf("artifact checksum mismatch for %s: got %s want %s", rawURL, gotSHA, wantSHA)
	}

	keep = true
	return path, nil
}
|
||||
|
||||
func (m DockerManager) Status(ctx context.Context, containerName string) (string, error) {
|
||||
containerName = firstNonEmpty(containerName, DefaultContainerName)
|
||||
runner := m.Runner
|
||||
if runner == nil {
|
||||
runner = ExecRunner{}
|
||||
}
|
||||
docker := firstNonEmpty(m.Binary, "docker")
|
||||
return runner.Run(ctx, docker, "ps", "-a", "--filter", "name=^/"+containerName+"$", "--format", "{{.Names}}\t{{.Image}}\t{{.Status}}")
|
||||
}
|
||||
|
||||
func PrepareStateDir(stateDir string) error {
|
||||
stateDir = strings.TrimSpace(stateDir)
|
||||
if stateDir == "" || !looksLikeHostPath(stateDir) {
|
||||
return nil
|
||||
}
|
||||
if err := os.MkdirAll(stateDir, 0o777); err != nil {
|
||||
return fmt.Errorf("prepare state dir %q: %w", stateDir, err)
|
||||
}
|
||||
if err := os.Chmod(stateDir, 0o777); err != nil {
|
||||
if isAccessDenied(err) {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("chmod state dir %q: %w", stateDir, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func DockerRunArgs(cfg RuntimeConfig) []string {
|
||||
cfg = cfg.Normalize()
|
||||
args := []string{
|
||||
"run", "-d",
|
||||
"--name", cfg.ContainerName,
|
||||
"--restart", cfg.RestartPolicy,
|
||||
"--network", cfg.Network,
|
||||
"-v", cfg.StateDir + ":/var/lib/rap-node-agent",
|
||||
}
|
||||
if cfg.DockerVPNGatewayEnabled {
|
||||
args = append(args,
|
||||
"--privileged",
|
||||
"--cap-add", "NET_ADMIN",
|
||||
"--device", "/dev/net/tun:/dev/net/tun",
|
||||
)
|
||||
}
|
||||
args = append(args, cfg.AdditionalDockerRunArgs...)
|
||||
for _, env := range NodeAgentEnv(cfg) {
|
||||
args = append(args, "-e", env)
|
||||
}
|
||||
args = append(args, cfg.Image)
|
||||
return args
|
||||
}
|
||||
|
||||
func NodeAgentEnv(cfg RuntimeConfig) []string {
|
||||
return NodeAgentEnvWithStateDir(cfg, "/var/lib/rap-node-agent")
|
||||
}
|
||||
|
||||
func NodeAgentEnvWithStateDir(cfg RuntimeConfig, stateDir string) []string {
|
||||
cfg = cfg.Normalize()
|
||||
stateDir = firstNonEmpty(stateDir, cfg.StateDir)
|
||||
env := []string{
|
||||
"RAP_BACKEND_URL=" + cfg.BackendURL,
|
||||
"RAP_CLUSTER_ID=" + cfg.ClusterID,
|
||||
"RAP_NODE_NAME=" + cfg.NodeName,
|
||||
"RAP_NODE_STATE_DIR=" + stateDir,
|
||||
"RAP_HEARTBEAT_INTERVAL_SECONDS=" + strconv.Itoa(cfg.HeartbeatIntervalSeconds),
|
||||
"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS=" + strconv.Itoa(cfg.EnrollmentPollIntervalSeconds),
|
||||
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=" + strconv.Itoa(cfg.EnrollmentPollTimeoutSeconds),
|
||||
"RAP_WORKLOAD_SUPERVISION_ENABLED=" + boolString(cfg.WorkloadSupervisionEnabled),
|
||||
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=" + boolString(cfg.MeshSyntheticRuntimeEnabled),
|
||||
"RAP_MESH_PRODUCTION_FORWARDING_ENABLED=" + boolString(cfg.MeshProductionForwardingEnabled),
|
||||
}
|
||||
if cfg.JoinToken != "" {
|
||||
env = append(env, "RAP_JOIN_TOKEN="+cfg.JoinToken)
|
||||
}
|
||||
if cfg.MeshListenAddr != "" {
|
||||
env = append(env, "RAP_MESH_LISTEN_ADDR="+cfg.MeshListenAddr)
|
||||
}
|
||||
if cfg.MeshListenPortMode != "" {
|
||||
env = append(env, "RAP_MESH_LISTEN_PORT_MODE="+cfg.MeshListenPortMode)
|
||||
}
|
||||
if cfg.MeshListenAutoPortStart > 0 {
|
||||
env = append(env, "RAP_MESH_LISTEN_AUTO_PORT_START="+strconv.Itoa(cfg.MeshListenAutoPortStart))
|
||||
}
|
||||
if cfg.MeshListenAutoPortEnd > 0 {
|
||||
env = append(env, "RAP_MESH_LISTEN_AUTO_PORT_END="+strconv.Itoa(cfg.MeshListenAutoPortEnd))
|
||||
}
|
||||
if cfg.MeshAdvertiseEndpoint != "" {
|
||||
env = append(env, "RAP_MESH_ADVERTISE_ENDPOINT="+cfg.MeshAdvertiseEndpoint)
|
||||
}
|
||||
if cfg.MeshAdvertiseEndpointsJSON != "" {
|
||||
env = append(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON="+cfg.MeshAdvertiseEndpointsJSON)
|
||||
}
|
||||
if cfg.MeshAdvertiseTransport != "" {
|
||||
env = append(env, "RAP_MESH_ADVERTISE_TRANSPORT="+cfg.MeshAdvertiseTransport)
|
||||
}
|
||||
if cfg.MeshConnectivityMode != "" {
|
||||
env = append(env, "RAP_MESH_CONNECTIVITY_MODE="+cfg.MeshConnectivityMode)
|
||||
}
|
||||
if cfg.MeshNATType != "" {
|
||||
env = append(env, "RAP_MESH_NAT_TYPE="+cfg.MeshNATType)
|
||||
}
|
||||
if cfg.MeshRegion != "" {
|
||||
env = append(env, "RAP_MESH_REGION="+cfg.MeshRegion)
|
||||
}
|
||||
if cfg.ProductionObservationSinkCap > 0 {
|
||||
env = append(env, "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY="+strconv.Itoa(cfg.ProductionObservationSinkCap))
|
||||
}
|
||||
env = append(env, cfg.ExtraEnv...)
|
||||
return env
|
||||
}
|
||||
|
||||
// RedactedArgs returns a copy of args in which the value of RAP_JOIN_TOKEN is
// replaced by "***", so that a docker command line can be logged without
// exposing the join token. The input slice is never modified.
//
// The token is masked when it is passed as an environment flag in any of
// these forms:
//
//	-e RAP_JOIN_TOKEN=...
//	--env RAP_JOIN_TOKEN=...
//	--env=RAP_JOIN_TOKEN=...
//
// The long forms matter because caller-supplied extra docker arguments are
// not limited to the "-e" spelling. All other arguments are returned
// unchanged.
func RedactedArgs(args []string) []string {
	const (
		tokenPrefix = "RAP_JOIN_TOKEN="
		masked      = tokenPrefix + "***"
		longInline  = "--env="
	)

	out := append([]string(nil), args...)
	for i, arg := range out {
		switch {
		case arg == "-e" || arg == "--env":
			// Flag and value are separate arguments; mask the value.
			if i+1 < len(out) && strings.HasPrefix(out[i+1], tokenPrefix) {
				out[i+1] = masked
			}
		case strings.HasPrefix(arg, longInline+tokenPrefix):
			out[i] = longInline + masked
		}
	}
	return out
}
|
||||
|
||||
// isNoSuchContainerError reports whether err's message says that the
// container (or object) being addressed does not exist. The match is a
// case-insensitive substring test for "no such container" or
// "no such object".
//
// A nil error is never a "no such container" error; it is handled explicitly
// so the function cannot panic on err.Error().
func isNoSuchContainerError(err error) bool {
	if err == nil {
		return false
	}
	message := strings.ToLower(err.Error())
	return strings.Contains(message, "no such container") || strings.Contains(message, "no such object")
}
|
||||
|
||||
// looksLikeHostPath reports whether value is written like a filesystem path:
// it is absolute, starts with "." or "~", or contains a slash or backslash.
func looksLikeHostPath(value string) bool {
	switch {
	case filepath.IsAbs(value):
		return true
	case strings.HasPrefix(value, "."), strings.HasPrefix(value, "~"):
		return true
	case strings.Contains(value, "/"), strings.Contains(value, `\`):
		return true
	default:
		return false
	}
}
|
||||
|
||||
// boolString renders value as the literal "true" or "false".
func boolString(value bool) string {
	return strconv.FormatBool(value)
}
|
||||
@@ -0,0 +1,366 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// recordingRunner is a test double that records every command it is asked to
// run and never fails.
type recordingRunner struct {
	calls [][]string // one entry per Run call: name followed by its args
}

// Run records the call. It answers "container-1\n" when the first argument is
// "run" and an empty string otherwise.
func (r *recordingRunner) Run(_ context.Context, name string, args ...string) (string, error) {
	call := make([]string, 0, len(args)+1)
	call = append(call, name)
	call = append(call, args...)
	r.calls = append(r.calls, call)

	if len(args) == 0 || args[0] != "run" {
		return "", nil
	}
	return "container-1\n", nil
}
|
||||
|
||||
// imageMissingRunner is a test double whose first "image inspect" call fails
// and whose later ones succeed.
type imageMissingRunner struct {
	calls       [][]string // one entry per Run call: name followed by its args
	inspectSeen int        // number of "image inspect" calls so far
}

// Run records the call. The first "image inspect" returns a "No such image"
// error and later ones return "[]"; "run" returns "container-1\n"; every
// other command returns an empty string.
func (r *imageMissingRunner) Run(_ context.Context, name string, args ...string) (string, error) {
	r.calls = append(r.calls, append([]string{name}, args...))

	isInspect := len(args) >= 3 && args[0] == "image" && args[1] == "inspect"
	switch {
	case isInspect:
		r.inspectSeen++
		if r.inspectSeen == 1 {
			return "", fmt.Errorf("No such image")
		}
		return "[]", nil
	case len(args) > 0 && args[0] == "run":
		return "container-1\n", nil
	default:
		return "", nil
	}
}
|
||||
|
||||
// imagePresentRunner is a test double for which every command succeeds, so
// an "image inspect" always finds the image.
type imagePresentRunner struct {
	calls [][]string // one entry per Run call: name followed by its args
}

// Run records the call. It answers "container-1\n" when the first argument is
// "run" and "[]" for everything else.
func (r *imagePresentRunner) Run(_ context.Context, name string, args ...string) (string, error) {
	r.calls = append(r.calls, append([]string{name}, args...))

	reply := "[]"
	if len(args) > 0 && args[0] == "run" {
		reply = "container-1\n"
	}
	return reply, nil
}
|
||||
|
||||
func TestDockerRunArgsBuildNodeRuntimePlacement(t *testing.T) {
|
||||
args := DockerRunArgs(RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1/",
|
||||
ClusterID: "cluster-1",
|
||||
JoinToken: "join-secret",
|
||||
NodeName: "node-a",
|
||||
Image: "rap-node-agent:test",
|
||||
ContainerName: "rap-node-agent-node-a",
|
||||
StateDir: "/srv/rap/node-a",
|
||||
MeshSyntheticRuntimeEnabled: true,
|
||||
MeshListenAddr: ":19131",
|
||||
MeshAdvertiseEndpoint: "http://10.0.0.11:19131/",
|
||||
MeshConnectivityMode: "private_lan",
|
||||
})
|
||||
|
||||
joined := strings.Join(args, "\x00")
|
||||
for _, want := range []string{
|
||||
"run", "-d", "--name\x00rap-node-agent-node-a", "--network\x00host",
|
||||
"-v\x00/srv/rap/node-a:/var/lib/rap-node-agent",
|
||||
"RAP_BACKEND_URL=http://control/api/v1",
|
||||
"RAP_CLUSTER_ID=cluster-1",
|
||||
"RAP_JOIN_TOKEN=join-secret",
|
||||
"RAP_NODE_STATE_DIR=/var/lib/rap-node-agent",
|
||||
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=0",
|
||||
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true",
|
||||
"RAP_MESH_LISTEN_ADDR=:19131",
|
||||
"RAP_MESH_ADVERTISE_ENDPOINT=http://10.0.0.11:19131",
|
||||
"RAP_MESH_CONNECTIVITY_MODE=private_lan",
|
||||
"rap-node-agent:test",
|
||||
} {
|
||||
if !strings.Contains(joined, want) {
|
||||
t.Fatalf("docker args missing %q in %#v", want, args)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDockerRunArgsEnableVPNGatewayDevice(t *testing.T) {
|
||||
args := DockerRunArgs(RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
JoinToken: "join-secret",
|
||||
NodeName: "node-a",
|
||||
StateDir: "rap-node-state",
|
||||
DockerVPNGatewayEnabled: true,
|
||||
})
|
||||
|
||||
joined := strings.Join(args, "\x00")
|
||||
for _, want := range []string{
|
||||
"--privileged",
|
||||
"--cap-add\x00NET_ADMIN",
|
||||
"--device\x00/dev/net/tun:/dev/net/tun",
|
||||
} {
|
||||
if !strings.Contains(joined, want) {
|
||||
t.Fatalf("docker vpn gateway args missing %q in %#v", want, args)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPrepareStateDirCreatesWritableHostPath(t *testing.T) {
|
||||
dir := filepath.Join(t.TempDir(), "node-state")
|
||||
if err := PrepareStateDir(dir); err != nil {
|
||||
t.Fatalf("prepare state dir: %v", err)
|
||||
}
|
||||
info, err := os.Stat(dir)
|
||||
if err != nil {
|
||||
t.Fatalf("stat state dir: %v", err)
|
||||
}
|
||||
if !info.IsDir() {
|
||||
t.Fatalf("state path is not a directory")
|
||||
}
|
||||
if info.Mode().Perm()&0o777 != 0o777 {
|
||||
t.Fatalf("state dir mode = %v, want writable for container nonroot user", info.Mode().Perm())
|
||||
}
|
||||
}
|
||||
|
||||
func TestPrepareStateDirSkipsNamedVolume(t *testing.T) {
|
||||
if err := PrepareStateDir("rap-node-state"); err != nil {
|
||||
t.Fatalf("named volume should be ignored: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFetchDockerInstallProfileBuildsRuntimeConfig(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path != "/api/v1/node-agents/docker-install-profile" {
|
||||
t.Fatalf("path = %s", r.URL.Path)
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"docker_install_profile": map[string]any{
|
||||
"cluster_id": "cluster-1",
|
||||
"backend_url": "https://control.example.test/api/v1",
|
||||
"join_token": "rap_join_profile",
|
||||
"node_name": "node-a",
|
||||
"image": "rap-node-agent:test",
|
||||
"artifact_endpoints": []string{"https://cache.example.test/artifacts"},
|
||||
"docker_image_artifact": map[string]any{
|
||||
"kind": "docker_image_tar",
|
||||
"image": "rap-node-agent:test",
|
||||
"file_name": "rap-node-agent-test.tar",
|
||||
"size_bytes": 21,
|
||||
},
|
||||
"container_name": "rap-node-agent-node-a",
|
||||
"state_dir": "/var/lib/rap/nodes/node-a",
|
||||
"network": "host",
|
||||
"restart_policy": "unless-stopped",
|
||||
"replace": true,
|
||||
"mesh_synthetic_runtime_enabled": true,
|
||||
"mesh_connectivity_mode": "outbound_only",
|
||||
},
|
||||
})
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
profile, err := FetchDockerInstallProfile(context.Background(), ProfileRequest{
|
||||
URL: server.URL + "/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
InstallToken: "rap_join_profile",
|
||||
NodeName: "node-a",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("fetch profile: %v", err)
|
||||
}
|
||||
cfg := RuntimeConfigFromProfile(profile).Normalize()
|
||||
if cfg.BackendURL != "https://control.example.test/api/v1" ||
|
||||
cfg.ClusterID != "cluster-1" ||
|
||||
cfg.JoinToken != "rap_join_profile" ||
|
||||
cfg.ContainerName != "rap-node-agent-node-a" ||
|
||||
len(cfg.ImageArtifactURLs) != 1 ||
|
||||
cfg.ImageArtifactSizeBytes != 21 ||
|
||||
!cfg.MeshSyntheticRuntimeEnabled ||
|
||||
cfg.MeshConnectivityMode != "outbound_only" {
|
||||
t.Fatalf("unexpected cfg: %+v", cfg)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInstallLoadsImageArtifactWhenImageMissing(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
_, _ = w.Write([]byte("fake docker image tar"))
|
||||
}))
|
||||
defer server.Close()
|
||||
runner := &imageMissingRunner{}
|
||||
|
||||
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
JoinToken: "join-secret",
|
||||
NodeName: "node-a",
|
||||
Image: "rap-node-agent:test",
|
||||
ContainerName: "rap-node-agent-node-a",
|
||||
StateDir: "rap-node-state",
|
||||
Replace: true,
|
||||
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
|
||||
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
|
||||
ImageArtifactSizeBytes: 21,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("install: %v", err)
|
||||
}
|
||||
if !result.Loaded || result.ContainerID != "container-1" {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
joined := strings.Join(flattenCalls(runner.calls), "\x00")
|
||||
if !strings.Contains(joined, "load\x00-i") || !strings.Contains(joined, "run\x00-d") {
|
||||
t.Fatalf("expected docker load and run calls, got %#v", runner.calls)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInstallAcceptsSizeMismatchWhenChecksumMissing(t *testing.T) {
|
||||
const payload = "fake docker image tar"
|
||||
const wrongSize = 999
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
_, _ = w.Write([]byte(payload))
|
||||
}))
|
||||
defer server.Close()
|
||||
runner := &imageMissingRunner{}
|
||||
|
||||
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
JoinToken: "join-secret",
|
||||
NodeName: "node-a",
|
||||
Image: "rap-node-agent:test",
|
||||
ContainerName: "rap-node-agent-node-a",
|
||||
StateDir: "rap-node-state",
|
||||
Replace: true,
|
||||
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
|
||||
ImageArtifactSHA256: "", // intentionally absent -> size mismatch should not block install
|
||||
ImageArtifactSizeBytes: wrongSize,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("install: %v", err)
|
||||
}
|
||||
if !result.Loaded || result.ContainerID != "container-1" {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInstallReloadsImageArtifactWhenReplacingMutableTag(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
_, _ = w.Write([]byte("fake docker image tar"))
|
||||
}))
|
||||
defer server.Close()
|
||||
runner := &imagePresentRunner{}
|
||||
|
||||
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
JoinToken: "join-secret",
|
||||
NodeName: "node-a",
|
||||
Image: "rap-node-agent:test",
|
||||
ContainerName: "rap-node-agent-node-a",
|
||||
StateDir: "rap-node-state",
|
||||
Replace: true,
|
||||
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
|
||||
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
|
||||
ImageArtifactSizeBytes: 21,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("install: %v", err)
|
||||
}
|
||||
if !result.Loaded {
|
||||
t.Fatalf("expected image artifact reload, got %+v", result)
|
||||
}
|
||||
joined := strings.Join(flattenCalls(runner.calls), "\x00")
|
||||
if !strings.Contains(joined, "load\x00-i") {
|
||||
t.Fatalf("expected docker load even when image exists during replace, got %#v", runner.calls)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDockerInstallLoadsExplicitArtifactBeforeReplace(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.URL.Path != "/rap-node-agent-test.tar" {
|
||||
t.Fatalf("unexpected path %s", r.URL.Path)
|
||||
}
|
||||
_, _ = w.Write([]byte("fake docker image tar"))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
runner := &imageMissingRunner{}
|
||||
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
JoinToken: "join-secret",
|
||||
NodeName: "node-a",
|
||||
Image: "rap-node-agent:test",
|
||||
ContainerName: "rap-node-agent-node-a",
|
||||
StateDir: "rap-node-state",
|
||||
Replace: true,
|
||||
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
|
||||
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
|
||||
ImageArtifactSizeBytes: 21,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("install: %v", err)
|
||||
}
|
||||
if !result.Loaded || !result.Replaced {
|
||||
t.Fatalf("expected explicit artifact load and replace, got %+v", result)
|
||||
}
|
||||
joined := strings.Join(flattenCalls(runner.calls), "\x00")
|
||||
if !strings.Contains(joined, "load\x00-i") {
|
||||
t.Fatalf("expected docker load call, got %#v", runner.calls)
|
||||
}
|
||||
}
|
||||
|
||||
// flattenCalls concatenates every recorded call into a single argument list,
// keeping the calls and their arguments in order. The result is never nil,
// even when calls is empty.
func flattenCalls(calls [][]string) []string {
	total := 0
	for i := range calls {
		total += len(calls[i])
	}
	flat := make([]string, 0, total)
	for i := range calls {
		flat = append(flat, calls[i]...)
	}
	return flat
}
|
||||
|
||||
func TestInstallCanPullReplaceAndRedactsJoinToken(t *testing.T) {
|
||||
runner := &recordingRunner{}
|
||||
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
JoinToken: "join-secret",
|
||||
NodeName: "node-a",
|
||||
PullImage: true,
|
||||
Replace: true,
|
||||
ContainerName: "rap-node-agent-node-a",
|
||||
StateDir: "rap-node-state",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("install: %v", err)
|
||||
}
|
||||
if !result.Pulled || !result.Replaced || result.ContainerID != "container-1" {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
if len(runner.calls) != 3 {
|
||||
t.Fatalf("calls = %#v", runner.calls)
|
||||
}
|
||||
redacted := strings.Join(RedactedArgs(runner.calls[2][1:]), " ")
|
||||
if strings.Contains(redacted, "join-secret") || !strings.Contains(redacted, "RAP_JOIN_TOKEN=***") {
|
||||
t.Fatalf("redacted args leaked token: %s", redacted)
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidateRequiresJoinTokenUnlessReplacingExistingState(t *testing.T) {
|
||||
err := RuntimeConfig{BackendURL: "http://control/api/v1", ClusterID: "cluster-1", NodeName: "node-a"}.ValidateInstall()
|
||||
if err == nil || !strings.Contains(err.Error(), "join-token") {
|
||||
t.Fatalf("expected join token validation error, got %v", err)
|
||||
}
|
||||
err = RuntimeConfig{BackendURL: "http://control/api/v1", ClusterID: "cluster-1", NodeName: "node-a", Replace: true}.ValidateInstall()
|
||||
if err != nil {
|
||||
t.Fatalf("replace update should allow missing join token: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,481 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Default parent directories for a Linux install. A per-node slug is joined
// onto each root when the corresponding directory is not set explicitly.
const (
	// DefaultLinuxInstallRoot is the parent of the per-node install directory.
	DefaultLinuxInstallRoot = "/opt/rap"

	// DefaultLinuxStateRoot is the parent of the per-node state directory.
	DefaultLinuxStateRoot = "/var/lib/rap/nodes"

	// DefaultLinuxConfigRoot is the parent of the per-node config directory.
	DefaultLinuxConfigRoot = "/etc/rap"
)
|
||||
|
||||
type LinuxInstallConfig struct {
|
||||
RuntimeConfig RuntimeConfig
|
||||
NodeID string
|
||||
InstallDir string
|
||||
StateDir string
|
||||
ConfigDir string
|
||||
UnitDir string
|
||||
StartupMode string
|
||||
ArtifactURLs []string
|
||||
ArtifactSHA256 string
|
||||
ArtifactSizeBytes int64
|
||||
Replace bool
|
||||
DryRun bool
|
||||
AutoUpdateEnabled bool
|
||||
AutoUpdateCurrentVersion string
|
||||
AutoUpdateChannel string
|
||||
AutoUpdateIntervalSeconds int
|
||||
AutoUpdateInitialDelaySeconds int
|
||||
AutoUpdateHealthTimeoutSeconds int
|
||||
HostAgentSourcePath string
|
||||
}
|
||||
|
||||
// LinuxInstallResult reports the paths and unit names computed by a Linux
// install, together with flags describing which steps actually ran.
type LinuxInstallResult struct {
	NodeName string

	// Resolved directories.
	InstallDir string
	StateDir   string
	ConfigDir  string

	// Installed file locations.
	NodeAgentPath string
	HostAgentPath string
	EnvPath       string

	// systemd units.
	UnitName        string
	UnitPath        string
	UpdaterUnitName string

	// Downloaded is set once the node-agent artifact was fetched and copied.
	Downloaded bool
	// Started is set once the node-agent unit was enabled and started.
	Started bool
	// UpdaterStarted is set once the updater unit was enabled and started.
	UpdaterStarted bool
}
|
||||
|
||||
type LinuxManager struct {
|
||||
Runner CommandRunner
|
||||
}
|
||||
|
||||
func LinuxInstallConfigFromProfile(profile LinuxInstallProfile) LinuxInstallConfig {
|
||||
stateDir := firstNonEmpty(profile.StateDir, filepath.Join(DefaultLinuxStateRoot, safeUnitSlug(profile.NodeName)))
|
||||
installDir := firstNonEmpty(profile.InstallDir, filepath.Join(DefaultLinuxInstallRoot, safeUnitSlug(profile.NodeName)))
|
||||
return LinuxInstallConfig{
|
||||
RuntimeConfig: RuntimeConfig{
|
||||
BackendURL: profile.BackendURL,
|
||||
ClusterID: profile.ClusterID,
|
||||
JoinToken: profile.JoinToken,
|
||||
NodeName: profile.NodeName,
|
||||
StateDir: stateDir,
|
||||
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
|
||||
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
|
||||
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
|
||||
MeshListenAddr: profile.MeshListenAddr,
|
||||
MeshListenPortMode: profile.MeshListenPortMode,
|
||||
MeshListenAutoPortStart: profile.MeshListenAutoPortStart,
|
||||
MeshListenAutoPortEnd: profile.MeshListenAutoPortEnd,
|
||||
MeshAdvertiseEndpoint: profile.MeshAdvertiseEndpoint,
|
||||
MeshAdvertiseEndpointsJSON: string(profile.MeshAdvertiseEndpointsJSON),
|
||||
MeshAdvertiseTransport: profile.MeshAdvertiseTransport,
|
||||
MeshConnectivityMode: profile.MeshConnectivityMode,
|
||||
MeshNATType: profile.MeshNATType,
|
||||
MeshRegion: profile.MeshRegion,
|
||||
HeartbeatIntervalSeconds: profile.HeartbeatIntervalSeconds,
|
||||
EnrollmentPollIntervalSeconds: profile.EnrollmentPollIntervalSeconds,
|
||||
EnrollmentPollTimeoutSeconds: profile.EnrollmentPollTimeoutSeconds,
|
||||
ProductionObservationSinkCap: profile.ProductionObservationSinkCapacity,
|
||||
},
|
||||
InstallDir: installDir,
|
||||
StateDir: stateDir,
|
||||
ConfigDir: filepath.Join(DefaultLinuxConfigRoot, safeUnitSlug(profile.NodeName)),
|
||||
StartupMode: firstNonEmpty(profile.StartupMode, "systemd"),
|
||||
ArtifactURLs: linuxArtifactURLs(profile),
|
||||
ArtifactSHA256: linuxArtifactSHA256(profile),
|
||||
ArtifactSizeBytes: linuxArtifactSizeBytes(profile),
|
||||
Replace: true,
|
||||
AutoUpdateEnabled: true,
|
||||
}
|
||||
}
|
||||
|
||||
func linuxArtifactURLs(profile LinuxInstallProfile) []string {
|
||||
if profile.NodeAgentArtifact != nil && len(profile.NodeAgentArtifact.URLs) > 0 {
|
||||
return append([]string(nil), profile.NodeAgentArtifact.URLs...)
|
||||
}
|
||||
if profile.NodeAgentArtifact == nil || strings.TrimSpace(profile.NodeAgentArtifact.FileName) == "" {
|
||||
return nil
|
||||
}
|
||||
out := []string{}
|
||||
fileName := strings.TrimLeft(strings.TrimSpace(profile.NodeAgentArtifact.FileName), "/")
|
||||
for _, endpoint := range profile.ArtifactEndpoints {
|
||||
if trimmed := strings.TrimRight(strings.TrimSpace(endpoint), "/"); trimmed != "" {
|
||||
out = append(out, trimmed+"/"+fileName)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func linuxArtifactSHA256(profile LinuxInstallProfile) string {
|
||||
if profile.NodeAgentArtifact == nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(profile.NodeAgentArtifact.SHA256)
|
||||
}
|
||||
|
||||
func linuxArtifactSizeBytes(profile LinuxInstallProfile) int64 {
|
||||
if profile.NodeAgentArtifact == nil {
|
||||
return 0
|
||||
}
|
||||
return profile.NodeAgentArtifact.SizeBytes
|
||||
}
|
||||
|
||||
func (m LinuxManager) Install(ctx context.Context, cfg LinuxInstallConfig) (LinuxInstallResult, error) {
|
||||
cfg.NodeID = strings.TrimSpace(cfg.NodeID)
|
||||
cfg.RuntimeConfig.Replace = cfg.Replace
|
||||
cfg.RuntimeConfig.StateDir = firstNonEmpty(cfg.StateDir, cfg.RuntimeConfig.StateDir)
|
||||
cfg.RuntimeConfig = cfg.RuntimeConfig.Normalize()
|
||||
if err := cfg.RuntimeConfig.ValidateInstall(); err != nil {
|
||||
return LinuxInstallResult{}, err
|
||||
}
|
||||
slug := safeUnitSlug(cfg.RuntimeConfig.NodeName)
|
||||
cfg.InstallDir = firstNonEmpty(cfg.InstallDir, filepath.Join(DefaultLinuxInstallRoot, slug))
|
||||
cfg.StateDir = firstNonEmpty(cfg.RuntimeConfig.StateDir, filepath.Join(DefaultLinuxStateRoot, slug))
|
||||
cfg.ConfigDir = firstNonEmpty(cfg.ConfigDir, filepath.Join(DefaultLinuxConfigRoot, slug))
|
||||
cfg.UnitDir = firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir)
|
||||
cfg.StartupMode = strings.ToLower(firstNonEmpty(cfg.StartupMode, "systemd"))
|
||||
unitName := "rap-node-agent-" + slug + ".service"
|
||||
result := LinuxInstallResult{
|
||||
NodeName: cfg.RuntimeConfig.NodeName,
|
||||
InstallDir: cfg.InstallDir,
|
||||
StateDir: cfg.StateDir,
|
||||
ConfigDir: cfg.ConfigDir,
|
||||
NodeAgentPath: filepath.Join(cfg.InstallDir, "rap-node-agent"),
|
||||
HostAgentPath: filepath.Join(cfg.InstallDir, "rap-host-agent"),
|
||||
EnvPath: filepath.Join(cfg.ConfigDir, "rap-node-agent.env"),
|
||||
UnitName: unitName,
|
||||
UnitPath: filepath.Join(cfg.UnitDir, unitName),
|
||||
}
|
||||
if cfg.DryRun {
|
||||
return result, nil
|
||||
}
|
||||
if runtime.GOOS != "linux" {
|
||||
return result, fmt.Errorf("linux install is only supported on linux hosts")
|
||||
}
|
||||
if err := os.MkdirAll(cfg.InstallDir, 0o755); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if err := os.MkdirAll(cfg.StateDir, 0o700); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if err := os.MkdirAll(cfg.ConfigDir, 0o755); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if len(cfg.ArtifactURLs) > 0 && (cfg.Replace || !fileExists(result.NodeAgentPath)) {
|
||||
m.stopService(ctx, result.UnitName)
|
||||
path, err := downloadFirstArtifact(ctx, cfg.ArtifactURLs, cfg.ArtifactSHA256, cfg.ArtifactSizeBytes)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
defer os.Remove(path)
|
||||
if err := copyFile(path, result.NodeAgentPath, 0o755); err != nil {
|
||||
m.stopService(ctx, result.UnitName)
|
||||
if retryErr := copyFile(path, result.NodeAgentPath, 0o755); retryErr != nil {
|
||||
return result, err
|
||||
}
|
||||
}
|
||||
result.Downloaded = true
|
||||
}
|
||||
if !fileExists(result.NodeAgentPath) {
|
||||
return result, fmt.Errorf("node-agent binary is missing at %s and no artifact was available", result.NodeAgentPath)
|
||||
}
|
||||
if err := os.WriteFile(result.EnvPath, []byte(linuxEnvFile(cfg.RuntimeConfig, cfg.StateDir)), 0o600); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if cfg.StartupMode != "none" {
|
||||
if err := os.MkdirAll(cfg.UnitDir, 0o755); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if err := os.WriteFile(result.UnitPath, []byte(linuxNodeAgentUnit(result)), 0o644); err != nil {
|
||||
return result, err
|
||||
}
|
||||
runner := m.runner()
|
||||
if _, err := runner.Run(ctx, "systemctl", "daemon-reload"); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if _, err := runner.Run(ctx, "systemctl", "enable", "--now", result.UnitName); err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.Started = true
|
||||
}
|
||||
return installLinuxHostAgentUpdater(ctx, m, result, cfg)
|
||||
}
|
||||
|
||||
func (m LinuxManager) stopService(ctx context.Context, unitName string) {
|
||||
if strings.TrimSpace(unitName) == "" {
|
||||
return
|
||||
}
|
||||
_, _ = m.runner().Run(ctx, "systemctl", "stop", unitName)
|
||||
}
|
||||
|
||||
func (m LinuxManager) runner() CommandRunner {
|
||||
if m.Runner != nil {
|
||||
return m.Runner
|
||||
}
|
||||
return ExecRunner{}
|
||||
}
|
||||
|
||||
func linuxEnvFile(cfg RuntimeConfig, stateDir string) string {
|
||||
lines := []string{}
|
||||
for _, env := range NodeAgentEnvWithStateDir(cfg, stateDir) {
|
||||
key, value, ok := strings.Cut(env, "=")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
lines = append(lines, key+"="+systemdQuote(value))
|
||||
}
|
||||
return strings.Join(lines, "\n") + "\n"
|
||||
}
|
||||
|
||||
func linuxNodeAgentUnit(result LinuxInstallResult) string {
|
||||
return fmt.Sprintf(`[Unit]
|
||||
Description=RAP node-agent %s
|
||||
After=network-online.target
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
EnvironmentFile=%s
|
||||
ExecStart=%s
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
`, result.NodeName, systemdQuote(result.EnvPath), systemdQuote(result.NodeAgentPath))
|
||||
}
|
||||
|
||||
func installLinuxHostAgentUpdater(ctx context.Context, m LinuxManager, result LinuxInstallResult, cfg LinuxInstallConfig) (LinuxInstallResult, error) {
|
||||
if !cfg.AutoUpdateEnabled || strings.EqualFold(cfg.StartupMode, "none") {
|
||||
return result, nil
|
||||
}
|
||||
if cfg.AutoUpdateCurrentVersion == "" || (cfg.Replace && !result.Downloaded) {
|
||||
cfg.AutoUpdateCurrentVersion = "0.0.0"
|
||||
}
|
||||
if err := installHostAgentBinary(cfg.HostAgentSourcePath, result.HostAgentPath); err != nil {
|
||||
return result, err
|
||||
}
|
||||
interval := cfg.AutoUpdateIntervalSeconds
|
||||
if interval == 0 {
|
||||
interval = 21600
|
||||
}
|
||||
initialDelay := cfg.AutoUpdateInitialDelaySeconds
|
||||
if initialDelay == 0 {
|
||||
initialDelay = 15
|
||||
}
|
||||
healthTimeout := cfg.AutoUpdateHealthTimeoutSeconds
|
||||
if healthTimeout == 0 {
|
||||
healthTimeout = 30
|
||||
}
|
||||
args := []string{
|
||||
result.HostAgentPath,
|
||||
"update-loop",
|
||||
"--backend-url", cfg.RuntimeConfig.BackendURL,
|
||||
"--cluster-id", cfg.RuntimeConfig.ClusterID,
|
||||
"--state-dir", result.StateDir,
|
||||
"--current-version", cfg.AutoUpdateCurrentVersion,
|
||||
"--os", "linux",
|
||||
"--arch", runtime.GOARCH,
|
||||
"--install-type", BinaryUpdateInstallType,
|
||||
"--binary-path", result.NodeAgentPath,
|
||||
"--systemd-unit", result.UnitName,
|
||||
"--health-timeout-seconds", fmt.Sprintf("%d", healthTimeout),
|
||||
"--interval-seconds", fmt.Sprintf("%d", interval),
|
||||
"--initial-delay-seconds", fmt.Sprintf("%d", initialDelay),
|
||||
"--host-agent-update-status-enabled",
|
||||
"--host-agent-current-version", firstNonEmpty(cfg.AutoUpdateCurrentVersion, "0.0.0"),
|
||||
"--host-agent-binary-path", result.HostAgentPath,
|
||||
}
|
||||
if strings.TrimSpace(cfg.NodeID) != "" {
|
||||
args = append(args, "--node-id", strings.TrimSpace(cfg.NodeID))
|
||||
}
|
||||
if strings.TrimSpace(cfg.AutoUpdateChannel) != "" {
|
||||
args = append(args, "--channel", strings.TrimSpace(cfg.AutoUpdateChannel))
|
||||
}
|
||||
unitName := "rap-host-agent-updater-" + safeUnitSlug(result.NodeName) + ".service"
|
||||
unitPath := filepath.Join(firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir), unitName)
|
||||
unit := fmt.Sprintf(`[Unit]
|
||||
Description=RAP host-agent updater for %s
|
||||
After=network-online.target %s
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=%s
|
||||
Restart=always
|
||||
RestartSec=30
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
`, result.NodeName, result.UnitName, systemdJoin(args))
|
||||
if err := os.WriteFile(unitPath, []byte(unit), 0o644); err != nil {
|
||||
return result, err
|
||||
}
|
||||
runner := m.runner()
|
||||
if _, err := runner.Run(ctx, "systemctl", "daemon-reload"); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if _, err := runner.Run(ctx, "systemctl", "enable", "--now", unitName); err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.UpdaterUnitName = unitName
|
||||
result.UpdaterStarted = true
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (m LinuxManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (UpdateResult, error) {
|
||||
req.InstallType = firstNonEmpty(req.InstallType, BinaryUpdateInstallType)
|
||||
req.OS = firstNonEmpty(req.OS, "linux")
|
||||
req.Arch = firstNonEmpty(req.Arch, runtime.GOARCH)
|
||||
req = req.Normalize()
|
||||
var err error
|
||||
req, err = resolveUpdateRequest(req)
|
||||
if err != nil {
|
||||
return UpdateResult{}, err
|
||||
}
|
||||
plan, err := FetchNodeUpdatePlan(ctx, req)
|
||||
if err != nil {
|
||||
return UpdateResult{}, err
|
||||
}
|
||||
result := UpdateResult{Action: plan.Action, Reason: plan.Reason, TargetVersion: plan.TargetVersion, ContainerName: req.SystemdUnitName, NewImage: req.BinaryPath}
|
||||
if plan.Action != "update" {
|
||||
if !req.DryRun {
|
||||
status := statusFromNoopPlan(req, plan)
|
||||
if status.Payload == nil {
|
||||
status.Payload = map[string]any{}
|
||||
}
|
||||
status.Payload["systemd_unit"] = req.SystemdUnitName
|
||||
status.Payload["binary_path"] = req.BinaryPath
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, status)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
if plan.ProductionForwarding && !req.AllowProductionMesh {
|
||||
err := errors.New("refusing update plan with production forwarding enabled")
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if plan.Artifact == nil {
|
||||
err := errors.New("update plan has no artifact")
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != BinaryUpdateInstallType {
|
||||
err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if req.DryRun {
|
||||
return result, nil
|
||||
}
|
||||
urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL)
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "download", Status: "started", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": req.BinaryPath}})
|
||||
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
|
||||
if err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "download", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
defer os.Remove(path)
|
||||
runner := m.runner()
|
||||
_, _ = runner.Run(ctx, "systemctl", "stop", req.SystemdUnitName)
|
||||
if err := copyFile(path, req.BinaryPath, 0o755); err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
result.Replaced = true
|
||||
if _, err := runner.Run(ctx, "systemctl", "restart", req.SystemdUnitName); err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "restart", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "health_check", Status: "succeeded", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"systemd_unit": req.SystemdUnitName, "binary_path": req.BinaryPath}})
|
||||
_ = saveUpdateState(req.StateDir, UpdateState{Product: req.Product, CurrentVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion, Image: req.BinaryPath, UpdatedAt: time.Now().UTC()})
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (m LinuxManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig) error {
|
||||
req := cfg.Request
|
||||
req.InstallType = firstNonEmpty(req.InstallType, BinaryUpdateInstallType)
|
||||
req.OS = firstNonEmpty(req.OS, "linux")
|
||||
req.Arch = firstNonEmpty(req.Arch, runtime.GOARCH)
|
||||
cfg.Request = req
|
||||
return runLinuxUpdateLoop(ctx, m, cfg)
|
||||
}
|
||||
|
||||
func runLinuxUpdateLoop(ctx context.Context, m LinuxManager, cfg UpdateLoopConfig) error {
|
||||
if cfg.Interval == 0 {
|
||||
cfg.Interval = time.Hour
|
||||
}
|
||||
logf := cfg.Logf
|
||||
if logf == nil {
|
||||
logf = func(string, ...any) {}
|
||||
}
|
||||
if cfg.InitialDelay > 0 {
|
||||
if err := sleepContext(ctx, jitteredDuration(cfg.InitialDelay, cfg.Jitter)); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
runs := 0
|
||||
lastTriggerGeneration := currentUpdateTriggerGeneration(cfg.Request.StateDir)
|
||||
for {
|
||||
runs++
|
||||
result, err := m.ApplyUpdate(ctx, cfg.Request)
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrNodeIdentityNotReady) {
|
||||
logf("linux_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, cfg.Request.StateDir)
|
||||
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
|
||||
return nil
|
||||
}
|
||||
if err := sleepUntilUpdateIntervalOrTrigger(ctx, cfg.Request.StateDir, jitteredDuration(cfg.Interval, cfg.Jitter), &lastTriggerGeneration); err != nil {
|
||||
return err
|
||||
}
|
||||
continue
|
||||
} else {
|
||||
logf("linux_update_loop run=%d status=failed error=%v", runs, err)
|
||||
if cfg.StopOnError {
|
||||
return err
|
||||
}
|
||||
}
|
||||
} else {
|
||||
logf("linux_update_loop run=%d action=%s reason=%s target=%s unit=%s replaced=%t", runs, result.Action, result.Reason, result.TargetVersion, result.ContainerName, result.Replaced)
|
||||
if result.Action == "update" && result.TargetVersion != "" {
|
||||
cfg.Request.CurrentVersion = result.TargetVersion
|
||||
}
|
||||
}
|
||||
if cfg.HostAgentUpdateEnabled {
|
||||
hostReq := cfg.HostAgentUpdateRequest
|
||||
hostReq.BackendURL = firstNonEmpty(hostReq.BackendURL, cfg.Request.BackendURL)
|
||||
hostReq.ClusterID = firstNonEmpty(hostReq.ClusterID, cfg.Request.ClusterID)
|
||||
hostReq.NodeID = firstNonEmpty(hostReq.NodeID, cfg.Request.NodeID)
|
||||
hostReq.StateDir = firstNonEmpty(hostReq.StateDir, cfg.Request.StateDir)
|
||||
hostReq.Channel = firstNonEmpty(hostReq.Channel, cfg.Request.Channel)
|
||||
hostReq.OS = firstNonEmpty(hostReq.OS, "linux")
|
||||
hostReq.Arch = firstNonEmpty(hostReq.Arch, runtime.GOARCH)
|
||||
hostReq.InstallType = firstNonEmpty(hostReq.InstallType, BinaryUpdateInstallType)
|
||||
hostResult, hostErr := (DockerManager{}).ApplyHostAgentUpdate(ctx, hostReq)
|
||||
if hostErr != nil {
|
||||
logf("linux_host_agent_update_loop run=%d status=failed error=%v", runs, hostErr)
|
||||
} else {
|
||||
logf("linux_host_agent_update_loop run=%d action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t", runs, hostResult.Action, hostResult.Reason, hostResult.TargetVersion, hostResult.NewImage, hostResult.Replaced, hostResult.RestartNeeded)
|
||||
if hostResult.Action == "update" && hostResult.TargetVersion != "" && !hostResult.RolledBack {
|
||||
cfg.HostAgentUpdateRequest.CurrentVersion = hostResult.TargetVersion
|
||||
}
|
||||
}
|
||||
}
|
||||
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
|
||||
return nil
|
||||
}
|
||||
if err := sleepUntilUpdateIntervalOrTrigger(ctx, cfg.Request.StateDir, jitteredDuration(cfg.Interval, cfg.Jitter), &lastTriggerGeneration); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,333 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type DockerInstallProfile struct {
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
BackendURL string `json:"backend_url"`
|
||||
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
|
||||
ArtifactEndpoints []string `json:"artifact_endpoints"`
|
||||
DockerImageArtifact *DockerArtifact `json:"docker_image_artifact"`
|
||||
JoinToken string `json:"join_token"`
|
||||
NodeName string `json:"node_name"`
|
||||
Image string `json:"image"`
|
||||
ContainerName string `json:"container_name"`
|
||||
StateDir string `json:"state_dir"`
|
||||
Network string `json:"network"`
|
||||
RestartPolicy string `json:"restart_policy"`
|
||||
PullImage bool `json:"pull_image"`
|
||||
Replace bool `json:"replace"`
|
||||
DockerVPNGatewayEnabled bool `json:"docker_vpn_gateway_enabled"`
|
||||
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
|
||||
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
|
||||
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
|
||||
MeshListenAddr string `json:"mesh_listen_addr"`
|
||||
MeshListenPortMode string `json:"mesh_listen_port_mode"`
|
||||
MeshListenAutoPortStart int `json:"mesh_listen_auto_port_start"`
|
||||
MeshListenAutoPortEnd int `json:"mesh_listen_auto_port_end"`
|
||||
MeshAdvertiseEndpoint string `json:"mesh_advertise_endpoint"`
|
||||
MeshAdvertiseEndpointsJSON json.RawMessage `json:"mesh_advertise_endpoints_json"`
|
||||
MeshAdvertiseTransport string `json:"mesh_advertise_transport"`
|
||||
MeshConnectivityMode string `json:"mesh_connectivity_mode"`
|
||||
MeshNATType string `json:"mesh_nat_type"`
|
||||
MeshRegion string `json:"mesh_region"`
|
||||
HeartbeatIntervalSeconds int `json:"heartbeat_interval_seconds"`
|
||||
EnrollmentPollIntervalSeconds int `json:"enrollment_poll_interval_seconds"`
|
||||
EnrollmentPollTimeoutSeconds int `json:"enrollment_poll_timeout_seconds"`
|
||||
ProductionObservationSinkCapacity int `json:"production_observation_sink_capacity"`
|
||||
Roles []string `json:"roles"`
|
||||
}
|
||||
|
||||
// DockerArtifact is the JSON description of a downloadable artifact carried
// inside an install profile. Despite the name it is also the type used for the
// node-agent artifact of the Windows and Linux profiles.
type DockerArtifact struct {
	// Descriptive metadata, decoded verbatim from the profile.
	Kind      string `json:"kind"`
	Image     string `json:"image"`
	MediaType string `json:"media_type"`

	// FileName is joined onto each artifact endpoint when URLs is empty.
	FileName string `json:"file_name"`
	// URLs, when non-empty, are used as the download locations as-is.
	URLs []string `json:"urls"`

	// SHA256 and SizeBytes are the digest and size reported for the artifact.
	SHA256    string `json:"sha256"`
	SizeBytes int64  `json:"size_bytes"`
}
|
||||
|
||||
type WindowsInstallProfile struct {
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
BackendURL string `json:"backend_url"`
|
||||
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
|
||||
ArtifactEndpoints []string `json:"artifact_endpoints"`
|
||||
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact"`
|
||||
JoinToken string `json:"join_token"`
|
||||
NodeName string `json:"node_name"`
|
||||
StateDir string `json:"state_dir"`
|
||||
InstallDir string `json:"install_dir"`
|
||||
StartupMode string `json:"startup_mode"`
|
||||
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
|
||||
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
|
||||
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
|
||||
MeshListenAddr string `json:"mesh_listen_addr"`
|
||||
MeshListenPortMode string `json:"mesh_listen_port_mode"`
|
||||
MeshListenAutoPortStart int `json:"mesh_listen_auto_port_start"`
|
||||
MeshListenAutoPortEnd int `json:"mesh_listen_auto_port_end"`
|
||||
MeshAdvertiseEndpoint string `json:"mesh_advertise_endpoint"`
|
||||
MeshAdvertiseEndpointsJSON json.RawMessage `json:"mesh_advertise_endpoints_json"`
|
||||
MeshAdvertiseTransport string `json:"mesh_advertise_transport"`
|
||||
MeshConnectivityMode string `json:"mesh_connectivity_mode"`
|
||||
MeshNATType string `json:"mesh_nat_type"`
|
||||
MeshRegion string `json:"mesh_region"`
|
||||
HeartbeatIntervalSeconds int `json:"heartbeat_interval_seconds"`
|
||||
EnrollmentPollIntervalSeconds int `json:"enrollment_poll_interval_seconds"`
|
||||
EnrollmentPollTimeoutSeconds int `json:"enrollment_poll_timeout_seconds"`
|
||||
ProductionObservationSinkCapacity int `json:"production_observation_sink_capacity"`
|
||||
Roles []string `json:"roles"`
|
||||
}
|
||||
|
||||
type LinuxInstallProfile struct {
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
BackendURL string `json:"backend_url"`
|
||||
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
|
||||
ArtifactEndpoints []string `json:"artifact_endpoints"`
|
||||
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact"`
|
||||
JoinToken string `json:"join_token"`
|
||||
NodeName string `json:"node_name"`
|
||||
StateDir string `json:"state_dir"`
|
||||
InstallDir string `json:"install_dir"`
|
||||
StartupMode string `json:"startup_mode"`
|
||||
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
|
||||
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
|
||||
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
|
||||
MeshListenAddr string `json:"mesh_listen_addr"`
|
||||
MeshListenPortMode string `json:"mesh_listen_port_mode"`
|
||||
MeshListenAutoPortStart int `json:"mesh_listen_auto_port_start"`
|
||||
MeshListenAutoPortEnd int `json:"mesh_listen_auto_port_end"`
|
||||
MeshAdvertiseEndpoint string `json:"mesh_advertise_endpoint"`
|
||||
MeshAdvertiseEndpointsJSON json.RawMessage `json:"mesh_advertise_endpoints_json"`
|
||||
MeshAdvertiseTransport string `json:"mesh_advertise_transport"`
|
||||
MeshConnectivityMode string `json:"mesh_connectivity_mode"`
|
||||
MeshNATType string `json:"mesh_nat_type"`
|
||||
MeshRegion string `json:"mesh_region"`
|
||||
HeartbeatIntervalSeconds int `json:"heartbeat_interval_seconds"`
|
||||
EnrollmentPollIntervalSeconds int `json:"enrollment_poll_interval_seconds"`
|
||||
EnrollmentPollTimeoutSeconds int `json:"enrollment_poll_timeout_seconds"`
|
||||
ProductionObservationSinkCapacity int `json:"production_observation_sink_capacity"`
|
||||
Roles []string `json:"roles"`
|
||||
}
|
||||
|
||||
// ProfileRequest carries the inputs shared by the Fetch*InstallProfile
// functions.
type ProfileRequest struct {
	// URL is either the backend base URL or the full profile endpoint; the
	// fetchers append their endpoint path when it is not already the suffix.
	// It must not be blank.
	URL string

	// ClusterID and NodeName are sent, whitespace-trimmed, in the request body.
	ClusterID string
	NodeName  string

	// InstallToken is sent in the request body and must not be blank.
	InstallToken string

	// HTTPClient overrides the client used for the request. When nil, a
	// client with a 20 second timeout is created for the call.
	HTTPClient *http.Client
}
|
||||
|
||||
func FetchDockerInstallProfile(ctx context.Context, req ProfileRequest) (DockerInstallProfile, error) {
|
||||
url := strings.TrimRight(strings.TrimSpace(req.URL), "/")
|
||||
if url == "" || strings.TrimSpace(req.InstallToken) == "" {
|
||||
return DockerInstallProfile{}, fmt.Errorf("profile-url and install-token are required")
|
||||
}
|
||||
if !strings.HasSuffix(url, "/node-agents/docker-install-profile") {
|
||||
url += "/node-agents/docker-install-profile"
|
||||
}
|
||||
body, err := json.Marshal(map[string]string{
|
||||
"cluster_id": strings.TrimSpace(req.ClusterID),
|
||||
"install_token": strings.TrimSpace(req.InstallToken),
|
||||
"node_name": strings.TrimSpace(req.NodeName),
|
||||
})
|
||||
if err != nil {
|
||||
return DockerInstallProfile{}, err
|
||||
}
|
||||
httpClient := req.HTTPClient
|
||||
if httpClient == nil {
|
||||
httpClient = &http.Client{Timeout: 20 * time.Second}
|
||||
}
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return DockerInstallProfile{}, err
|
||||
}
|
||||
httpReq.Header.Set("Content-Type", "application/json")
|
||||
resp, err := httpClient.Do(httpReq)
|
||||
if err != nil {
|
||||
return DockerInstallProfile{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return DockerInstallProfile{}, fmt.Errorf("fetch docker install profile: %s", resp.Status)
|
||||
}
|
||||
var envelope struct {
|
||||
Profile DockerInstallProfile `json:"docker_install_profile"`
|
||||
}
|
||||
if err := json.NewDecoder(resp.Body).Decode(&envelope); err != nil {
|
||||
return DockerInstallProfile{}, err
|
||||
}
|
||||
if strings.TrimSpace(envelope.Profile.BackendURL) == "" && len(envelope.Profile.ControlPlaneEndpoints) > 0 {
|
||||
envelope.Profile.BackendURL = envelope.Profile.ControlPlaneEndpoints[0]
|
||||
}
|
||||
return envelope.Profile, nil
|
||||
}
|
||||
|
||||
func FetchWindowsInstallProfile(ctx context.Context, req ProfileRequest) (WindowsInstallProfile, error) {
|
||||
url := strings.TrimRight(strings.TrimSpace(req.URL), "/")
|
||||
if url == "" || strings.TrimSpace(req.InstallToken) == "" {
|
||||
return WindowsInstallProfile{}, fmt.Errorf("profile-url and install-token are required")
|
||||
}
|
||||
if !strings.HasSuffix(url, "/node-agents/windows-install-profile") {
|
||||
url += "/node-agents/windows-install-profile"
|
||||
}
|
||||
body, err := json.Marshal(map[string]string{
|
||||
"cluster_id": strings.TrimSpace(req.ClusterID),
|
||||
"install_token": strings.TrimSpace(req.InstallToken),
|
||||
"node_name": strings.TrimSpace(req.NodeName),
|
||||
})
|
||||
if err != nil {
|
||||
return WindowsInstallProfile{}, err
|
||||
}
|
||||
httpClient := req.HTTPClient
|
||||
if httpClient == nil {
|
||||
httpClient = &http.Client{Timeout: 20 * time.Second}
|
||||
}
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return WindowsInstallProfile{}, err
|
||||
}
|
||||
httpReq.Header.Set("Content-Type", "application/json")
|
||||
resp, err := httpClient.Do(httpReq)
|
||||
if err != nil {
|
||||
return WindowsInstallProfile{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return WindowsInstallProfile{}, fmt.Errorf("fetch windows install profile: %s", resp.Status)
|
||||
}
|
||||
var envelope struct {
|
||||
Profile WindowsInstallProfile `json:"windows_install_profile"`
|
||||
}
|
||||
if err := json.NewDecoder(resp.Body).Decode(&envelope); err != nil {
|
||||
return WindowsInstallProfile{}, err
|
||||
}
|
||||
if strings.TrimSpace(envelope.Profile.BackendURL) == "" && len(envelope.Profile.ControlPlaneEndpoints) > 0 {
|
||||
envelope.Profile.BackendURL = envelope.Profile.ControlPlaneEndpoints[0]
|
||||
}
|
||||
return envelope.Profile, nil
|
||||
}
|
||||
|
||||
func FetchLinuxInstallProfile(ctx context.Context, req ProfileRequest) (LinuxInstallProfile, error) {
|
||||
url := strings.TrimRight(strings.TrimSpace(req.URL), "/")
|
||||
if url == "" || strings.TrimSpace(req.InstallToken) == "" {
|
||||
return LinuxInstallProfile{}, fmt.Errorf("profile-url and install-token are required")
|
||||
}
|
||||
if !strings.HasSuffix(url, "/node-agents/linux-install-profile") {
|
||||
url += "/node-agents/linux-install-profile"
|
||||
}
|
||||
body, err := json.Marshal(map[string]string{
|
||||
"cluster_id": strings.TrimSpace(req.ClusterID),
|
||||
"install_token": strings.TrimSpace(req.InstallToken),
|
||||
"node_name": strings.TrimSpace(req.NodeName),
|
||||
})
|
||||
if err != nil {
|
||||
return LinuxInstallProfile{}, err
|
||||
}
|
||||
httpClient := req.HTTPClient
|
||||
if httpClient == nil {
|
||||
httpClient = &http.Client{Timeout: 20 * time.Second}
|
||||
}
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return LinuxInstallProfile{}, err
|
||||
}
|
||||
httpReq.Header.Set("Content-Type", "application/json")
|
||||
resp, err := httpClient.Do(httpReq)
|
||||
if err != nil {
|
||||
return LinuxInstallProfile{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return LinuxInstallProfile{}, fmt.Errorf("fetch linux install profile: %s", resp.Status)
|
||||
}
|
||||
var envelope struct {
|
||||
Profile LinuxInstallProfile `json:"linux_install_profile"`
|
||||
}
|
||||
if err := json.NewDecoder(resp.Body).Decode(&envelope); err != nil {
|
||||
return LinuxInstallProfile{}, err
|
||||
}
|
||||
if strings.TrimSpace(envelope.Profile.BackendURL) == "" && len(envelope.Profile.ControlPlaneEndpoints) > 0 {
|
||||
envelope.Profile.BackendURL = envelope.Profile.ControlPlaneEndpoints[0]
|
||||
}
|
||||
return envelope.Profile, nil
|
||||
}
|
||||
|
||||
func RuntimeConfigFromProfile(profile DockerInstallProfile) RuntimeConfig {
|
||||
return RuntimeConfig{
|
||||
BackendURL: profile.BackendURL,
|
||||
ClusterID: profile.ClusterID,
|
||||
JoinToken: profile.JoinToken,
|
||||
NodeName: profile.NodeName,
|
||||
Image: profile.Image,
|
||||
ContainerName: profile.ContainerName,
|
||||
StateDir: profile.StateDir,
|
||||
Network: profile.Network,
|
||||
RestartPolicy: profile.RestartPolicy,
|
||||
PullImage: profile.PullImage,
|
||||
Replace: profile.Replace,
|
||||
DockerVPNGatewayEnabled: profile.DockerVPNGatewayEnabled,
|
||||
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
|
||||
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
|
||||
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
|
||||
MeshListenAddr: profile.MeshListenAddr,
|
||||
MeshListenPortMode: profile.MeshListenPortMode,
|
||||
MeshListenAutoPortStart: profile.MeshListenAutoPortStart,
|
||||
MeshListenAutoPortEnd: profile.MeshListenAutoPortEnd,
|
||||
MeshAdvertiseEndpoint: profile.MeshAdvertiseEndpoint,
|
||||
MeshAdvertiseEndpointsJSON: string(profile.MeshAdvertiseEndpointsJSON),
|
||||
MeshAdvertiseTransport: profile.MeshAdvertiseTransport,
|
||||
MeshConnectivityMode: profile.MeshConnectivityMode,
|
||||
MeshNATType: profile.MeshNATType,
|
||||
MeshRegion: profile.MeshRegion,
|
||||
HeartbeatIntervalSeconds: profile.HeartbeatIntervalSeconds,
|
||||
EnrollmentPollIntervalSeconds: profile.EnrollmentPollIntervalSeconds,
|
||||
EnrollmentPollTimeoutSeconds: profile.EnrollmentPollTimeoutSeconds,
|
||||
ProductionObservationSinkCap: profile.ProductionObservationSinkCapacity,
|
||||
ImageArtifactURLs: dockerArtifactURLs(profile),
|
||||
ImageArtifactSHA256: dockerArtifactSHA256(profile),
|
||||
ImageArtifactSizeBytes: dockerArtifactSizeBytes(profile),
|
||||
}
|
||||
}
|
||||
|
||||
func dockerArtifactURLs(profile DockerInstallProfile) []string {
|
||||
if profile.DockerImageArtifact != nil && len(profile.DockerImageArtifact.URLs) > 0 {
|
||||
return append([]string(nil), profile.DockerImageArtifact.URLs...)
|
||||
}
|
||||
if profile.DockerImageArtifact == nil || strings.TrimSpace(profile.DockerImageArtifact.FileName) == "" {
|
||||
return nil
|
||||
}
|
||||
out := []string{}
|
||||
fileName := strings.TrimLeft(strings.TrimSpace(profile.DockerImageArtifact.FileName), "/")
|
||||
for _, endpoint := range profile.ArtifactEndpoints {
|
||||
if trimmed := strings.TrimRight(strings.TrimSpace(endpoint), "/"); trimmed != "" {
|
||||
out = append(out, trimmed+"/"+fileName)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func dockerArtifactSHA256(profile DockerInstallProfile) string {
|
||||
if profile.DockerImageArtifact == nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(profile.DockerImageArtifact.SHA256)
|
||||
}
|
||||
|
||||
func dockerArtifactSizeBytes(profile DockerInstallProfile) int64 {
|
||||
if profile.DockerImageArtifact == nil {
|
||||
return 0
|
||||
}
|
||||
return profile.DockerImageArtifact.SizeBytes
|
||||
}
|
||||
@@ -0,0 +1,258 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// HostAgentUpdateRequest describes one host-agent binary update attempt made
// by ApplyHostAgentUpdate.
type HostAgentUpdateRequest struct {
	// Where to ask for the plan and which node is asking.
	BackendURL string
	ClusterID  string
	NodeID     string
	StateDir   string

	// What is currently installed and which build is wanted.
	CurrentVersion string
	Channel        string
	// OS defaults to "linux" when empty.
	OS   string
	Arch string
	// InstallType defaults to BinaryUpdateInstallType when empty.
	InstallType string

	// BinaryPath is the host-agent binary to replace; it defaults to
	// DefaultHostAgentInstallPath when empty.
	BinaryPath string

	// DryRun stops after the plan has been fetched and checked: nothing is
	// downloaded or installed.
	DryRun bool

	// RestartService is restarted with "systemctl restart" after a successful
	// replacement, but only when RestartAfterApply is true and the name is
	// not blank.
	RestartService    string
	RestartAfterApply bool
}
|
||||
|
||||
type HostAgentUpdateLoopConfig struct {
|
||||
Request HostAgentUpdateRequest
|
||||
Interval time.Duration
|
||||
InitialDelay time.Duration
|
||||
Jitter float64
|
||||
MaxRuns int
|
||||
StopOnError bool
|
||||
Logf func(format string, args ...any)
|
||||
}
|
||||
|
||||
func (req HostAgentUpdateRequest) updateRequest() UpdateRequest {
|
||||
return UpdateRequest{
|
||||
BackendURL: req.BackendURL,
|
||||
ClusterID: req.ClusterID,
|
||||
NodeID: req.NodeID,
|
||||
StateDir: req.StateDir,
|
||||
Product: HostAgentUpdateProduct,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
OS: firstNonEmpty(req.OS, "linux"),
|
||||
Arch: req.Arch,
|
||||
InstallType: firstNonEmpty(req.InstallType, BinaryUpdateInstallType),
|
||||
Channel: req.Channel,
|
||||
ContainerName: "host-agent-service",
|
||||
DryRun: req.DryRun,
|
||||
}
|
||||
}
|
||||
|
||||
func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUpdateRequest) (UpdateResult, error) {
|
||||
binaryPath := firstNonEmpty(req.BinaryPath, DefaultHostAgentInstallPath)
|
||||
planReq := req.updateRequest()
|
||||
planReq.BinaryDefaults()
|
||||
resolved, err := resolveUpdateRequest(planReq)
|
||||
if err != nil {
|
||||
return UpdateResult{}, err
|
||||
}
|
||||
plan, err := FetchNodeUpdatePlan(ctx, resolved)
|
||||
if err != nil {
|
||||
return UpdateResult{}, err
|
||||
}
|
||||
result := UpdateResult{
|
||||
Action: plan.Action,
|
||||
Reason: plan.Reason,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
ContainerName: "host-agent-service",
|
||||
NewImage: binaryPath,
|
||||
}
|
||||
if plan.Action != "update" {
|
||||
if !req.DryRun {
|
||||
status := statusFromNoopPlan(resolved, plan)
|
||||
status.Product = HostAgentUpdateProduct
|
||||
if status.Payload == nil {
|
||||
status.Payload = map[string]any{}
|
||||
}
|
||||
status.Payload["binary_path"] = binaryPath
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, status)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
if plan.Artifact == nil {
|
||||
err := errors.New("host-agent update plan has no artifact")
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if !isBinaryInstallType(plan.Artifact.InstallType) {
|
||||
err := fmt.Errorf("unsupported host-agent artifact install type %q", plan.Artifact.InstallType)
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if req.DryRun {
|
||||
return result, nil
|
||||
}
|
||||
urls := artifactURLsForBackend(*plan.Artifact, resolved.BackendURL)
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{
|
||||
Product: HostAgentUpdateProduct,
|
||||
CurrentVersion: resolved.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: "download",
|
||||
Status: "started",
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ObservedAt: time.Now().UTC(),
|
||||
Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": binaryPath},
|
||||
})
|
||||
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
|
||||
if err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "download", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
defer os.Remove(path)
|
||||
if err := installHostAgentBinary(path, binaryPath); err != nil {
|
||||
stageErr := stageHostAgentBinary(path, binaryPath)
|
||||
if stageErr == nil {
|
||||
result.RestartNeeded = true
|
||||
_ = saveUpdateState(resolved.StateDir, UpdateState{
|
||||
Product: HostAgentUpdateProduct,
|
||||
CurrentVersion: plan.TargetVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
ContainerName: "host-agent-service",
|
||||
Image: binaryPath,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
})
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{
|
||||
Product: HostAgentUpdateProduct,
|
||||
CurrentVersion: resolved.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: "apply",
|
||||
Status: "staged",
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ObservedAt: time.Now().UTC(),
|
||||
Payload: map[string]any{"binary_path": binaryPath, "staged_path": binaryPath + ".next", "restart_needed": true, "replace_error": err.Error()},
|
||||
})
|
||||
return result, nil
|
||||
}
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, statusFromError(resolved, plan, "apply", "failed", fmt.Errorf("%w; stage failed: %v", err, stageErr)))
|
||||
return result, err
|
||||
}
|
||||
result.Loaded = true
|
||||
result.Replaced = true
|
||||
result.RestartNeeded = true
|
||||
_ = saveUpdateState(resolved.StateDir, UpdateState{
|
||||
Product: HostAgentUpdateProduct,
|
||||
CurrentVersion: plan.TargetVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
ContainerName: "host-agent-service",
|
||||
Image: binaryPath,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
})
|
||||
_ = ReportNodeUpdateStatus(ctx, resolved.BackendURL, resolved.ClusterID, resolved.NodeID, NodeUpdateStatusRequest{
|
||||
Product: HostAgentUpdateProduct,
|
||||
CurrentVersion: resolved.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: "apply",
|
||||
Status: "succeeded",
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ObservedAt: time.Now().UTC(),
|
||||
Payload: map[string]any{"binary_path": binaryPath, "restart_needed": true},
|
||||
})
|
||||
if req.RestartAfterApply && strings.TrimSpace(req.RestartService) != "" {
|
||||
runner := m.Runner
|
||||
if runner == nil {
|
||||
runner = ExecRunner{}
|
||||
}
|
||||
_, err = runner.Run(ctx, "systemctl", "restart", req.RestartService)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.RestartNeeded = false
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (m DockerManager) RunHostAgentUpdateLoop(ctx context.Context, cfg HostAgentUpdateLoopConfig) error {
|
||||
if cfg.Interval == 0 {
|
||||
cfg.Interval = time.Hour
|
||||
}
|
||||
if cfg.InitialDelay < 0 || cfg.Interval < 0 {
|
||||
return errors.New("host-agent update loop durations must not be negative")
|
||||
}
|
||||
if cfg.Jitter < 0 || cfg.Jitter > 1 {
|
||||
return errors.New("host-agent update loop jitter must be between 0 and 1")
|
||||
}
|
||||
logf := cfg.Logf
|
||||
if logf == nil {
|
||||
logf = func(string, ...any) {}
|
||||
}
|
||||
if cfg.InitialDelay > 0 {
|
||||
if err := sleepContext(ctx, jitteredDuration(cfg.InitialDelay, cfg.Jitter)); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
runs := 0
|
||||
req := cfg.Request
|
||||
for {
|
||||
runs++
|
||||
result, err := m.ApplyHostAgentUpdate(ctx, req)
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrNodeIdentityNotReady) {
|
||||
logf("host_agent_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, req.StateDir)
|
||||
} else {
|
||||
logf("host_agent_update_loop run=%d status=failed error=%v", runs, err)
|
||||
if cfg.StopOnError {
|
||||
return err
|
||||
}
|
||||
}
|
||||
} else {
|
||||
logf("host_agent_update_loop run=%d action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t",
|
||||
runs,
|
||||
result.Action,
|
||||
result.Reason,
|
||||
result.TargetVersion,
|
||||
result.NewImage,
|
||||
result.Replaced,
|
||||
result.RestartNeeded,
|
||||
)
|
||||
if result.Action == "update" && result.TargetVersion != "" {
|
||||
req.CurrentVersion = result.TargetVersion
|
||||
}
|
||||
}
|
||||
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
|
||||
return nil
|
||||
}
|
||||
if err := sleepContext(ctx, jitteredDuration(cfg.Interval, cfg.Jitter)); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (req *UpdateRequest) BinaryDefaults() {
|
||||
req.Product = firstNonEmpty(req.Product, HostAgentUpdateProduct)
|
||||
req.InstallType = firstNonEmpty(req.InstallType, BinaryUpdateInstallType)
|
||||
req.OS = firstNonEmpty(req.OS, "linux")
|
||||
}
|
||||
|
||||
func isBinaryInstallType(value string) bool {
|
||||
switch strings.TrimSpace(value) {
|
||||
case "", BinaryUpdateInstallType, "windows_binary", "binary", "host_binary", "linux-amd64-binary", "windows-amd64-binary":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func hostAgentInstallTypeFor(nodeInstallType string) string {
|
||||
if strings.TrimSpace(nodeInstallType) == WindowsUpdateInstallType {
|
||||
return "windows_binary"
|
||||
}
|
||||
return BinaryUpdateInstallType
|
||||
}
|
||||
|
||||
func stageHostAgentBinary(sourcePath, binaryPath string) error {
|
||||
return copyFile(sourcePath, binaryPath+".next", 0o755)
|
||||
}
|
||||
@@ -0,0 +1,321 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const (
	// DefaultHostAgentInstallPath is where the host-agent binary is installed
	// when no explicit path is configured.
	DefaultHostAgentInstallPath = "/usr/local/bin/rap-host-agent"

	// DefaultSystemdUnitDir is the directory unit files are written to when
	// no explicit unit directory is configured.
	DefaultSystemdUnitDir = "/etc/systemd/system"
)
|
||||
|
||||
type UpdateServiceConfig struct {
|
||||
RuntimeConfig RuntimeConfig
|
||||
Product string
|
||||
CurrentVersion string
|
||||
Channel string
|
||||
IntervalSeconds int
|
||||
InitialDelaySeconds int
|
||||
Jitter float64
|
||||
HealthTimeoutSec int
|
||||
BinaryInstallPath string
|
||||
SourceBinaryPath string
|
||||
UnitDir string
|
||||
ManageSystemd bool
|
||||
DryRun bool
|
||||
InstallSelfUpdater bool
|
||||
SelfUpdateVersion string
|
||||
}
|
||||
|
||||
// UpdateServiceResult reports what InstallUpdateService rendered and did.
type UpdateServiceResult struct {
	// Installed is set once the binary and unit file(s) have been written.
	Installed bool
	// Started is set once the unit(s) have been enabled through systemctl.
	Started bool

	// Main updater unit: file name, full path and rendered contents.
	UnitName string
	UnitPath string
	Unit     string

	// BinaryPath is the install location of the host-agent binary.
	BinaryPath string

	// Self-updater unit; left empty unless the self updater was requested.
	SelfUnitName string
	SelfUnitPath string
	SelfUnit     string
}
|
||||
|
||||
func (m DockerManager) InstallUpdateService(ctx context.Context, cfg UpdateServiceConfig) (UpdateServiceResult, error) {
|
||||
cfg.RuntimeConfig = cfg.RuntimeConfig.Normalize()
|
||||
if cfg.Product == "" {
|
||||
cfg.Product = DefaultUpdateProduct
|
||||
}
|
||||
if cfg.IntervalSeconds == 0 {
|
||||
cfg.IntervalSeconds = 21600
|
||||
}
|
||||
if cfg.Jitter == 0 {
|
||||
cfg.Jitter = 0.15
|
||||
}
|
||||
if cfg.HealthTimeoutSec == 0 {
|
||||
cfg.HealthTimeoutSec = 30
|
||||
}
|
||||
cfg.BinaryInstallPath = firstNonEmpty(cfg.BinaryInstallPath, DefaultHostAgentInstallPath)
|
||||
cfg.UnitDir = firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir)
|
||||
unitName := "rap-host-agent-updater-" + safeUnitSlug(cfg.RuntimeConfig.ContainerName) + ".service"
|
||||
result := UpdateServiceResult{
|
||||
UnitName: unitName,
|
||||
UnitPath: filepath.Join(cfg.UnitDir, unitName),
|
||||
BinaryPath: cfg.BinaryInstallPath,
|
||||
}
|
||||
unit, err := buildUpdateServiceUnit(cfg)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.Unit = unit
|
||||
if cfg.DryRun {
|
||||
if cfg.InstallSelfUpdater {
|
||||
selfUnit, selfUnitName, selfUnitPath, err := buildHostAgentSelfUpdateUnit(cfg)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.SelfUnit = selfUnit
|
||||
result.SelfUnitName = selfUnitName
|
||||
result.SelfUnitPath = selfUnitPath
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
if runtime.GOOS != "linux" && cfg.UnitDir == DefaultSystemdUnitDir {
|
||||
return result, fmt.Errorf("systemd update service install is only supported on linux")
|
||||
}
|
||||
if err := installHostAgentBinary(cfg.SourceBinaryPath, cfg.BinaryInstallPath); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if err := os.MkdirAll(cfg.UnitDir, 0o755); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if err := os.WriteFile(result.UnitPath, []byte(unit), 0o644); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if cfg.InstallSelfUpdater {
|
||||
selfUnit, selfUnitName, selfUnitPath, err := buildHostAgentSelfUpdateUnit(cfg)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
if err := os.WriteFile(selfUnitPath, []byte(selfUnit), 0o644); err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.SelfUnit = selfUnit
|
||||
result.SelfUnitName = selfUnitName
|
||||
result.SelfUnitPath = selfUnitPath
|
||||
}
|
||||
result.Installed = true
|
||||
if cfg.ManageSystemd {
|
||||
runner := m.Runner
|
||||
if runner == nil {
|
||||
runner = ExecRunner{}
|
||||
}
|
||||
if _, err := runner.Run(ctx, "systemctl", "daemon-reload"); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if _, err := runner.Run(ctx, "systemctl", "enable", "--now", unitName); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if cfg.InstallSelfUpdater && result.SelfUnitName != "" {
|
||||
if _, err := runner.Run(ctx, "systemctl", "enable", "--now", result.SelfUnitName); err != nil {
|
||||
return result, err
|
||||
}
|
||||
}
|
||||
result.Started = true
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func buildUpdateServiceUnit(cfg UpdateServiceConfig) (string, error) {
|
||||
runtimeCfg := cfg.RuntimeConfig.Normalize()
|
||||
var missing []string
|
||||
if runtimeCfg.BackendURL == "" {
|
||||
missing = append(missing, "backend-url")
|
||||
}
|
||||
if runtimeCfg.ClusterID == "" {
|
||||
missing = append(missing, "cluster-id")
|
||||
}
|
||||
if runtimeCfg.ContainerName == "" {
|
||||
missing = append(missing, "container-name")
|
||||
}
|
||||
if runtimeCfg.StateDir == "" {
|
||||
missing = append(missing, "state-dir")
|
||||
}
|
||||
if len(missing) > 0 {
|
||||
return "", fmt.Errorf("missing required update service settings: %s", strings.Join(missing, ", "))
|
||||
}
|
||||
args := []string{
|
||||
cfg.BinaryInstallPath,
|
||||
"update-loop",
|
||||
"--backend-url", runtimeCfg.BackendURL,
|
||||
"--cluster-id", runtimeCfg.ClusterID,
|
||||
"--state-dir", runtimeCfg.StateDir,
|
||||
"--container-name", runtimeCfg.ContainerName,
|
||||
"--product", firstNonEmpty(cfg.Product, DefaultUpdateProduct),
|
||||
"--current-version", strings.TrimSpace(cfg.CurrentVersion),
|
||||
"--interval-seconds", fmt.Sprintf("%d", cfg.IntervalSeconds),
|
||||
"--initial-delay-seconds", fmt.Sprintf("%d", cfg.InitialDelaySeconds),
|
||||
"--jitter", fmt.Sprintf("%.3f", cfg.Jitter),
|
||||
"--health-timeout-seconds", fmt.Sprintf("%d", cfg.HealthTimeoutSec),
|
||||
}
|
||||
if strings.TrimSpace(cfg.Channel) != "" {
|
||||
args = append(args, "--channel", strings.TrimSpace(cfg.Channel))
|
||||
}
|
||||
execStart := systemdJoin(args)
|
||||
return fmt.Sprintf(`[Unit]
|
||||
Description=RAP host-agent updater for %s
|
||||
After=network-online.target docker.service
|
||||
Wants=network-online.target
|
||||
Requires=docker.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=%s
|
||||
Restart=always
|
||||
RestartSec=30
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
`, runtimeCfg.ContainerName, execStart), nil
|
||||
}
|
||||
|
||||
func buildHostAgentSelfUpdateUnit(cfg UpdateServiceConfig) (string, string, string, error) {
|
||||
runtimeCfg := cfg.RuntimeConfig.Normalize()
|
||||
if runtimeCfg.BackendURL == "" || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
|
||||
return "", "", "", fmt.Errorf("backend-url, cluster-id, and state-dir are required for host-agent self updater")
|
||||
}
|
||||
unitName := "rap-host-agent-self-updater.service"
|
||||
unitPath := filepath.Join(firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir), unitName)
|
||||
currentVersion := firstNonEmpty(cfg.SelfUpdateVersion, cfg.CurrentVersion)
|
||||
args := []string{
|
||||
cfg.BinaryInstallPath,
|
||||
"update-host-agent-loop",
|
||||
"--backend-url", runtimeCfg.BackendURL,
|
||||
"--cluster-id", runtimeCfg.ClusterID,
|
||||
"--state-dir", runtimeCfg.StateDir,
|
||||
"--binary-path", firstNonEmpty(cfg.BinaryInstallPath, DefaultHostAgentInstallPath),
|
||||
"--current-version", currentVersion,
|
||||
"--interval-seconds", fmt.Sprintf("%d", cfg.IntervalSeconds),
|
||||
"--initial-delay-seconds", fmt.Sprintf("%d", cfg.InitialDelaySeconds+30),
|
||||
"--jitter", fmt.Sprintf("%.3f", cfg.Jitter),
|
||||
}
|
||||
if strings.TrimSpace(cfg.Channel) != "" {
|
||||
args = append(args, "--channel", strings.TrimSpace(cfg.Channel))
|
||||
}
|
||||
return fmt.Sprintf(`[Unit]
|
||||
Description=RAP host-agent self updater
|
||||
After=network-online.target docker.service
|
||||
Wants=network-online.target
|
||||
Requires=docker.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=%s
|
||||
Restart=always
|
||||
RestartSec=60
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
`, systemdJoin(args)), unitName, unitPath, nil
|
||||
}
|
||||
|
||||
func installHostAgentBinary(sourcePath, targetPath string) error {
|
||||
sourcePath = strings.TrimSpace(sourcePath)
|
||||
targetPath = strings.TrimSpace(targetPath)
|
||||
if sourcePath == "" {
|
||||
var err error
|
||||
sourcePath, err = os.Executable()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if samePath(sourcePath, targetPath) {
|
||||
return os.Chmod(targetPath, 0o755)
|
||||
}
|
||||
src, err := os.Open(sourcePath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer src.Close()
|
||||
if err := os.MkdirAll(filepath.Dir(targetPath), 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
tmp := targetPath + ".tmp"
|
||||
dst, err := os.OpenFile(tmp, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o755)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := io.Copy(dst, src); err != nil {
|
||||
_ = dst.Close()
|
||||
_ = os.Remove(tmp)
|
||||
return err
|
||||
}
|
||||
if err := dst.Close(); err != nil {
|
||||
_ = os.Remove(tmp)
|
||||
return err
|
||||
}
|
||||
if err := os.Chmod(tmp, 0o755); err != nil {
|
||||
_ = os.Remove(tmp)
|
||||
return err
|
||||
}
|
||||
return os.Rename(tmp, targetPath)
|
||||
}
|
||||
|
||||
// samePath reports whether a and b name the same location once both are made
// absolute (filepath.Abs also cleans the result). If either path cannot be
// made absolute, the raw strings are compared instead. Symlinks are not
// resolved.
func samePath(a, b string) bool {
	left, err := filepath.Abs(a)
	if err != nil {
		return a == b
	}
	right, err := filepath.Abs(b)
	if err != nil {
		return a == b
	}
	return left == right
}
|
||||
|
||||
func safeUnitSlug(value string) string {
|
||||
value = strings.ToLower(strings.TrimSpace(value))
|
||||
if value == "" {
|
||||
value = DefaultContainerName
|
||||
}
|
||||
var b strings.Builder
|
||||
lastDash := false
|
||||
for _, r := range value {
|
||||
ok := (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9')
|
||||
if ok {
|
||||
b.WriteRune(r)
|
||||
lastDash = false
|
||||
continue
|
||||
}
|
||||
if !lastDash {
|
||||
b.WriteByte('-')
|
||||
lastDash = true
|
||||
}
|
||||
}
|
||||
out := strings.Trim(b.String(), "-")
|
||||
if out == "" {
|
||||
return DefaultContainerName
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func systemdJoin(args []string) string {
|
||||
out := make([]string, 0, len(args))
|
||||
for _, arg := range args {
|
||||
out = append(out, systemdQuote(arg))
|
||||
}
|
||||
return strings.Join(out, " ")
|
||||
}
|
||||
|
||||
// systemdQuote renders one argument so that it survives systemd's unit-file
// parsing as a single word.
//
//   - The empty string becomes `""` so the argument is not dropped.
//   - A literal "%" is doubled, because systemd would otherwise read it as the
//     start of a specifier (for example in percent-encoded URLs).
//   - Arguments containing whitespace, quotes or backslashes are wrapped in
//     double quotes with `\` and `"` escaped. Newlines and carriage returns
//     are written as the C-style escapes `\n` and `\r`, since a raw line break
//     would end the directive and corrupt the unit file.
//
// NOTE(review): "$" is left untouched. In ExecStart= systemd expands $VAR and
// ${VAR}; escaping it as "$$" here would be wrong for settings that do not
// perform variable expansion, so callers with "$" in arguments should confirm
// the target directive.
func systemdQuote(arg string) string {
	if arg == "" {
		return `""`
	}
	arg = strings.ReplaceAll(arg, "%", "%%")
	if !strings.ContainsAny(arg, " \t\r\n\"'\\") {
		return arg
	}
	// A single-pass replacer never re-escapes the backslashes it just wrote.
	escaped := strings.NewReplacer(
		`\`, `\\`,
		`"`, `\"`,
		"\n", `\n`,
		"\r", `\r`,
	).Replace(arg)
	return `"` + escaped + `"`
}
|
||||
@@ -0,0 +1,171 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestInstallUpdateServiceWritesSystemdUnit(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
source := filepath.Join(dir, "rap-host-agent-src")
|
||||
if err := os.WriteFile(source, []byte("binary"), 0o755); err != nil {
|
||||
t.Fatalf("write source: %v", err)
|
||||
}
|
||||
unitDir := filepath.Join(dir, "systemd")
|
||||
binaryPath := filepath.Join(dir, "bin", "rap-host-agent")
|
||||
result, err := (DockerManager{}).InstallUpdateService(context.Background(), UpdateServiceConfig{
|
||||
RuntimeConfig: RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
NodeName: "node-a",
|
||||
ContainerName: "rap-node-agent-node-a",
|
||||
StateDir: "/var/lib/rap/nodes/node-a",
|
||||
},
|
||||
CurrentVersion: "0.1.0-current",
|
||||
IntervalSeconds: 60,
|
||||
Jitter: 0.2,
|
||||
SourceBinaryPath: source,
|
||||
BinaryInstallPath: binaryPath,
|
||||
UnitDir: unitDir,
|
||||
ManageSystemd: false,
|
||||
InstallSelfUpdater: true,
|
||||
SelfUpdateVersion: "0.1.0-host",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("install update service: %v", err)
|
||||
}
|
||||
if !result.Installed || result.Started {
|
||||
t.Fatalf("unexpected result: %+v", result)
|
||||
}
|
||||
unit, err := os.ReadFile(result.UnitPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read unit: %v", err)
|
||||
}
|
||||
text := string(unit)
|
||||
for _, want := range []string{
|
||||
"ExecStart=",
|
||||
" update-loop",
|
||||
"--backend-url http://control/api/v1",
|
||||
"--cluster-id cluster-1",
|
||||
"--state-dir /var/lib/rap/nodes/node-a",
|
||||
"--container-name rap-node-agent-node-a",
|
||||
"--current-version 0.1.0-current",
|
||||
"--interval-seconds 60",
|
||||
"Restart=always",
|
||||
} {
|
||||
if !strings.Contains(text, want) {
|
||||
t.Fatalf("unit missing %q:\n%s", want, text)
|
||||
}
|
||||
}
|
||||
if payload, err := os.ReadFile(binaryPath); err != nil || string(payload) != "binary" {
|
||||
t.Fatalf("binary copy = %q, %v", payload, err)
|
||||
}
|
||||
if result.SelfUnitName != "rap-host-agent-self-updater.service" || result.SelfUnitPath == "" {
|
||||
t.Fatalf("self updater result = %+v", result)
|
||||
}
|
||||
selfUnit, err := os.ReadFile(result.SelfUnitPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read self unit: %v", err)
|
||||
}
|
||||
if text := string(selfUnit); !strings.Contains(text, "update-host-agent-loop") || !strings.Contains(text, "--current-version 0.1.0-host") {
|
||||
t.Fatalf("unexpected self unit:\n%s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWindowsHostAgentUpdateScriptTargetsWindowsService(t *testing.T) {
|
||||
cfg := WindowsInstallConfig{
|
||||
RuntimeConfig: RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
},
|
||||
NodeID: "node-1",
|
||||
AutoUpdateCurrentVersion: "0.1.2",
|
||||
AutoUpdateIntervalSeconds: 120,
|
||||
AutoUpdateInitialDelaySeconds: 7,
|
||||
AutoUpdateHealthTimeoutSeconds: 11,
|
||||
}
|
||||
result := WindowsInstallResult{
|
||||
NodeName: "win-a",
|
||||
StateDir: `C:\ProgramData\RAP\nodes\win-a`,
|
||||
NodeAgentPath: `C:\Program Files\RAP\win-a\rap-node-agent.exe`,
|
||||
TaskName: "RAP Node Agent win-a",
|
||||
}
|
||||
script := windowsHostAgentUpdateScript(`C:\Program Files\RAP\win-a\rap-host-agent.exe`, cfg, result)
|
||||
for _, want := range []string{
|
||||
":loop",
|
||||
"rap-host-agent.exe.next",
|
||||
"update-loop --backend-url",
|
||||
"--backend-url \"http://control/api/v1\"",
|
||||
"--cluster-id \"cluster-1\"",
|
||||
"--node-id \"node-1\"",
|
||||
"--state-dir \"C:\\ProgramData\\RAP\\nodes\\win-a\"",
|
||||
"--install-type windows_service",
|
||||
"--binary-path \"C:\\Program Files\\RAP\\win-a\\rap-node-agent.exe\"",
|
||||
"--host-agent-binary-path \"C:\\Program Files\\RAP\\win-a\\rap-host-agent.exe\"",
|
||||
"--windows-task-name \"RAP Node Agent win-a\"",
|
||||
"--current-version 0.1.2",
|
||||
"--host-agent-current-version 0.1.2",
|
||||
"--interval-seconds 120",
|
||||
"timeout /t 120",
|
||||
} {
|
||||
if !strings.Contains(script, want) {
|
||||
t.Fatalf("script missing %q:\n%s", want, script)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestWindowsInstallReplaceAllowsExistingNodeWithoutJoinToken(t *testing.T) {
|
||||
result, err := (WindowsManager{}).Install(context.Background(), WindowsInstallConfig{
|
||||
RuntimeConfig: RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
NodeName: "win-a",
|
||||
},
|
||||
InstallDir: `C:\Program Files\RAP\win-a`,
|
||||
Replace: true,
|
||||
DryRun: true,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("replace install should not require join token: %v", err)
|
||||
}
|
||||
if result.NodeName != "win-a" || result.NodeAgentPath == "" {
|
||||
t.Fatalf("unexpected dry-run result: %+v", result)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWindowsRepairUpdaterStartsFromUnknownVersion(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
source := filepath.Join(dir, "rap-host-agent.exe")
|
||||
if err := os.WriteFile(source, []byte("binary"), 0o755); err != nil {
|
||||
t.Fatalf("write source: %v", err)
|
||||
}
|
||||
result, err := installWindowsHostAgentUpdater(context.Background(), WindowsManager{Runner: &recordingRunner{}}, WindowsInstallResult{
|
||||
NodeName: "win-a",
|
||||
InstallDir: dir,
|
||||
StateDir: dir,
|
||||
NodeAgentPath: filepath.Join(dir, "rap-node-agent.exe"),
|
||||
TaskName: "RAP Node Agent win-a",
|
||||
StartupMode: "user-task",
|
||||
}, WindowsInstallConfig{
|
||||
RuntimeConfig: RuntimeConfig{
|
||||
BackendURL: "http://control/api/v1",
|
||||
ClusterID: "cluster-1",
|
||||
},
|
||||
Replace: true,
|
||||
AutoUpdateEnabled: true,
|
||||
HostAgentSourcePath: source,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("install updater: %v", err)
|
||||
}
|
||||
script, err := os.ReadFile(filepath.Join(result.InstallDir, "rap-host-agent-update.cmd"))
|
||||
if err != nil {
|
||||
t.Fatalf("read updater script: %v", err)
|
||||
}
|
||||
if !strings.Contains(string(script), "--current-version 0.0.0") {
|
||||
t.Fatalf("repair updater should force unknown current version:\n%s", script)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,947 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
|
||||
)
|
||||
|
||||
// Product names, install types and state-file names used by the update flow.
const (
	// DefaultUpdateProduct is the product UpdateRequest.Normalize falls back
	// to when a request names none.
	DefaultUpdateProduct = "rap-node-agent"
	// HostAgentUpdateProduct names the host agent itself as the update target.
	HostAgentUpdateProduct = "rap-host-agent"
	// DefaultUpdateInstallType is assumed when a request gives no install
	// type; it is also the only artifact install type
	// DockerManager.ApplyUpdate accepts.
	DefaultUpdateInstallType = "docker"
	// BinaryUpdateInstallType is the install type for which
	// UpdateRequest.Validate requires a binary path and a systemd unit
	// (unless the product is the host agent).
	BinaryUpdateInstallType = "linux_binary"
	// WindowsUpdateInstallType is the install type for which
	// UpdateRequest.Validate requires a binary path and a Windows task name.
	WindowsUpdateInstallType = "windows_service"
	// UpdateStateFileName is presumably the file, inside the state directory,
	// that holds the persisted UpdateState — confirm against the state helpers.
	UpdateStateFileName = "host-update-state.json"
	// UpdateTriggerFileName is presumably the file, inside the state
	// directory, that holds the latest UpdateTrigger — confirm against the
	// trigger helpers.
	UpdateTriggerFileName = "update-trigger.json"
)
|
||||
|
||||
// ErrNodeIdentityNotReady is returned by resolveUpdateRequest when no node ID
// was supplied and the identity file in the state directory is missing or has
// a blank node ID. RunUpdateLoop treats it as "try again later" rather than as
// a failure.
var ErrNodeIdentityNotReady = errors.New("node identity is not approved yet")
|
||||
|
||||
// UpdateRequest describes one update check for a node: where the backend is,
// which node and product are being updated, and how the product is installed
// on this host. Normalize fills defaults; Validate lists missing settings.
type UpdateRequest struct {
	BackendURL string // backend API base URL; Normalize strips trailing slashes
	ClusterID  string
	// NodeID may be empty when StateDir is set; resolveUpdateRequest then
	// reads it from the identity file in StateDir.
	NodeID   string
	StateDir string
	Product  string // defaults to DefaultUpdateProduct
	// CurrentVersion is sent to the backend; resolveUpdateRequest overrides
	// it with the persisted update state when one exists for Product.
	CurrentVersion  string
	OS              string // defaults to runtime.GOOS
	Arch            string // defaults to runtime.GOARCH
	InstallType     string // defaults to DefaultUpdateInstallType
	Channel         string // optional; only sent to the backend when non-empty
	ContainerName   string // defaults to DefaultContainerName
	BinaryPath      string // required for the Windows and linux_binary install types
	WindowsTaskName string // required for the Windows install type
	SystemdUnitName string // required for the linux_binary install type
	// HealthTimeout bounds the post-update health wait; zero is replaced by
	// 30 seconds in Normalize and negative values are rejected by Validate.
	HealthTimeout time.Duration
	// DryRun makes ApplyUpdate stop after planning, before changing the container.
	DryRun bool
	// AllowProductionMesh permits plans that have production forwarding enabled.
	AllowProductionMesh bool
}
|
||||
|
||||
// UpdateResult summarises what one update attempt decided and did.
type UpdateResult struct {
	Action          string // plan action as returned by the backend, e.g. "update"
	Reason          string // plan reason as returned by the backend
	TargetVersion   string
	ContainerName   string
	PreviousImageID string // image of the container before the update
	// NewImage is the image the container was (or would be) moved to;
	// RunUpdateLoop also logs it as the "binary" of a host-agent update.
	NewImage      string
	ContainerID   string
	Loaded        bool // copied from the install result
	Replaced      bool // copied from the install result
	RolledBack    bool // set when a rollback was allowed and returned no error
	RestartNeeded bool // when set on a host-agent update, RunUpdateLoop returns
}
|
||||
|
||||
// UpdateLoopConfig configures DockerManager.RunUpdateLoop.
type UpdateLoopConfig struct {
	Request      UpdateRequest // node-agent update request applied on every run
	Interval     time.Duration // pause between runs; zero means one hour
	InitialDelay time.Duration // optional pause before the first run
	Jitter       float64       // must be within [0, 1]; applied to every sleep
	MaxRuns      int           // stop after this many runs; zero or less means unlimited
	StopOnError  bool          // return the first non-identity error instead of continuing
	// HostAgentUpdateEnabled additionally runs a host-agent update after each
	// node-agent run, based on HostAgentUpdateRequest with empty fields
	// filled from Request.
	HostAgentUpdateEnabled bool
	HostAgentUpdateRequest HostAgentUpdateRequest
	// Logf receives progress lines; nil disables logging.
	Logf func(format string, args ...any)
}
|
||||
|
||||
// UpdateState is the record ApplyUpdate persists after a successful update.
// resolveUpdateRequest reads it back to learn the installed version of a
// product.
type UpdateState struct {
	Product        string    `json:"product"`
	CurrentVersion string    `json:"current_version"`
	TargetVersion  string    `json:"target_version,omitempty"`
	ContainerName  string    `json:"container_name,omitempty"`
	Image          string    `json:"image,omitempty"`
	UpdatedAt      time.Time `json:"updated_at"`
}
|
||||
|
||||
// UpdateTrigger is the JSON shape of an update trigger notification.
// NOTE(review): presumably stored as UpdateTriggerFileName and consulted by
// the update loop through its Generation field — the reader is not part of
// this section, confirm there.
type UpdateTrigger struct {
	SchemaVersion       string    `json:"schema_version"`
	Generation          string    `json:"generation"`
	Products            []string  `json:"products,omitempty"`
	Reason              string    `json:"reason,omitempty"`
	DeliveryMode        string    `json:"delivery_mode,omitempty"`
	SubscriptionStatus  string    `json:"subscription_status,omitempty"`
	UpdateServiceNodeID string    `json:"update_service_node_id,omitempty"`
	UpdateServiceStatus string    `json:"update_service_status,omitempty"`
	FallbackPollSeconds int       `json:"fallback_poll_seconds,omitempty"`
	ObservedAt          time.Time `json:"observed_at"`
}
|
||||
|
||||
// NodeUpdatePlanResponse is the envelope FetchNodeUpdatePlan decodes from the
// backend; the plan itself sits under the "node_update_plan" key.
type NodeUpdatePlanResponse struct {
	Plan NodeUpdatePlan `json:"node_update_plan"`
}
|
||||
|
||||
// NodeUpdatePlan is the backend's decision for one node and product.
type NodeUpdatePlan struct {
	SchemaVersion  string `json:"schema_version"`
	ClusterID      string `json:"cluster_id"`
	NodeID         string `json:"node_id"`
	Product        string `json:"product"`
	CurrentVersion string `json:"current_version,omitempty"`
	// Action is "update" when there is something to apply; ApplyUpdate treats
	// every other value as a no-op.
	Action          string `json:"action"`
	Reason          string `json:"reason"`
	TargetVersion   string `json:"target_version,omitempty"`
	Channel         string `json:"channel,omitempty"`
	Strategy        string `json:"strategy,omitempty"`
	RollbackAllowed bool   `json:"rollback_allowed"`
	// HealthWindowSec, when positive, replaces the default 30-second health
	// timeout in ApplyUpdate.
	HealthWindowSec int `json:"health_window_seconds,omitempty"`
	// Artifact must be present for an "update" action.
	Artifact           *ReleaseArtifact `json:"artifact,omitempty"`
	AuthorityPayload   json.RawMessage  `json:"authority_payload,omitempty"`
	AuthoritySignature json.RawMessage  `json:"authority_signature,omitempty"`
	// ProductionForwarding makes ApplyUpdate refuse the plan unless the
	// request sets AllowProductionMesh.
	ProductionForwarding bool `json:"production_forwarding"`
}
|
||||
|
||||
// ReleaseArtifact describes the downloadable payload attached to an update
// plan. ApplyUpdate uses its URLs, SHA256, SizeBytes and InstallType.
type ReleaseArtifact struct {
	ID          string          `json:"id"`
	ReleaseID   string          `json:"release_id"`
	ClusterID   string          `json:"cluster_id"`
	Product     string          `json:"product"`
	Version     string          `json:"version"`
	OS          string          `json:"os"`
	Arch        string          `json:"arch"`
	InstallType string          `json:"install_type"`
	Kind        string          `json:"kind"`
	URL         string          `json:"url"`
	URLs        []string        `json:"urls,omitempty"`
	SHA256      string          `json:"sha256"`
	SizeBytes   int64           `json:"size_bytes"`
	Signature   *string         `json:"signature,omitempty"`
	Metadata    json.RawMessage `json:"metadata"`
	CreatedAt   time.Time       `json:"created_at"`
}
|
||||
|
||||
// NodeUpdateStatusRequest is the JSON body ReportNodeUpdateStatus posts to the
// backend to record progress of an update attempt.
type NodeUpdateStatusRequest struct {
	Product         string         `json:"product"`
	CurrentVersion  string         `json:"current_version,omitempty"`
	TargetVersion   string         `json:"target_version,omitempty"`
	Phase           string         `json:"phase"`  // e.g. "planned", "download", "health_check"
	Status          string         `json:"status"` // e.g. "accepted", "started", "succeeded", "failed"
	AttemptID       string         `json:"attempt_id,omitempty"`
	ErrorMessage    *string        `json:"error_message,omitempty"`
	RollbackVersion *string        `json:"rollback_version,omitempty"`
	Payload         map[string]any `json:"payload,omitempty"`
	// NOTE(review): encoding/json's omitempty never omits a struct value, so a
	// zero ObservedAt is still serialised (as year 1).
	ObservedAt time.Time `json:"observed_at,omitempty"`
}
|
||||
|
||||
// dockerInspectContainer is the subset of one `docker inspect` JSON entry that
// runtimeConfigFromContainer decodes in order to rebuild a container's runtime
// configuration.
type dockerInspectContainer struct {
	ID string `json:"Id"`
	// Image is the top-level image field; ApplyUpdate records it as the
	// previous image and rollbackContainer reinstalls it.
	Image  string `json:"Image"`
	Config struct {
		Image string   `json:"Image"` // image the container was configured with
		Env   []string `json:"Env"`   // environment entries, parsed with envMap
	} `json:"Config"`
	HostConfig struct {
		Privileged  bool     `json:"Privileged"`
		NetworkMode string   `json:"NetworkMode"`
		CapAdd      []string `json:"CapAdd"`
		Devices     []struct {
			PathOnHost        string `json:"PathOnHost"`
			PathInContainer   string `json:"PathInContainer"`
			CgroupPermissions string `json:"CgroupPermissions"`
		} `json:"Devices"`
		RestartPolicy struct {
			Name string `json:"Name"`
		} `json:"RestartPolicy"`
	} `json:"HostConfig"`
	Mounts []struct {
		Source      string `json:"Source"`
		Destination string `json:"Destination"`
	} `json:"Mounts"`
	State struct {
		Running bool `json:"Running"`
	} `json:"State"`
}
|
||||
|
||||
func (req UpdateRequest) Normalize() UpdateRequest {
|
||||
req.BackendURL = strings.TrimRight(strings.TrimSpace(req.BackendURL), "/")
|
||||
req.ClusterID = strings.TrimSpace(req.ClusterID)
|
||||
req.NodeID = strings.TrimSpace(req.NodeID)
|
||||
req.StateDir = strings.TrimSpace(req.StateDir)
|
||||
req.Product = firstNonEmpty(req.Product, DefaultUpdateProduct)
|
||||
req.OS = firstNonEmpty(req.OS, runtime.GOOS)
|
||||
req.Arch = firstNonEmpty(req.Arch, runtime.GOARCH)
|
||||
req.InstallType = firstNonEmpty(req.InstallType, DefaultUpdateInstallType)
|
||||
req.Channel = strings.TrimSpace(req.Channel)
|
||||
req.ContainerName = firstNonEmpty(req.ContainerName, DefaultContainerName)
|
||||
req.BinaryPath = strings.TrimSpace(req.BinaryPath)
|
||||
req.WindowsTaskName = strings.TrimSpace(req.WindowsTaskName)
|
||||
req.SystemdUnitName = strings.TrimSpace(req.SystemdUnitName)
|
||||
if req.HealthTimeout == 0 {
|
||||
req.HealthTimeout = 30 * time.Second
|
||||
}
|
||||
return req
|
||||
}
|
||||
|
||||
func (req UpdateRequest) Validate() error {
|
||||
req = req.Normalize()
|
||||
var missing []string
|
||||
if req.BackendURL == "" {
|
||||
missing = append(missing, "backend-url")
|
||||
}
|
||||
if req.ClusterID == "" {
|
||||
missing = append(missing, "cluster-id")
|
||||
}
|
||||
if req.NodeID == "" && req.StateDir == "" {
|
||||
missing = append(missing, "node-id-or-state-dir")
|
||||
}
|
||||
if req.InstallType == WindowsUpdateInstallType {
|
||||
if req.BinaryPath == "" {
|
||||
missing = append(missing, "binary-path")
|
||||
}
|
||||
if req.WindowsTaskName == "" {
|
||||
missing = append(missing, "windows-task-name")
|
||||
}
|
||||
} else if req.InstallType == BinaryUpdateInstallType && req.Product != HostAgentUpdateProduct {
|
||||
if req.BinaryPath == "" {
|
||||
missing = append(missing, "binary-path")
|
||||
}
|
||||
if req.SystemdUnitName == "" {
|
||||
missing = append(missing, "systemd-unit")
|
||||
}
|
||||
} else if req.ContainerName == "" {
|
||||
missing = append(missing, "container-name")
|
||||
}
|
||||
if len(missing) > 0 {
|
||||
return fmt.Errorf("missing required update settings: %s", strings.Join(missing, ", "))
|
||||
}
|
||||
if req.HealthTimeout < 0 {
|
||||
return errors.New("health timeout must not be negative")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ApplyUpdate performs one update check for a Docker-installed node agent and,
// if the backend plan says "update", replaces the container with the planned
// image.
//
// Steps: resolve the request (node ID, persisted version), fetch the plan,
// run preflight checks, inspect the running container to recover its runtime
// configuration, reinstall with the new image, and wait for the container to
// report running. Progress is reported to the backend along the way; those
// reports are best effort and their errors are deliberately ignored. On an
// install or health-check failure a rollback to the previous image is
// attempted when the plan allows it.
//
// The returned UpdateResult is populated as far as the attempt got, even when
// an error is returned after planning.
func (m DockerManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (UpdateResult, error) {
	req = req.Normalize()
	var err error
	req, err = resolveUpdateRequest(req)
	if err != nil {
		return UpdateResult{}, err
	}
	runner := m.Runner
	if runner == nil {
		runner = ExecRunner{}
	}
	docker := firstNonEmpty(m.Binary, "docker")

	plan, err := FetchNodeUpdatePlan(ctx, req)
	if err != nil {
		return UpdateResult{}, err
	}
	// Normalize sets an unset timeout to 30s, so this comparison means "the
	// caller kept the default"; only then does the plan's health window apply.
	// An explicitly requested 30s is indistinguishable and is overridden too.
	if plan.HealthWindowSec > 0 && req.HealthTimeout == 30*time.Second {
		req.HealthTimeout = time.Duration(plan.HealthWindowSec) * time.Second
	}
	result := UpdateResult{
		Action:        plan.Action,
		Reason:        plan.Reason,
		TargetVersion: plan.TargetVersion,
		ContainerName: req.ContainerName,
	}
	// Anything other than "update" is a no-op for this host.
	if plan.Action != "update" {
		if !req.DryRun {
			_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromNoopPlan(req, plan))
		}
		return result, nil
	}
	// Preflight checks.
	// NOTE(review): unlike the no-op path above, these failures are reported
	// to the backend even when req.DryRun is set — confirm that is intended.
	if plan.ProductionForwarding && !req.AllowProductionMesh {
		err := errors.New("refusing update plan with production forwarding enabled")
		_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
		return result, err
	}
	if plan.Artifact == nil {
		err := errors.New("update plan has no artifact")
		_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
		return result, err
	}
	if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != DefaultUpdateInstallType {
		err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
		_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
		return result, err
	}
	// A dry run stops here: it reports which image would be used and touches
	// nothing on the host.
	if req.DryRun {
		result.NewImage = artifactImage(*plan.Artifact, "")
		return result, nil
	}
	_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
		Product:        req.Product,
		CurrentVersion: req.CurrentVersion,
		TargetVersion:  plan.TargetVersion,
		Phase:          "planned",
		Status:         "accepted",
		AttemptID:      updateAttemptID(plan),
		ObservedAt:     time.Now().UTC(),
		Payload:        map[string]any{"strategy": plan.Strategy, "reason": plan.Reason},
	})

	// Recover the running container's configuration so the replacement keeps
	// the same settings and only the image changes.
	current, cfg, err := m.runtimeConfigFromContainer(ctx, runner, docker, req.ContainerName)
	if err != nil {
		_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "inspect", "failed", err))
		return result, err
	}
	result.PreviousImageID = current.Image
	cfg.BackendURL = firstNonEmpty(cfg.BackendURL, req.BackendURL)
	cfg.ClusterID = firstNonEmpty(cfg.ClusterID, req.ClusterID)
	cfg.ContainerName = req.ContainerName
	cfg.Image = artifactImage(*plan.Artifact, cfg.Image)
	cfg.ImageArtifactURLs = artifactURLsForBackend(*plan.Artifact, req.BackendURL)
	cfg.ImageArtifactSHA256 = plan.Artifact.SHA256
	cfg.ImageArtifactSizeBytes = plan.Artifact.SizeBytes
	cfg.Replace = true
	// The replacement is installed without a join token.
	cfg.JoinToken = ""
	result.NewImage = cfg.Image

	_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
		Product:        req.Product,
		CurrentVersion: req.CurrentVersion,
		TargetVersion:  plan.TargetVersion,
		Phase:          "download",
		Status:         "started",
		AttemptID:      updateAttemptID(plan),
		ObservedAt:     time.Now().UTC(),
		Payload:        map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": cfg.ImageArtifactURLs, "image": cfg.Image},
	})
	installed, err := m.Install(ctx, cfg)
	if err != nil {
		_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err))
		// NOTE(review): a rollback failure is dropped here; the caller only
		// sees the install error and RolledBack == false.
		rollbackErr := m.rollbackContainer(ctx, runner, docker, cfg, current, plan.RollbackAllowed)
		if rollbackErr == nil && plan.RollbackAllowed {
			result.RolledBack = true
		}
		return result, err
	}
	result.Loaded = installed.Loaded
	result.Replaced = installed.Replaced
	result.ContainerID = installed.ContainerID

	if err := m.waitContainerRunning(ctx, runner, docker, req.ContainerName, req.HealthTimeout); err != nil {
		_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "health_check", "failed", err))
		rollbackErr := m.rollbackContainer(ctx, runner, docker, cfg, current, plan.RollbackAllowed)
		if rollbackErr == nil && plan.RollbackAllowed {
			result.RolledBack = true
		}
		return result, err
	}
	_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
		Product:        req.Product,
		CurrentVersion: req.CurrentVersion,
		TargetVersion:  plan.TargetVersion,
		Phase:          "health_check",
		Status:         "succeeded",
		AttemptID:      updateAttemptID(plan),
		ObservedAt:     time.Now().UTC(),
		Payload:        map[string]any{"container_id": installed.ContainerID, "image": cfg.Image},
	})
	// Persist the new version so later runs report it as current. The write
	// is best effort; its error is ignored.
	_ = saveUpdateState(req.StateDir, UpdateState{
		Product:        req.Product,
		CurrentVersion: plan.TargetVersion,
		TargetVersion:  plan.TargetVersion,
		ContainerName:  req.ContainerName,
		Image:          cfg.Image,
		UpdatedAt:      time.Now().UTC(),
	})
	return result, nil
}
|
||||
|
||||
// RunUpdateLoop repeatedly applies node-agent updates (and, when enabled,
// host-agent updates) until the context ends, MaxRuns is reached, an error
// occurs with StopOnError set, or a host-agent update reports that a restart
// is needed.
//
// A zero Interval means one hour. Negative Interval or InitialDelay and a
// Jitter outside [0, 1] are rejected up front. While the node identity is not
// available yet (ErrNodeIdentityNotReady) the loop logs, sleeps one interval
// and retries; that case is never treated as a failure and skips the
// host-agent step for that run.
//
// It returns nil on a normal stop (MaxRuns reached or restart needed), the
// context error when a sleep is interrupted, or the update error when
// StopOnError is set.
func (m DockerManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig) error {
	req := cfg.Request.Normalize()
	if err := req.Validate(); err != nil {
		return err
	}
	if cfg.Interval == 0 {
		cfg.Interval = time.Hour
	}
	if cfg.Interval < 0 {
		return errors.New("update loop interval must not be negative")
	}
	if cfg.InitialDelay < 0 {
		return errors.New("update loop initial delay must not be negative")
	}
	if cfg.Jitter < 0 || cfg.Jitter > 1 {
		return errors.New("update loop jitter must be between 0 and 1")
	}
	logf := cfg.Logf
	if logf == nil {
		logf = func(string, ...any) {}
	}
	if cfg.InitialDelay > 0 {
		if err := sleepContext(ctx, jitteredDuration(cfg.InitialDelay, cfg.Jitter)); err != nil {
			return err
		}
	}
	runs := 0
	// Remember the trigger generation seen at start-up; the sleep at the
	// bottom of the loop is handed a pointer to it.
	// NOTE(review): presumably so that only a newer trigger cuts the wait
	// short — confirm in sleepUntilUpdateIntervalOrTrigger.
	lastTriggerGeneration := currentUpdateTriggerGeneration(req.StateDir)
	for {
		runs++
		result, err := m.ApplyUpdate(ctx, req)
		if err != nil {
			if errors.Is(err, ErrNodeIdentityNotReady) {
				logf("update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, req.StateDir)
				if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
					return nil
				}
				if err := sleepContext(ctx, jitteredDuration(cfg.Interval, cfg.Jitter)); err != nil {
					return err
				}
				continue
			}
			logf("update_loop run=%d status=failed error=%v", runs, err)
			if cfg.StopOnError {
				return err
			}
		} else {
			logf("update_loop run=%d action=%s reason=%s target=%s container=%s loaded=%t replaced=%t rolled_back=%t",
				runs,
				result.Action,
				result.Reason,
				result.TargetVersion,
				result.ContainerName,
				result.Loaded,
				result.Replaced,
				result.RolledBack,
			)
			// After a successful update, later runs report the new version.
			if result.Action == "update" && result.TargetVersion != "" && !result.RolledBack {
				req.CurrentVersion = result.TargetVersion
			}
		}
		if cfg.HostAgentUpdateEnabled {
			// Build the host-agent request fresh each run, filling any empty
			// field from the node-agent request.
			hostReq := cfg.HostAgentUpdateRequest
			hostReq.BackendURL = firstNonEmpty(hostReq.BackendURL, req.BackendURL)
			hostReq.ClusterID = firstNonEmpty(hostReq.ClusterID, req.ClusterID)
			hostReq.NodeID = firstNonEmpty(hostReq.NodeID, req.NodeID)
			hostReq.StateDir = firstNonEmpty(hostReq.StateDir, req.StateDir)
			hostReq.Channel = firstNonEmpty(hostReq.Channel, req.Channel)
			hostReq.CurrentVersion = firstNonEmpty(hostReq.CurrentVersion, req.CurrentVersion)
			hostReq.OS = firstNonEmpty(hostReq.OS, req.OS)
			hostReq.Arch = firstNonEmpty(hostReq.Arch, req.Arch)
			hostReq.InstallType = firstNonEmpty(hostReq.InstallType, hostAgentInstallTypeFor(req.InstallType))
			result, err := m.ApplyHostAgentUpdate(ctx, hostReq)
			if err != nil {
				if errors.Is(err, ErrNodeIdentityNotReady) {
					logf("host_agent_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, hostReq.StateDir)
				} else {
					logf("host_agent_update_loop run=%d status=failed error=%v", runs, err)
					if cfg.StopOnError {
						return err
					}
				}
			} else {
				logf("host_agent_update_loop run=%d action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t",
					runs,
					result.Action,
					result.Reason,
					result.TargetVersion,
					result.NewImage,
					result.Replaced,
					result.RestartNeeded,
				)
				// Stored on cfg (not hostReq) so the next iteration's fresh
				// copy starts from the updated version.
				if result.Action == "update" && result.TargetVersion != "" {
					cfg.HostAgentUpdateRequest.CurrentVersion = result.TargetVersion
				}
				// The loop ends cleanly when the host agent was replaced and
				// needs a restart.
				if result.RestartNeeded {
					return nil
				}
			}
		}
		if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
			return nil
		}
		if err := sleepUntilUpdateIntervalOrTrigger(ctx, req.StateDir, jitteredDuration(cfg.Interval, cfg.Jitter), &lastTriggerGeneration); err != nil {
			return err
		}
	}
}
|
||||
|
||||
func FetchNodeUpdatePlan(ctx context.Context, req UpdateRequest) (NodeUpdatePlan, error) {
|
||||
var err error
|
||||
req, err = resolveUpdateRequest(req)
|
||||
if err != nil {
|
||||
return NodeUpdatePlan{}, err
|
||||
}
|
||||
values := url.Values{}
|
||||
values.Set("product", req.Product)
|
||||
values.Set("current_version", req.CurrentVersion)
|
||||
values.Set("os", req.OS)
|
||||
values.Set("arch", req.Arch)
|
||||
values.Set("install_type", req.InstallType)
|
||||
if req.Channel != "" {
|
||||
values.Set("channel", req.Channel)
|
||||
}
|
||||
endpoint := fmt.Sprintf("%s/clusters/%s/nodes/%s/updates/plan?%s", req.BackendURL, url.PathEscape(req.ClusterID), url.PathEscape(req.NodeID), values.Encode())
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
|
||||
if err != nil {
|
||||
return NodeUpdatePlan{}, err
|
||||
}
|
||||
resp, err := http.DefaultClient.Do(httpReq)
|
||||
if err != nil {
|
||||
return NodeUpdatePlan{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return NodeUpdatePlan{}, fmt.Errorf("fetch update plan: %s", resp.Status)
|
||||
}
|
||||
var out NodeUpdatePlanResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
|
||||
return NodeUpdatePlan{}, err
|
||||
}
|
||||
return out.Plan, nil
|
||||
}
|
||||
|
||||
func resolveUpdateRequest(req UpdateRequest) (UpdateRequest, error) {
|
||||
req = req.Normalize()
|
||||
if err := req.Validate(); err != nil {
|
||||
return UpdateRequest{}, err
|
||||
}
|
||||
if req.NodeID == "" {
|
||||
identity, err := state.Load(filepath.Join(req.StateDir, state.FileName))
|
||||
if err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
return UpdateRequest{}, ErrNodeIdentityNotReady
|
||||
}
|
||||
return UpdateRequest{}, err
|
||||
}
|
||||
if strings.TrimSpace(identity.NodeID) == "" {
|
||||
return UpdateRequest{}, ErrNodeIdentityNotReady
|
||||
}
|
||||
req.NodeID = strings.TrimSpace(identity.NodeID)
|
||||
if req.ClusterID == "" {
|
||||
req.ClusterID = strings.TrimSpace(identity.ClusterID)
|
||||
}
|
||||
}
|
||||
if updateState, err := loadUpdateState(req.StateDir, req.Product); err == nil && updateState.Product == req.Product && updateState.CurrentVersion != "" {
|
||||
req.CurrentVersion = updateState.CurrentVersion
|
||||
}
|
||||
return req, nil
|
||||
}
|
||||
|
||||
func ReportNodeUpdateStatus(ctx context.Context, backendURL, clusterID, nodeID string, request NodeUpdateStatusRequest) error {
|
||||
backendURL = strings.TrimRight(strings.TrimSpace(backendURL), "/")
|
||||
endpoint := fmt.Sprintf("%s/clusters/%s/nodes/%s/updates/status", backendURL, url.PathEscape(clusterID), url.PathEscape(nodeID))
|
||||
body, err := json.Marshal(request)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
httpReq.Header.Set("Content-Type", "application/json")
|
||||
resp, err := http.DefaultClient.Do(httpReq)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return fmt.Errorf("report update status: %s", resp.Status)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m DockerManager) runtimeConfigFromContainer(ctx context.Context, runner CommandRunner, docker, containerName string) (dockerInspectContainer, RuntimeConfig, error) {
|
||||
out, err := runner.Run(ctx, docker, "inspect", containerName)
|
||||
if err != nil {
|
||||
return dockerInspectContainer{}, RuntimeConfig{}, err
|
||||
}
|
||||
var inspected []dockerInspectContainer
|
||||
if err := json.Unmarshal([]byte(out), &inspected); err != nil {
|
||||
return dockerInspectContainer{}, RuntimeConfig{}, err
|
||||
}
|
||||
if len(inspected) == 0 {
|
||||
return dockerInspectContainer{}, RuntimeConfig{}, fmt.Errorf("container %q not found", containerName)
|
||||
}
|
||||
env := envMap(inspected[0].Config.Env)
|
||||
cfg := RuntimeConfig{
|
||||
BackendURL: env["RAP_BACKEND_URL"],
|
||||
ClusterID: env["RAP_CLUSTER_ID"],
|
||||
NodeName: firstNonEmpty(env["RAP_NODE_NAME"], containerName),
|
||||
Image: inspected[0].Config.Image,
|
||||
ContainerName: containerName,
|
||||
StateDir: hostStateDir(inspected[0]),
|
||||
Network: firstNonEmpty(inspected[0].HostConfig.NetworkMode, DefaultNetwork),
|
||||
RestartPolicy: firstNonEmpty(inspected[0].HostConfig.RestartPolicy.Name, "unless-stopped"),
|
||||
WorkloadSupervisionEnabled: parseBool(env["RAP_WORKLOAD_SUPERVISION_ENABLED"]),
|
||||
MeshSyntheticRuntimeEnabled: true,
|
||||
MeshProductionForwardingEnabled: parseBool(env["RAP_MESH_PRODUCTION_FORWARDING_ENABLED"]),
|
||||
MeshListenAddr: env["RAP_MESH_LISTEN_ADDR"],
|
||||
MeshListenPortMode: env["RAP_MESH_LISTEN_PORT_MODE"],
|
||||
MeshListenAutoPortStart: parseInt(env["RAP_MESH_LISTEN_AUTO_PORT_START"]),
|
||||
MeshListenAutoPortEnd: parseInt(env["RAP_MESH_LISTEN_AUTO_PORT_END"]),
|
||||
MeshAdvertiseEndpoint: env["RAP_MESH_ADVERTISE_ENDPOINT"],
|
||||
MeshAdvertiseEndpointsJSON: env["RAP_MESH_ADVERTISE_ENDPOINTS_JSON"],
|
||||
MeshAdvertiseTransport: env["RAP_MESH_ADVERTISE_TRANSPORT"],
|
||||
MeshConnectivityMode: env["RAP_MESH_CONNECTIVITY_MODE"],
|
||||
MeshNATType: env["RAP_MESH_NAT_TYPE"],
|
||||
MeshRegion: env["RAP_MESH_REGION"],
|
||||
HeartbeatIntervalSeconds: parseInt(env["RAP_HEARTBEAT_INTERVAL_SECONDS"]),
|
||||
EnrollmentPollIntervalSeconds: parseInt(env["RAP_ENROLLMENT_POLL_INTERVAL_SECONDS"]),
|
||||
EnrollmentPollTimeoutSeconds: parseInt(env["RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS"]),
|
||||
ProductionObservationSinkCap: parseInt(env["RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY"]),
|
||||
DockerVPNGatewayEnabled: dockerInspectHasVPNGatewayRuntime(inspected[0]),
|
||||
}
|
||||
return inspected[0], cfg.Normalize(), nil
|
||||
}
|
||||
|
||||
func dockerInspectHasVPNGatewayRuntime(container dockerInspectContainer) bool {
|
||||
hasNetAdmin := false
|
||||
for _, cap := range container.HostConfig.CapAdd {
|
||||
if strings.EqualFold(strings.TrimSpace(cap), "NET_ADMIN") {
|
||||
hasNetAdmin = true
|
||||
break
|
||||
}
|
||||
}
|
||||
hasTun := false
|
||||
for _, device := range container.HostConfig.Devices {
|
||||
if device.PathOnHost == "/dev/net/tun" || device.PathInContainer == "/dev/net/tun" {
|
||||
hasTun = true
|
||||
break
|
||||
}
|
||||
}
|
||||
return (container.HostConfig.Privileged || hasNetAdmin) && hasTun
|
||||
}
|
||||
|
||||
func (m DockerManager) waitContainerRunning(ctx context.Context, runner CommandRunner, docker, containerName string, timeout time.Duration) error {
|
||||
deadline := time.Now().Add(timeout)
|
||||
for {
|
||||
out, err := runner.Run(ctx, docker, "inspect", "--format", "{{.State.Running}}", containerName)
|
||||
if err == nil && strings.TrimSpace(out) == "true" {
|
||||
return nil
|
||||
}
|
||||
if timeout == 0 || time.Now().After(deadline) {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return fmt.Errorf("container %q is not running", containerName)
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-time.After(time.Second):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (m DockerManager) rollbackContainer(ctx context.Context, runner CommandRunner, docker string, cfg RuntimeConfig, previous dockerInspectContainer, allowed bool) error {
|
||||
if !allowed || strings.TrimSpace(previous.Image) == "" {
|
||||
return nil
|
||||
}
|
||||
rollbackCfg := cfg
|
||||
rollbackCfg.Image = previous.Image
|
||||
rollbackCfg.ImageArtifactURLs = nil
|
||||
rollbackCfg.ImageArtifactSHA256 = ""
|
||||
rollbackCfg.ImageArtifactSizeBytes = 0
|
||||
rollbackCfg.Replace = true
|
||||
_, err := m.Install(ctx, rollbackCfg)
|
||||
if err == nil {
|
||||
_, _ = runner.Run(ctx, docker, "inspect", "--format", "{{.State.Running}}", cfg.ContainerName)
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func artifactImage(artifact ReleaseArtifact, fallback string) string {
|
||||
if len(artifact.Metadata) > 0 {
|
||||
var metadata struct {
|
||||
Image string `json:"image"`
|
||||
}
|
||||
if err := json.Unmarshal(artifact.Metadata, &metadata); err == nil && strings.TrimSpace(metadata.Image) != "" {
|
||||
return strings.TrimSpace(metadata.Image)
|
||||
}
|
||||
}
|
||||
if artifact.InstallType == DefaultUpdateInstallType && artifact.Product != "" && artifact.Version != "" {
|
||||
return strings.TrimSpace(artifact.Product) + ":" + strings.TrimSpace(artifact.Version)
|
||||
}
|
||||
return firstNonEmpty(fallback, DefaultImage)
|
||||
}
|
||||
|
||||
func artifactURLs(artifact ReleaseArtifact) []string {
|
||||
out := make([]string, 0, 1+len(artifact.URLs))
|
||||
for _, raw := range append([]string{artifact.URL}, artifact.URLs...) {
|
||||
raw = strings.TrimSpace(raw)
|
||||
if raw == "" || containsArtifactURL(out, raw) {
|
||||
continue
|
||||
}
|
||||
out = append(out, raw)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func artifactURLsForBackend(artifact ReleaseArtifact, backendURL string) []string {
|
||||
urls := artifactURLs(artifact)
|
||||
base, err := url.Parse(strings.TrimSpace(backendURL))
|
||||
if err != nil || base.Scheme == "" || base.Host == "" {
|
||||
return urls
|
||||
}
|
||||
origin := base.Scheme + "://" + base.Host
|
||||
out := make([]string, 0, len(urls))
|
||||
for _, raw := range urls {
|
||||
if strings.HasPrefix(raw, "/") {
|
||||
raw = origin + raw
|
||||
}
|
||||
if !containsArtifactURL(out, raw) {
|
||||
out = append(out, raw)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// containsArtifactURL reports whether value occurs in values, using exact
// string comparison.
func containsArtifactURL(values []string, value string) bool {
	for i := 0; i < len(values); i++ {
		if values[i] == value {
			return true
		}
	}
	return false
}
|
||||
|
||||
func statusFromError(req UpdateRequest, plan NodeUpdatePlan, phase, status string, err error) NodeUpdateStatusRequest {
|
||||
message := err.Error()
|
||||
return NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: phase,
|
||||
Status: status,
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ErrorMessage: &message,
|
||||
ObservedAt: time.Now().UTC(),
|
||||
}
|
||||
}
|
||||
|
||||
func statusFromNoopPlan(req UpdateRequest, plan NodeUpdatePlan) NodeUpdateStatusRequest {
|
||||
return NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: "plan",
|
||||
Status: "noop",
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ObservedAt: time.Now().UTC(),
|
||||
Payload: map[string]any{
|
||||
"action": plan.Action,
|
||||
"reason": plan.Reason,
|
||||
"strategy": plan.Strategy,
|
||||
"channel": plan.Channel,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func updateAttemptID(plan NodeUpdatePlan) string {
|
||||
parts := []string{plan.NodeID, plan.Product, plan.TargetVersion}
|
||||
if plan.Artifact != nil {
|
||||
parts = append(parts, plan.Artifact.ID)
|
||||
}
|
||||
return strings.Join(parts, ":")
|
||||
}
|
||||
|
||||
// envMap converts "KEY=value" entries into a map. Each entry is split at its
// first "="; entries without "=" are skipped, and a later entry overrides an
// earlier one with the same key. The result is never nil.
func envMap(items []string) map[string]string {
	vars := make(map[string]string, len(items))
	for _, entry := range items {
		sep := strings.Index(entry, "=")
		if sep < 0 {
			continue
		}
		vars[entry[:sep]] = entry[sep+1:]
	}
	return vars
}
|
||||
|
||||
func hostStateDir(container dockerInspectContainer) string {
|
||||
for _, mount := range container.Mounts {
|
||||
if mount.Destination == "/var/lib/rap-node-agent" && mount.Source != "" {
|
||||
return mount.Source
|
||||
}
|
||||
}
|
||||
return DefaultStateDir
|
||||
}
|
||||
|
||||
// parseBool reports whether value, after trimming and lower-casing, is one of
// the accepted truthy spellings: "1", "true", "yes", "y" or "on". Everything
// else, including the empty string, is false.
func parseBool(value string) bool {
	normalized := strings.ToLower(strings.TrimSpace(value))
	for _, truthy := range [...]string{"1", "true", "yes", "y", "on"} {
		if normalized == truthy {
			return true
		}
	}
	return false
}
|
||||
|
||||
// parseInt parses value as a base-10 integer after trimming surrounding
// whitespace. The conversion error is deliberately discarded, so input that is
// not a number yields 0.
func parseInt(value string) int {
	trimmed := strings.TrimSpace(value)
	parsed, _ := strconv.Atoi(trimmed)
	return parsed
}
|
||||
|
||||
func loadUpdateState(stateDir string, product string) (UpdateState, error) {
|
||||
stateDir = strings.TrimSpace(stateDir)
|
||||
if stateDir == "" {
|
||||
return UpdateState{}, os.ErrNotExist
|
||||
}
|
||||
product = firstNonEmpty(normalizeUpdateProductToken(product), DefaultUpdateProduct)
|
||||
payload, err := os.ReadFile(updateStatePath(stateDir, product))
|
||||
if err != nil && product == DefaultUpdateProduct {
|
||||
payload, err = os.ReadFile(filepath.Join(stateDir, UpdateStateFileName))
|
||||
}
|
||||
if err != nil {
|
||||
return UpdateState{}, err
|
||||
}
|
||||
var item UpdateState
|
||||
if err := json.Unmarshal(payload, &item); err != nil {
|
||||
return UpdateState{}, err
|
||||
}
|
||||
item.Product = firstNonEmpty(item.Product, product)
|
||||
return item, nil
|
||||
}
|
||||
|
||||
func saveUpdateState(stateDir string, item UpdateState) error {
|
||||
stateDir = strings.TrimSpace(stateDir)
|
||||
if stateDir == "" || item.CurrentVersion == "" {
|
||||
return nil
|
||||
}
|
||||
item.Product = firstNonEmpty(item.Product, DefaultUpdateProduct)
|
||||
if item.UpdatedAt.IsZero() {
|
||||
item.UpdatedAt = time.Now().UTC()
|
||||
}
|
||||
if err := os.MkdirAll(stateDir, 0o700); err != nil {
|
||||
return err
|
||||
}
|
||||
payload, err := json.MarshalIndent(item, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(updateStatePath(stateDir, item.Product), payload, 0o600)
|
||||
}
|
||||
|
||||
func updateStatePath(stateDir, product string) string {
|
||||
product = normalizeUpdateProductToken(firstNonEmpty(product, DefaultUpdateProduct))
|
||||
if product == "" || product == DefaultUpdateProduct {
|
||||
return filepath.Join(stateDir, UpdateStateFileName)
|
||||
}
|
||||
return filepath.Join(stateDir, "host-update-state-"+product+".json")
|
||||
}
|
||||
|
||||
func UpdateTriggerPath(stateDir string) string {
|
||||
return filepath.Join(strings.TrimSpace(stateDir), UpdateTriggerFileName)
|
||||
}
|
||||
|
||||
func SaveUpdateTrigger(stateDir string, trigger UpdateTrigger) error {
|
||||
stateDir = strings.TrimSpace(stateDir)
|
||||
trigger.Generation = strings.TrimSpace(trigger.Generation)
|
||||
if stateDir == "" || trigger.Generation == "" {
|
||||
return nil
|
||||
}
|
||||
if trigger.SchemaVersion == "" {
|
||||
trigger.SchemaVersion = "rap.node_update_trigger.v1"
|
||||
}
|
||||
if trigger.ObservedAt.IsZero() {
|
||||
trigger.ObservedAt = time.Now().UTC()
|
||||
}
|
||||
if err := os.MkdirAll(stateDir, 0o700); err != nil {
|
||||
return err
|
||||
}
|
||||
payload, err := json.MarshalIndent(trigger, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(UpdateTriggerPath(stateDir), payload, 0o600)
|
||||
}
|
||||
|
||||
func currentUpdateTriggerGeneration(stateDir string) string {
|
||||
payload, err := os.ReadFile(UpdateTriggerPath(stateDir))
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
var trigger UpdateTrigger
|
||||
if err := json.Unmarshal(payload, &trigger); err != nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(trigger.Generation)
|
||||
}
|
||||
|
||||
func CurrentUpdateTriggerGenerationForNodeAgent(stateDir string) string {
|
||||
return currentUpdateTriggerGeneration(stateDir)
|
||||
}
|
||||
|
||||
// normalizeUpdateProductToken trims and lower-cases value, then removes every
// rune that is not an ASCII lowercase letter, an ASCII digit, '-', '_' or '.'.
func normalizeUpdateProductToken(value string) string {
	keep := func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z', r >= '0' && r <= '9':
			return r
		case r == '-', r == '_', r == '.':
			return r
		}
		// A negative result makes strings.Map drop the rune.
		return -1
	}
	return strings.Map(keep, strings.ToLower(strings.TrimSpace(value)))
}
|
||||
|
||||
// sleepContext blocks for duration or until ctx is done, whichever happens
// first. It returns ctx.Err() when the context ends the wait and nil
// otherwise. A non-positive duration returns nil immediately without
// consulting ctx.
func sleepContext(ctx context.Context, duration time.Duration) error {
	if duration > 0 {
		wake := time.NewTimer(duration)
		defer wake.Stop()

		select {
		case <-wake.C:
		case <-ctx.Done():
			return ctx.Err()
		}
	}
	return nil
}
|
||||
|
||||
func sleepUntilUpdateIntervalOrTrigger(ctx context.Context, stateDir string, duration time.Duration, lastGeneration *string) error {
|
||||
if duration <= 0 {
|
||||
return nil
|
||||
}
|
||||
deadline := time.NewTimer(duration)
|
||||
defer deadline.Stop()
|
||||
ticker := time.NewTicker(5 * time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-deadline.C:
|
||||
return nil
|
||||
case <-ticker.C:
|
||||
generation := currentUpdateTriggerGeneration(stateDir)
|
||||
if generation != "" && lastGeneration != nil && generation != *lastGeneration {
|
||||
*lastGeneration = generation
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// jitteredDuration returns base shifted by a uniformly random offset in
// [-base*jitter, +base*jitter].
//
// base is returned unchanged when base or jitter is non-positive, when the
// spread rounds down to zero, or when the spread is NaN or too large to sample
// safely. A result that would exceed the maximum Duration is clamped to it.
func jitteredDuration(base time.Duration, jitter float64) time.Duration {
	if base <= 0 || jitter <= 0 {
		return base
	}

	const (
		// Largest spread for which spread*2+1 still fits in an int64;
		// rand.Int63n panics when its argument is not positive.
		maxSpread   = int64(1<<62 - 1)
		maxDuration = time.Duration(1<<63 - 1)
	)

	scaled := float64(base) * jitter
	// scaled != scaled is true only for NaN. Converting NaN or an
	// out-of-range float to int64 is implementation-defined, so bail out
	// before the conversion.
	if scaled != scaled || scaled >= float64(maxSpread) {
		return base
	}
	spread := int64(scaled)
	if spread <= 0 {
		return base
	}

	offset := time.Duration(rand.Int63n(spread*2+1) - spread)
	if offset > 0 && base > maxDuration-offset {
		return maxDuration
	}
	return base + offset
}
|
||||
@@ -0,0 +1,672 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
|
||||
)
|
||||
|
||||
// updateRunner is a test double for the command runner used by the update
// tests. It records every invocation and answers with canned output.
type updateRunner struct {
	// calls holds one entry per invocation: the command name followed by
	// its arguments.
	calls [][]string
	// healthOkay selects the reply to `inspect --format ...` probes:
	// "true" when set, "false" otherwise.
	healthOkay bool
	// inspectJSON is returned for a plain two-argument `inspect` call.
	inspectJSON string
}
|
||||
|
||||
func TestArtifactURLsForBackendResolvesControlPlaneRelativeDownloads(t *testing.T) {
|
||||
urls := artifactURLsForBackend(ReleaseArtifact{
|
||||
URL: "/downloads/rap-node-agent-0.2.92.tar",
|
||||
URLs: []string{"/downloads/mirror.tar", "https://cdn.example.test/agent.tar"},
|
||||
}, "http://control.example.test:18080/api/v1")
|
||||
want := []string{
|
||||
"http://control.example.test:18080/downloads/rap-node-agent-0.2.92.tar",
|
||||
"http://control.example.test:18080/downloads/mirror.tar",
|
||||
"https://cdn.example.test/agent.tar",
|
||||
}
|
||||
if len(urls) != len(want) {
|
||||
t.Fatalf("urls = %#v", urls)
|
||||
}
|
||||
for i := range want {
|
||||
if urls[i] != want[i] {
|
||||
t.Fatalf("urls[%d] = %q, want %q; all=%#v", i, urls[i], want[i], urls)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (r *updateRunner) Run(_ context.Context, name string, args ...string) (string, error) {
|
||||
r.calls = append(r.calls, append([]string{name}, args...))
|
||||
if len(args) >= 2 && args[0] == "inspect" && args[1] == "--format" {
|
||||
if r.healthOkay {
|
||||
return "true\n", nil
|
||||
}
|
||||
return "false\n", nil
|
||||
}
|
||||
if len(args) == 2 && args[0] == "inspect" {
|
||||
return r.inspectJSON, nil
|
||||
}
|
||||
if len(args) >= 2 && args[0] == "image" && args[1] == "inspect" {
|
||||
return "[]", nil
|
||||
}
|
||||
if len(args) > 0 && args[0] == "run" {
|
||||
return "updated-container\n", nil
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func TestApplyUpdateFetchesPlanLoadsImageAndRecreatesContainer(t *testing.T) {
|
||||
artifactBody := []byte("fake docker image tar")
|
||||
statuses := []NodeUpdateStatusRequest{}
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch {
|
||||
case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"node_update_plan": map[string]any{
|
||||
"schema_version": "rap.node_update_plan.v1",
|
||||
"cluster_id": "cluster-1",
|
||||
"node_id": "node-1",
|
||||
"product": "rap-node-agent",
|
||||
"current_version": "0.1.0-old",
|
||||
"action": "update",
|
||||
"reason": "matching_release_available",
|
||||
"target_version": "0.1.0-new",
|
||||
"rollback_allowed": true,
|
||||
"health_window_seconds": 1,
|
||||
"production_forwarding": false,
|
||||
"artifact": map[string]any{
|
||||
"id": "artifact-1",
|
||||
"product": "rap-node-agent",
|
||||
"version": "0.1.0-new",
|
||||
"os": "linux",
|
||||
"arch": "amd64",
|
||||
"install_type": "docker",
|
||||
"url": serverArtifactURL(r),
|
||||
"sha256": "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
|
||||
"size_bytes": len(artifactBody),
|
||||
"metadata": map[string]any{"image": "rap-node-agent:test-new"},
|
||||
},
|
||||
},
|
||||
})
|
||||
case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
|
||||
var status NodeUpdateStatusRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&status); err != nil {
|
||||
t.Fatalf("decode status: %v", err)
|
||||
}
|
||||
statuses = append(statuses, status)
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
|
||||
case r.Method == http.MethodGet && r.URL.Path == "/artifact.tar":
|
||||
_, _ = w.Write(artifactBody)
|
||||
default:
|
||||
t.Fatalf("unexpected request %s %s", r.Method, r.URL.String())
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
runner := &updateRunner{healthOkay: true, inspectJSON: dockerInspectFixture(server.URL)}
|
||||
result, err := (DockerManager{Runner: runner}).ApplyUpdate(context.Background(), UpdateRequest{
|
||||
BackendURL: server.URL,
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-1",
|
||||
CurrentVersion: "0.1.0-old",
|
||||
ContainerName: "rap-node-agent-node-1",
|
||||
HealthTimeout: time.Second,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("apply update: %v", err)
|
||||
}
|
||||
if result.Action != "update" || !result.Loaded || !result.Replaced || result.NewImage != "rap-node-agent:test-new" {
|
||||
t.Fatalf("unexpected result: %+v", result)
|
||||
}
|
||||
joined := strings.Join(flattenCalls(runner.calls), "\x00")
|
||||
for _, want := range []string{"inspect\x00rap-node-agent-node-1", "load\x00-i", "rm\x00-f\x00rap-node-agent-node-1", "run\x00-d", "RAP_NODE_NAME=node-a"} {
|
||||
if !strings.Contains(joined, want) {
|
||||
t.Fatalf("missing docker call part %q in %#v", want, runner.calls)
|
||||
}
|
||||
}
|
||||
if len(statuses) != 3 || statuses[0].Phase != "planned" || statuses[1].Phase != "download" || statuses[2].Status != "succeeded" {
|
||||
t.Fatalf("statuses = %+v", statuses)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyUpdatePreservesDockerVPNGatewayRuntime(t *testing.T) {
|
||||
previousStatHostPath := statHostPath
|
||||
statHostPath = func(string) (os.FileInfo, error) { return nil, nil }
|
||||
t.Cleanup(func() { statHostPath = previousStatHostPath })
|
||||
|
||||
artifactBody := []byte("fake docker image tar")
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch {
|
||||
case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"node_update_plan": map[string]any{
|
||||
"schema_version": "rap.node_update_plan.v1",
|
||||
"cluster_id": "cluster-1",
|
||||
"node_id": "node-1",
|
||||
"product": "rap-node-agent",
|
||||
"current_version": "0.2.7",
|
||||
"action": "update",
|
||||
"reason": "matching_release_available",
|
||||
"target_version": "0.2.8",
|
||||
"rollback_allowed": true,
|
||||
"health_window_seconds": 1,
|
||||
"artifact": map[string]any{
|
||||
"id": "artifact-1",
|
||||
"product": "rap-node-agent",
|
||||
"version": "0.2.8",
|
||||
"os": "linux",
|
||||
"arch": "amd64",
|
||||
"install_type": "docker",
|
||||
"url": serverArtifactURL(r),
|
||||
"sha256": "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
|
||||
"size_bytes": len(artifactBody),
|
||||
"metadata": map[string]any{"image": "rap-node-agent:test-new"},
|
||||
},
|
||||
},
|
||||
})
|
||||
case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
|
||||
case r.Method == http.MethodGet && r.URL.Path == "/artifact.tar":
|
||||
_, _ = w.Write(artifactBody)
|
||||
default:
|
||||
t.Fatalf("unexpected request %s %s", r.Method, r.URL.String())
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
runner := &updateRunner{healthOkay: true, inspectJSON: dockerInspectFixtureWithVPNGatewayRuntime()}
|
||||
result, err := (DockerManager{Runner: runner}).ApplyUpdate(context.Background(), UpdateRequest{
|
||||
BackendURL: server.URL,
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-1",
|
||||
CurrentVersion: "0.2.7",
|
||||
ContainerName: "rap-node-agent-node-1",
|
||||
HealthTimeout: time.Second,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("ApplyUpdate failed: %v", err)
|
||||
}
|
||||
if !result.Replaced {
|
||||
t.Fatalf("expected replacement")
|
||||
}
|
||||
joined := strings.Join(flattenCalls(runner.calls), "\x00")
|
||||
for _, want := range []string{"--privileged", "--cap-add\x00NET_ADMIN", "--device\x00/dev/net/tun:/dev/net/tun"} {
|
||||
if !strings.Contains(joined, want) {
|
||||
t.Fatalf("docker run did not preserve %q in %#v", want, runner.calls)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyUpdateNoopsWithoutDockerWhenPlanHasNoAction(t *testing.T) {
|
||||
statuses := []NodeUpdateStatusRequest{}
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch {
|
||||
case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"node_update_plan": map[string]any{
|
||||
"cluster_id": "cluster-1",
|
||||
"node_id": "node-1",
|
||||
"product": "rap-node-agent",
|
||||
"current_version": "0.1.3",
|
||||
"action": "none",
|
||||
"reason": "already_current",
|
||||
"target_version": "0.1.3",
|
||||
},
|
||||
})
|
||||
case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
|
||||
var status NodeUpdateStatusRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&status); err != nil {
|
||||
t.Fatalf("decode status: %v", err)
|
||||
}
|
||||
statuses = append(statuses, status)
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
|
||||
default:
|
||||
t.Fatalf("unexpected request %s %s", r.Method, r.URL.String())
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
runner := &updateRunner{}
|
||||
result, err := (DockerManager{Runner: runner}).ApplyUpdate(context.Background(), UpdateRequest{
|
||||
BackendURL: server.URL,
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-1",
|
||||
CurrentVersion: "0.1.3",
|
||||
ContainerName: "rap-node-agent-node-1",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("apply update: %v", err)
|
||||
}
|
||||
if result.Action != "none" || result.Reason != "already_current" {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
if len(runner.calls) != 0 {
|
||||
t.Fatalf("docker should not be called, got %#v", runner.calls)
|
||||
}
|
||||
if len(statuses) != 1 || statuses[0].Phase != "plan" || statuses[0].Status != "noop" || statuses[0].TargetVersion != "0.1.3" {
|
||||
t.Fatalf("statuses = %+v", statuses)
|
||||
}
|
||||
}
|
||||
|
||||
func TestWindowsApplyUpdateNoopReportsTaskStatus(t *testing.T) {
|
||||
statuses := []NodeUpdateStatusRequest{}
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch {
|
||||
case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"node_update_plan": map[string]any{
|
||||
"cluster_id": "cluster-1",
|
||||
"node_id": "node-1",
|
||||
"product": "rap-node-agent",
|
||||
"current_version": "0.1.3",
|
||||
"action": "none",
|
||||
"reason": "already_current",
|
||||
"target_version": "0.1.3",
|
||||
},
|
||||
})
|
||||
case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
|
||||
var status NodeUpdateStatusRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&status); err != nil {
|
||||
t.Fatalf("decode status: %v", err)
|
||||
}
|
||||
statuses = append(statuses, status)
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
|
||||
default:
|
||||
t.Fatalf("unexpected request %s %s", r.Method, r.URL.String())
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
result, err := (WindowsManager{Runner: &updateRunner{}}).ApplyUpdate(context.Background(), UpdateRequest{
|
||||
BackendURL: server.URL,
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-1",
|
||||
CurrentVersion: "0.1.3",
|
||||
InstallType: WindowsUpdateInstallType,
|
||||
BinaryPath: `C:\Program Files\RAP\node\rap-node-agent.exe`,
|
||||
WindowsTaskName: "RAP Node Agent node",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("windows apply update: %v", err)
|
||||
}
|
||||
if result.Action != "none" || result.Reason != "already_current" {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
if len(statuses) != 1 || statuses[0].Phase != "plan" || statuses[0].Status != "noop" {
|
||||
t.Fatalf("statuses = %+v", statuses)
|
||||
}
|
||||
if statuses[0].Payload["task"] != "RAP Node Agent node" {
|
||||
t.Fatalf("status payload = %+v", statuses[0].Payload)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunUpdateLoopAdvancesCurrentVersionAfterSuccessfulUpdate(t *testing.T) {
|
||||
artifactBody := []byte("fake docker image tar")
|
||||
planRequests := []string{}
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch {
|
||||
case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
|
||||
current := r.URL.Query().Get("current_version")
|
||||
planRequests = append(planRequests, current)
|
||||
action := "update"
|
||||
reason := "matching_release_available"
|
||||
if current == "0.1.0-new" {
|
||||
action = "none"
|
||||
reason = "already_current"
|
||||
}
|
||||
plan := map[string]any{
|
||||
"cluster_id": "cluster-1",
|
||||
"node_id": "node-1",
|
||||
"product": "rap-node-agent",
|
||||
"current_version": current,
|
||||
"action": action,
|
||||
"reason": reason,
|
||||
"target_version": "0.1.0-new",
|
||||
"rollback_allowed": true,
|
||||
"production_forwarding": false,
|
||||
}
|
||||
if action == "update" {
|
||||
plan["artifact"] = map[string]any{
|
||||
"id": "artifact-1",
|
||||
"product": "rap-node-agent",
|
||||
"version": "0.1.0-new",
|
||||
"os": "linux",
|
||||
"arch": "amd64",
|
||||
"install_type": "docker",
|
||||
"url": serverArtifactURL(r),
|
||||
"sha256": "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
|
||||
"size_bytes": len(artifactBody),
|
||||
"metadata": map[string]any{"image": "rap-node-agent:test-new"},
|
||||
}
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{"node_update_plan": plan})
|
||||
case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
|
||||
case r.Method == http.MethodGet && r.URL.Path == "/artifact.tar":
|
||||
_, _ = w.Write(artifactBody)
|
||||
default:
|
||||
t.Fatalf("unexpected request %s %s", r.Method, r.URL.String())
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
runner := &updateRunner{healthOkay: true, inspectJSON: dockerInspectFixture(server.URL)}
|
||||
err := (DockerManager{Runner: runner}).RunUpdateLoop(context.Background(), UpdateLoopConfig{
|
||||
Request: UpdateRequest{
|
||||
BackendURL: server.URL,
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-1",
|
||||
CurrentVersion: "0.1.0-old",
|
||||
ContainerName: "rap-node-agent-node-1",
|
||||
HealthTimeout: time.Second,
|
||||
},
|
||||
Interval: time.Millisecond,
|
||||
MaxRuns: 2,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("run update loop: %v", err)
|
||||
}
|
||||
if strings.Join(planRequests, ",") != "0.1.0-old,0.1.0-new" {
|
||||
t.Fatalf("plan current versions = %#v", planRequests)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunUpdateLoopReportsHostAgentStatusWhenEnabled(t *testing.T) {
|
||||
statuses := []NodeUpdateStatusRequest{}
|
||||
planProducts := []string{}
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch {
|
||||
case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
|
||||
product := r.URL.Query().Get("product")
|
||||
planProducts = append(planProducts, product)
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"node_update_plan": map[string]any{
|
||||
"cluster_id": "cluster-1",
|
||||
"node_id": "node-1",
|
||||
"product": product,
|
||||
"current_version": "0.1.3",
|
||||
"action": "none",
|
||||
"reason": "already_current",
|
||||
"target_version": "0.1.3",
|
||||
"rollback_allowed": true,
|
||||
"production_forwarding": false,
|
||||
},
|
||||
})
|
||||
case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
|
||||
var status NodeUpdateStatusRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&status); err != nil {
|
||||
t.Fatalf("decode status: %v", err)
|
||||
}
|
||||
statuses = append(statuses, status)
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
|
||||
default:
|
||||
t.Fatalf("unexpected request %s %s", r.Method, r.URL.String())
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
err := (DockerManager{}).RunUpdateLoop(context.Background(), UpdateLoopConfig{
|
||||
Request: UpdateRequest{
|
||||
BackendURL: server.URL,
|
||||
ClusterID: "cluster-1",
|
||||
NodeID: "node-1",
|
||||
CurrentVersion: "0.1.3",
|
||||
ContainerName: "rap-node-agent-node-1",
|
||||
},
|
||||
HostAgentUpdateEnabled: true,
|
||||
HostAgentUpdateRequest: HostAgentUpdateRequest{
|
||||
CurrentVersion: "0.1.3",
|
||||
BinaryPath: filepath.Join(t.TempDir(), "rap-host-agent"),
|
||||
},
|
||||
MaxRuns: 1,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("run update loop: %v", err)
|
||||
}
|
||||
if strings.Join(planProducts, ",") != "rap-node-agent,rap-host-agent" {
|
||||
t.Fatalf("plan products = %#v", planProducts)
|
||||
}
|
||||
if len(statuses) != 2 || statuses[0].Product != "rap-node-agent" || statuses[1].Product != "rap-host-agent" {
|
||||
t.Fatalf("statuses = %+v", statuses)
|
||||
}
|
||||
if statuses[1].Phase != "plan" || statuses[1].Status != "noop" {
|
||||
t.Fatalf("host-agent status = %+v", statuses[1])
|
||||
}
|
||||
}
|
||||
|
||||
func TestFetchNodeUpdatePlanResolvesNodeIDAndVersionFromStateDir(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
if err := state.Save(filepath.Join(dir, state.FileName), state.Identity{
|
||||
NodeID: "node-from-state",
|
||||
ClusterID: "cluster-1",
|
||||
NodeName: "node-a",
|
||||
}); err != nil {
|
||||
t.Fatalf("save identity: %v", err)
|
||||
}
|
||||
if err := saveUpdateState(dir, UpdateState{
|
||||
Product: "rap-node-agent",
|
||||
CurrentVersion: "0.1.0-state",
|
||||
}); err != nil {
|
||||
t.Fatalf("save update state: %v", err)
|
||||
}
|
||||
var gotPath string
|
||||
var gotCurrent string
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
gotPath = r.URL.Path
|
||||
gotCurrent = r.URL.Query().Get("current_version")
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"node_update_plan": map[string]any{
|
||||
"cluster_id": "cluster-1",
|
||||
"node_id": "node-from-state",
|
||||
"product": "rap-node-agent",
|
||||
"action": "none",
|
||||
"reason": "already_current",
|
||||
},
|
||||
})
|
||||
}))
|
||||
defer server.Close()
|
||||
if _, err := FetchNodeUpdatePlan(context.Background(), UpdateRequest{
|
||||
BackendURL: server.URL,
|
||||
ClusterID: "cluster-1",
|
||||
StateDir: dir,
|
||||
CurrentVersion: "0.1.0-flag",
|
||||
}); err != nil {
|
||||
t.Fatalf("fetch plan: %v", err)
|
||||
}
|
||||
if !strings.Contains(gotPath, "/nodes/node-from-state/updates/plan") || gotCurrent != "0.1.0-state" {
|
||||
t.Fatalf("path/current = %q/%q", gotPath, gotCurrent)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyHostAgentUpdateDownloadsAndReplacesBinary(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
if err := state.Save(filepath.Join(dir, state.FileName), state.Identity{
|
||||
NodeID: "node-1",
|
||||
ClusterID: "cluster-1",
|
||||
NodeName: "node-a",
|
||||
}); err != nil {
|
||||
t.Fatalf("save identity: %v", err)
|
||||
}
|
||||
binaryPath := filepath.Join(dir, "rap-host-agent")
|
||||
artifactBody := []byte("new host agent binary")
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
switch {
|
||||
case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "/updates/plan"):
|
||||
if r.URL.Query().Get("product") != HostAgentUpdateProduct || r.URL.Query().Get("install_type") != BinaryUpdateInstallType {
|
||||
t.Fatalf("unexpected query: %s", r.URL.RawQuery)
|
||||
}
|
||||
_ = json.NewEncoder(w).Encode(map[string]any{
|
||||
"node_update_plan": map[string]any{
|
||||
"cluster_id": "cluster-1",
|
||||
"node_id": "node-1",
|
||||
"product": HostAgentUpdateProduct,
|
||||
"action": "update",
|
||||
"reason": "matching_release_available",
|
||||
"target_version": "0.1.0-host-new",
|
||||
"rollback_allowed": false,
|
||||
"production_forwarding": false,
|
||||
"artifact": map[string]any{
|
||||
"id": "artifact-host-1",
|
||||
"product": HostAgentUpdateProduct,
|
||||
"version": "0.1.0-host-new",
|
||||
"os": "linux",
|
||||
"arch": "amd64",
|
||||
"install_type": BinaryUpdateInstallType,
|
||||
"url": serverArtifactURL(r),
|
||||
"sha256": "adc549d9e66ef64a507dd6880590d31309e16a3be965a92d849edd103cfb1815",
|
||||
"size_bytes": len(artifactBody),
|
||||
},
|
||||
},
|
||||
})
|
||||
case r.Method == http.MethodPost && strings.HasSuffix(r.URL.Path, "/updates/status"):
|
||||
w.WriteHeader(http.StatusOK)
|
||||
_, _ = w.Write([]byte(`{"node_update_status":{"id":"status-1"}}`))
|
||||
case r.Method == http.MethodGet && r.URL.Path == "/artifact.tar":
|
||||
_, _ = w.Write(artifactBody)
|
||||
default:
|
||||
t.Fatalf("unexpected request %s %s", r.Method, r.URL.String())
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
result, err := (DockerManager{}).ApplyHostAgentUpdate(context.Background(), HostAgentUpdateRequest{
|
||||
BackendURL: server.URL,
|
||||
ClusterID: "cluster-1",
|
||||
StateDir: dir,
|
||||
CurrentVersion: "0.1.0-host-old",
|
||||
BinaryPath: binaryPath,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("apply host-agent update: %v", err)
|
||||
}
|
||||
if !result.Replaced || !result.RestartNeeded {
|
||||
t.Fatalf("result = %+v", result)
|
||||
}
|
||||
payload, err := os.ReadFile(binaryPath)
|
||||
if err != nil || string(payload) != string(artifactBody) {
|
||||
t.Fatalf("binary payload = %q, %v", payload, err)
|
||||
}
|
||||
updateState, err := loadUpdateState(dir, HostAgentUpdateProduct)
|
||||
if err != nil {
|
||||
t.Fatalf("load update state: %v", err)
|
||||
}
|
||||
if updateState.Product != HostAgentUpdateProduct || updateState.CurrentVersion != "0.1.0-host-new" {
|
||||
t.Fatalf("update state = %+v", updateState)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateStateIsProductScoped(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
if err := saveUpdateState(dir, UpdateState{Product: DefaultUpdateProduct, CurrentVersion: "node-v"}); err != nil {
|
||||
t.Fatalf("save node state: %v", err)
|
||||
}
|
||||
if err := saveUpdateState(dir, UpdateState{Product: HostAgentUpdateProduct, CurrentVersion: "host-v"}); err != nil {
|
||||
t.Fatalf("save host state: %v", err)
|
||||
}
|
||||
nodeState, err := loadUpdateState(dir, DefaultUpdateProduct)
|
||||
if err != nil {
|
||||
t.Fatalf("load node state: %v", err)
|
||||
}
|
||||
hostState, err := loadUpdateState(dir, HostAgentUpdateProduct)
|
||||
if err != nil {
|
||||
t.Fatalf("load host state: %v", err)
|
||||
}
|
||||
if nodeState.CurrentVersion != "node-v" || hostState.CurrentVersion != "host-v" {
|
||||
t.Fatalf("states overlapped: node=%+v host=%+v", nodeState, hostState)
|
||||
}
|
||||
}
|
||||
|
||||
func TestArtifactImageDerivesDockerTagFromProductAndVersion(t *testing.T) {
|
||||
got := artifactImage(ReleaseArtifact{
|
||||
Product: "rap-node-agent",
|
||||
Version: "0.2.77",
|
||||
InstallType: DefaultUpdateInstallType,
|
||||
}, "rap-node-agent:old")
|
||||
if got != "rap-node-agent:0.2.77" {
|
||||
t.Fatalf("expected versioned docker image, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
// serverArtifactURL returns the absolute URL of "/artifact.tar" on the host
// that received r, using "https" when the request carries TLS state and
// "http" otherwise.
func serverArtifactURL(r *http.Request) string {
	if r.TLS == nil {
		return fmt.Sprintf("%s://%s/artifact.tar", "http", r.Host)
	}
	return fmt.Sprintf("%s://%s/artifact.tar", "https", r.Host)
}
|
||||
|
||||
// dockerInspectFixture returns a canned JSON array holding one container
// object (Id, Image, Config with Env, HostConfig, Mounts, State) for tests.
// The string argument is ignored.
// NOTE(review): the shape looks like `docker inspect` output — confirm against
// the code that consumes it.
func dockerInspectFixture(_ string) string {
|
||||
return `[
|
||||
{
|
||||
"Id": "old-container",
|
||||
"Image": "sha256:oldimage",
|
||||
"Config": {
|
||||
"Image": "rap-node-agent:test-old",
|
||||
"Env": [
|
||||
"RAP_BACKEND_URL=http://control/api/v1",
|
||||
"RAP_CLUSTER_ID=cluster-1",
|
||||
"RAP_NODE_NAME=node-a",
|
||||
"RAP_NODE_STATE_DIR=/var/lib/rap-node-agent",
|
||||
"RAP_HEARTBEAT_INTERVAL_SECONDS=15",
|
||||
"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS=5",
|
||||
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=0",
|
||||
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true",
|
||||
"RAP_MESH_LISTEN_ADDR=:19131"
|
||||
]
|
||||
},
|
||||
"HostConfig": {
|
||||
"NetworkMode": "host",
|
||||
"RestartPolicy": {"Name": "unless-stopped"}
|
||||
},
|
||||
"Mounts": [
|
||||
{"Source": "/var/lib/rap/nodes/node-a", "Destination": "/var/lib/rap-node-agent"}
|
||||
],
|
||||
"State": {"Running": true}
|
||||
}
|
||||
]`
|
||||
}
|
||||
|
||||
// dockerInspectFixtureWithVPNGatewayRuntime returns the same canned container
// JSON as dockerInspectFixture, with a HostConfig that additionally sets
// Privileged, adds the NET_ADMIN capability and maps the /dev/net/tun device.
// NOTE(review): presumably used to check that these runtime settings are
// carried over when a container is recreated — confirm against the callers.
func dockerInspectFixtureWithVPNGatewayRuntime() string {
|
||||
return `[
|
||||
{
|
||||
"Id": "old-container",
|
||||
"Image": "sha256:oldimage",
|
||||
"Config": {
|
||||
"Image": "rap-node-agent:test-old",
|
||||
"Env": [
|
||||
"RAP_BACKEND_URL=http://control/api/v1",
|
||||
"RAP_CLUSTER_ID=cluster-1",
|
||||
"RAP_NODE_NAME=node-a",
|
||||
"RAP_NODE_STATE_DIR=/var/lib/rap-node-agent",
|
||||
"RAP_HEARTBEAT_INTERVAL_SECONDS=15",
|
||||
"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS=5",
|
||||
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=0",
|
||||
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true",
|
||||
"RAP_MESH_LISTEN_ADDR=:19131"
|
||||
]
|
||||
},
|
||||
"HostConfig": {
|
||||
"NetworkMode": "host",
|
||||
"Privileged": true,
|
||||
"CapAdd": ["NET_ADMIN"],
|
||||
"Devices": [
|
||||
{"PathOnHost": "/dev/net/tun", "PathInContainer": "/dev/net/tun", "CgroupPermissions": "rwm"}
|
||||
],
|
||||
"RestartPolicy": {"Name": "unless-stopped"}
|
||||
},
|
||||
"Mounts": [
|
||||
{"Source": "/var/lib/rap/nodes/node-a", "Destination": "/var/lib/rap-node-agent"}
|
||||
],
|
||||
"State": {"Running": true}
|
||||
}
|
||||
]`
|
||||
}
|
||||
@@ -0,0 +1,368 @@
|
||||
package hostagent
|
||||
|
||||
import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"runtime"
	"strings"
)
|
||||
|
||||
// Default machine-wide locations used when a profile or config does not name
// its own directories. A per-node slug is appended to each of them.
const (
	// DefaultWindowsInstallDir is the parent of per-node install directories.
	DefaultWindowsInstallDir = `C:\Program Files\RAP`

	// DefaultWindowsStateRoot is the parent of per-node state directories.
	DefaultWindowsStateRoot = `C:\ProgramData\RAP\nodes`
)
|
||||
|
||||
type WindowsInstallConfig struct {
|
||||
RuntimeConfig RuntimeConfig
|
||||
NodeID string
|
||||
InstallDir string
|
||||
StartupMode string
|
||||
ArtifactURLs []string
|
||||
ArtifactSHA256 string
|
||||
ArtifactSizeBytes int64
|
||||
Replace bool
|
||||
DryRun bool
|
||||
AutoUpdateEnabled bool
|
||||
AutoUpdateCurrentVersion string
|
||||
AutoUpdateChannel string
|
||||
AutoUpdateIntervalSeconds int
|
||||
AutoUpdateInitialDelaySeconds int
|
||||
AutoUpdateHealthTimeoutSeconds int
|
||||
HostAgentSourcePath string
|
||||
}
|
||||
|
||||
// WindowsInstallResult describes what WindowsManager.Install planned or did.
type WindowsInstallResult struct {
	// NodeName is the normalized node name from the runtime config.
	NodeName string
	// InstallDir and StateDir are the directories finally used, which may
	// differ from the requested ones after a per-user fallback.
	InstallDir string
	StateDir   string
	// NodeAgentPath is "rap-node-agent.exe" inside InstallDir.
	NodeAgentPath string
	// WrapperPath is "rap-node-agent-run.cmd" inside InstallDir.
	WrapperPath string
	// StartupMode is the mode reported by the startup-task installation.
	StartupMode string
	// TaskName is the scheduled task name of the node agent.
	TaskName string
	// HostAgentPath is where the host-agent binary was installed, if any.
	HostAgentPath string
	// UpdaterTaskName is the scheduled task name of the updater, if any.
	UpdaterTaskName string

	// Downloaded is true when the node-agent binary was fetched and copied.
	Downloaded bool
	// Started and UpdaterStarted carry the "started" value returned when the
	// respective scheduled task was installed.
	Started        bool
	UpdaterStarted bool
	// AdminFallback is true when the install fell back to per-user
	// directories or to a user-level task.
	AdminFallback bool
}
|
||||
|
||||
type WindowsManager struct {
|
||||
Runner CommandRunner
|
||||
}
|
||||
|
||||
func WindowsInstallConfigFromProfile(profile WindowsInstallProfile) WindowsInstallConfig {
|
||||
stateDir := firstNonEmpty(profile.StateDir, filepath.Join(DefaultWindowsStateRoot, safeUnitSlug(profile.NodeName)))
|
||||
return WindowsInstallConfig{
|
||||
RuntimeConfig: RuntimeConfig{
|
||||
BackendURL: profile.BackendURL,
|
||||
ClusterID: profile.ClusterID,
|
||||
JoinToken: profile.JoinToken,
|
||||
NodeName: profile.NodeName,
|
||||
StateDir: stateDir,
|
||||
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
|
||||
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
|
||||
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
|
||||
MeshListenAddr: profile.MeshListenAddr,
|
||||
MeshListenPortMode: profile.MeshListenPortMode,
|
||||
MeshListenAutoPortStart: profile.MeshListenAutoPortStart,
|
||||
MeshListenAutoPortEnd: profile.MeshListenAutoPortEnd,
|
||||
MeshAdvertiseEndpoint: profile.MeshAdvertiseEndpoint,
|
||||
MeshAdvertiseEndpointsJSON: string(profile.MeshAdvertiseEndpointsJSON),
|
||||
MeshAdvertiseTransport: profile.MeshAdvertiseTransport,
|
||||
MeshConnectivityMode: profile.MeshConnectivityMode,
|
||||
MeshNATType: profile.MeshNATType,
|
||||
MeshRegion: profile.MeshRegion,
|
||||
HeartbeatIntervalSeconds: profile.HeartbeatIntervalSeconds,
|
||||
EnrollmentPollIntervalSeconds: profile.EnrollmentPollIntervalSeconds,
|
||||
EnrollmentPollTimeoutSeconds: profile.EnrollmentPollTimeoutSeconds,
|
||||
ProductionObservationSinkCap: profile.ProductionObservationSinkCapacity,
|
||||
},
|
||||
InstallDir: firstNonEmpty(profile.InstallDir, filepath.Join(DefaultWindowsInstallDir, safeUnitSlug(profile.NodeName))),
|
||||
StartupMode: firstNonEmpty(profile.StartupMode, "auto"),
|
||||
ArtifactURLs: binaryArtifactURLs(profile),
|
||||
ArtifactSHA256: binaryArtifactSHA256(profile),
|
||||
ArtifactSizeBytes: binaryArtifactSizeBytes(profile),
|
||||
Replace: true,
|
||||
AutoUpdateEnabled: true,
|
||||
}
|
||||
}
|
||||
|
||||
func (m WindowsManager) Install(ctx context.Context, cfg WindowsInstallConfig) (WindowsInstallResult, error) {
|
||||
cfg.NodeID = strings.TrimSpace(cfg.NodeID)
|
||||
if strings.TrimSpace(cfg.RuntimeConfig.StateDir) == "" {
|
||||
cfg.RuntimeConfig.StateDir = filepath.Join(DefaultWindowsStateRoot, safeUnitSlug(cfg.RuntimeConfig.NodeName))
|
||||
}
|
||||
cfg.RuntimeConfig.Replace = cfg.Replace
|
||||
cfg.RuntimeConfig = cfg.RuntimeConfig.Normalize()
|
||||
if err := cfg.RuntimeConfig.ValidateInstall(); err != nil {
|
||||
return WindowsInstallResult{}, err
|
||||
}
|
||||
cfg.StartupMode = strings.ToLower(firstNonEmpty(cfg.StartupMode, "auto"))
|
||||
noAdminPreferred := cfg.StartupMode == "user-task"
|
||||
cfg.InstallDir = firstNonEmpty(cfg.InstallDir, defaultWindowsInstallDir(cfg.RuntimeConfig.NodeName, noAdminPreferred))
|
||||
cfg.StartupMode = strings.ToLower(firstNonEmpty(cfg.StartupMode, "auto"))
|
||||
if noAdminPreferred && strings.HasPrefix(strings.ToLower(cfg.RuntimeConfig.StateDir), strings.ToLower(DefaultWindowsStateRoot)) {
|
||||
cfg.RuntimeConfig.StateDir = defaultWindowsStateDir(cfg.RuntimeConfig.NodeName, true)
|
||||
}
|
||||
result := WindowsInstallResult{
|
||||
NodeName: cfg.RuntimeConfig.NodeName,
|
||||
InstallDir: cfg.InstallDir,
|
||||
StateDir: cfg.RuntimeConfig.StateDir,
|
||||
NodeAgentPath: filepath.Join(cfg.InstallDir, "rap-node-agent.exe"),
|
||||
WrapperPath: filepath.Join(cfg.InstallDir, "rap-node-agent-run.cmd"),
|
||||
StartupMode: cfg.StartupMode,
|
||||
TaskName: "RAP Node Agent " + safeUnitSlug(cfg.RuntimeConfig.NodeName),
|
||||
}
|
||||
if cfg.DryRun {
|
||||
return result, nil
|
||||
}
|
||||
if runtime.GOOS != "windows" {
|
||||
return result, fmt.Errorf("windows install is only supported on windows hosts")
|
||||
}
|
||||
if err := os.MkdirAll(cfg.InstallDir, 0o755); err != nil {
|
||||
if cfg.StartupMode != "auto" || !isAccessDenied(err) {
|
||||
return result, err
|
||||
}
|
||||
cfg.InstallDir = defaultWindowsInstallDir(cfg.RuntimeConfig.NodeName, true)
|
||||
cfg.RuntimeConfig.StateDir = defaultWindowsStateDir(cfg.RuntimeConfig.NodeName, true)
|
||||
result.InstallDir = cfg.InstallDir
|
||||
result.StateDir = cfg.RuntimeConfig.StateDir
|
||||
result.NodeAgentPath = filepath.Join(cfg.InstallDir, "rap-node-agent.exe")
|
||||
result.WrapperPath = filepath.Join(cfg.InstallDir, "rap-node-agent-run.cmd")
|
||||
if err := os.MkdirAll(cfg.InstallDir, 0o755); err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.AdminFallback = true
|
||||
}
|
||||
if err := os.MkdirAll(cfg.RuntimeConfig.StateDir, 0o700); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if len(cfg.ArtifactURLs) > 0 && (cfg.Replace || !fileExists(result.NodeAgentPath)) {
|
||||
m.stopExistingNodeAgent(ctx, result.TaskName, result.NodeAgentPath)
|
||||
path, err := downloadFirstArtifact(ctx, cfg.ArtifactURLs, cfg.ArtifactSHA256, cfg.ArtifactSizeBytes)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
defer os.Remove(path)
|
||||
if err := copyFile(path, result.NodeAgentPath, 0o755); err != nil {
|
||||
m.stopExistingNodeAgent(ctx, result.TaskName, result.NodeAgentPath)
|
||||
if retryErr := copyFile(path, result.NodeAgentPath, 0o755); retryErr == nil {
|
||||
result.Downloaded = true
|
||||
goto binaryReady
|
||||
}
|
||||
return result, err
|
||||
}
|
||||
result.Downloaded = true
|
||||
}
|
||||
binaryReady:
|
||||
if !fileExists(result.NodeAgentPath) {
|
||||
return result, fmt.Errorf("node-agent binary is missing at %s and no artifact was available", result.NodeAgentPath)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(cfg.InstallDir, "rap-node-agent.env.cmd"), []byte(windowsEnvScript(cfg.RuntimeConfig)), 0o600); err != nil {
|
||||
return result, err
|
||||
}
|
||||
if err := os.WriteFile(result.WrapperPath, []byte(windowsWrapperScript(result.NodeAgentPath, filepath.Join(cfg.InstallDir, "rap-node-agent.env.cmd"))), 0o755); err != nil {
|
||||
return result, err
|
||||
}
|
||||
logPath := filepath.Join(cfg.RuntimeConfig.StateDir, "rap-node-agent.log")
|
||||
started, fallback, mode, err := m.installStartupTask(ctx, result.TaskName, result.WrapperPath, logPath, cfg.StartupMode)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.Started = started
|
||||
result.AdminFallback = fallback
|
||||
result.StartupMode = mode
|
||||
result, err = installWindowsHostAgentUpdater(ctx, m, result, cfg)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (m WindowsManager) stopExistingNodeAgent(ctx context.Context, taskName, nodeAgentPath string) {
|
||||
runner := m.Runner
|
||||
if runner == nil {
|
||||
runner = ExecRunner{}
|
||||
}
|
||||
_, _ = runner.Run(ctx, "schtasks", "/End", "/TN", taskName)
|
||||
escapedPath := strings.ReplaceAll(nodeAgentPath, `'`, `''`)
|
||||
_, _ = runner.Run(ctx, "powershell", "-NoProfile", "-ExecutionPolicy", "Bypass", "-Command",
|
||||
`Get-Process rap-node-agent -ErrorAction SilentlyContinue | Where-Object { $_.Path -eq '`+escapedPath+`' } | Stop-Process -Force -ErrorAction SilentlyContinue`)
|
||||
}
|
||||
|
||||
func (m WindowsManager) installStartupTask(ctx context.Context, taskName, wrapperPath, logPath, mode string) (bool, bool, string, error) {
|
||||
if mode == "none" {
|
||||
return false, false, mode, nil
|
||||
}
|
||||
runner := m.Runner
|
||||
if runner == nil {
|
||||
runner = ExecRunner{}
|
||||
}
|
||||
if mode == "auto" || mode == "system-task" {
|
||||
_, err := runner.Run(ctx, "schtasks", "/Create", "/TN", taskName, "/SC", "ONSTART", "/RU", "SYSTEM", "/RL", "HIGHEST", "/TR", windowsTaskAction(wrapperPath, logPath), "/F")
|
||||
if err == nil {
|
||||
_, _ = runner.Run(ctx, "schtasks", "/Run", "/TN", taskName)
|
||||
return true, false, "system-task", nil
|
||||
}
|
||||
if mode == "system-task" {
|
||||
return false, false, mode, err
|
||||
}
|
||||
}
|
||||
_, err := runner.Run(ctx, "schtasks", "/Create", "/TN", taskName, "/SC", "ONLOGON", "/TR", windowsTaskAction(wrapperPath, logPath), "/F")
|
||||
if err != nil {
|
||||
return false, mode == "auto", "user-task", err
|
||||
}
|
||||
_, _ = runner.Run(ctx, "schtasks", "/Run", "/TN", taskName)
|
||||
return true, mode == "auto", "user-task", nil
|
||||
}
|
||||
|
||||
// windowsTaskAction builds the command line stored in a scheduled task: it
// runs wrapperPath through cmd.exe and appends stdout and stderr to logPath.
// The whole command is wrapped in an extra pair of quotes after "/c".
func windowsTaskAction(wrapperPath, logPath string) string {
	const quote = `"`
	command := quote + wrapperPath + quote + " >> " + quote + logPath + quote + " 2>&1"
	return "cmd.exe /c " + quote + command + quote
}
|
||||
|
||||
func windowsEnvScript(cfg RuntimeConfig) string {
|
||||
lines := []string{"@echo off"}
|
||||
for _, env := range NodeAgentEnv(cfg) {
|
||||
key, value, ok := strings.Cut(env, "=")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
lines = append(lines, "set "+key+"="+value)
|
||||
}
|
||||
return strings.Join(lines, "\r\n") + "\r\n"
|
||||
}
|
||||
|
||||
// windowsWrapperScript returns a CRLF-terminated batch script that calls the
// environment script at envPath and then runs the executable at
// nodeAgentPath; both paths are wrapped in double quotes.
func windowsWrapperScript(nodeAgentPath, envPath string) string {
	const eol = "\r\n"

	var script strings.Builder
	script.WriteString("@echo off" + eol)
	script.WriteString(`call "` + envPath + `"` + eol)
	script.WriteString(`"` + nodeAgentPath + `"` + eol)
	return script.String()
}
|
||||
|
||||
func binaryArtifactURLs(profile WindowsInstallProfile) []string {
|
||||
if profile.NodeAgentArtifact != nil && len(profile.NodeAgentArtifact.URLs) > 0 {
|
||||
return append([]string(nil), profile.NodeAgentArtifact.URLs...)
|
||||
}
|
||||
if profile.NodeAgentArtifact == nil || strings.TrimSpace(profile.NodeAgentArtifact.FileName) == "" {
|
||||
return nil
|
||||
}
|
||||
out := []string{}
|
||||
fileName := strings.TrimLeft(strings.TrimSpace(profile.NodeAgentArtifact.FileName), "/")
|
||||
for _, endpoint := range profile.ArtifactEndpoints {
|
||||
if trimmed := strings.TrimRight(strings.TrimSpace(endpoint), "/"); trimmed != "" {
|
||||
out = append(out, trimmed+"/"+fileName)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func binaryArtifactSHA256(profile WindowsInstallProfile) string {
|
||||
if profile.NodeAgentArtifact == nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(profile.NodeAgentArtifact.SHA256)
|
||||
}
|
||||
|
||||
func binaryArtifactSizeBytes(profile WindowsInstallProfile) int64 {
|
||||
if profile.NodeAgentArtifact == nil {
|
||||
return 0
|
||||
}
|
||||
return profile.NodeAgentArtifact.SizeBytes
|
||||
}
|
||||
|
||||
// fileExists reports whether os.Stat succeeds for path. Any Stat error,
// not only "does not exist", yields false.
func fileExists(path string) bool {
	if _, err := os.Stat(path); err != nil {
		return false
	}
	return true
}
|
||||
|
||||
func copyFile(source, target string, mode os.FileMode) error {
|
||||
src, err := os.Open(source)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer src.Close()
|
||||
if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
tmp := target + ".tmp"
|
||||
dst, err := os.OpenFile(tmp, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, mode)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := io.Copy(dst, src); err != nil {
|
||||
_ = dst.Close()
|
||||
_ = os.Remove(tmp)
|
||||
return err
|
||||
}
|
||||
if err := dst.Close(); err != nil {
|
||||
_ = os.Remove(tmp)
|
||||
return err
|
||||
}
|
||||
if err := replaceFile(tmp, target); err != nil {
|
||||
_ = os.Remove(tmp)
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func replaceFile(tmp, target string) error {
|
||||
if runtime.GOOS != "windows" {
|
||||
return os.Rename(tmp, target)
|
||||
}
|
||||
backup := target + ".bak"
|
||||
_ = os.Remove(backup)
|
||||
if fileExists(target) {
|
||||
if err := os.Rename(target, backup); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if err := os.Rename(tmp, target); err != nil {
|
||||
if fileExists(backup) {
|
||||
_ = os.Rename(backup, target)
|
||||
}
|
||||
return err
|
||||
}
|
||||
_ = os.Remove(backup)
|
||||
return nil
|
||||
}
|
||||
|
||||
func defaultWindowsInstallDir(nodeName string, userMode bool) string {
|
||||
slug := safeUnitSlug(nodeName)
|
||||
if userMode {
|
||||
if base := strings.TrimSpace(os.Getenv("LOCALAPPDATA")); base != "" {
|
||||
return filepath.Join(base, "RAP", slug)
|
||||
}
|
||||
if base := strings.TrimSpace(os.Getenv("USERPROFILE")); base != "" {
|
||||
return filepath.Join(base, "AppData", "Local", "RAP", slug)
|
||||
}
|
||||
}
|
||||
return filepath.Join(DefaultWindowsInstallDir, slug)
|
||||
}
|
||||
|
||||
func defaultWindowsStateDir(nodeName string, userMode bool) string {
|
||||
slug := safeUnitSlug(nodeName)
|
||||
if userMode {
|
||||
if base := strings.TrimSpace(os.Getenv("LOCALAPPDATA")); base != "" {
|
||||
return filepath.Join(base, "RAP", "nodes", slug)
|
||||
}
|
||||
if base := strings.TrimSpace(os.Getenv("USERPROFILE")); base != "" {
|
||||
return filepath.Join(base, "AppData", "Local", "RAP", "nodes", slug)
|
||||
}
|
||||
}
|
||||
return filepath.Join(DefaultWindowsStateRoot, slug)
|
||||
}
|
||||
|
||||
// isAccessDenied reports whether err represents a permission failure.
//
// The error chain is first checked against os.ErrPermission, which covers
// wrapped errors and OS error codes regardless of the language of the
// message. The message-text match is kept as a fallback for errors that only
// carry text, such as output captured from external commands.
func isAccessDenied(err error) bool {
	if err == nil {
		return false
	}
	if errors.Is(err, os.ErrPermission) {
		return true
	}
	value := strings.ToLower(err.Error())
	return strings.Contains(value, "access is denied") ||
		strings.Contains(value, "permission denied") ||
		strings.Contains(value, "operation not permitted")
}
|
||||
@@ -0,0 +1,337 @@
|
||||
package hostagent
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (UpdateResult, error) {
|
||||
if strings.TrimSpace(req.InstallType) == "" || req.InstallType == DefaultUpdateInstallType {
|
||||
req.InstallType = WindowsUpdateInstallType
|
||||
}
|
||||
req.OS = firstNonEmpty(req.OS, "windows")
|
||||
req.Arch = firstNonEmpty(req.Arch, "amd64")
|
||||
req = req.Normalize()
|
||||
var err error
|
||||
req, err = resolveUpdateRequest(req)
|
||||
if err != nil {
|
||||
return UpdateResult{}, err
|
||||
}
|
||||
runner := m.Runner
|
||||
if runner == nil {
|
||||
runner = ExecRunner{}
|
||||
}
|
||||
plan, err := FetchNodeUpdatePlan(ctx, req)
|
||||
if err != nil {
|
||||
return UpdateResult{}, err
|
||||
}
|
||||
if plan.HealthWindowSec > 0 && req.HealthTimeout == 30*time.Second {
|
||||
req.HealthTimeout = time.Duration(plan.HealthWindowSec) * time.Second
|
||||
}
|
||||
result := UpdateResult{
|
||||
Action: plan.Action,
|
||||
Reason: plan.Reason,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
ContainerName: req.WindowsTaskName,
|
||||
NewImage: req.BinaryPath,
|
||||
}
|
||||
if plan.Action != "update" {
|
||||
if !req.DryRun {
|
||||
status := statusFromNoopPlan(req, plan)
|
||||
if status.Payload == nil {
|
||||
status.Payload = map[string]any{}
|
||||
}
|
||||
status.Payload["task"] = req.WindowsTaskName
|
||||
status.Payload["binary_path"] = req.BinaryPath
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, status)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
if plan.ProductionForwarding && !req.AllowProductionMesh {
|
||||
err := errors.New("refusing update plan with production forwarding enabled")
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if plan.Artifact == nil {
|
||||
err := errors.New("update plan has no artifact")
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if plan.Artifact.InstallType != "" && plan.Artifact.InstallType != WindowsUpdateInstallType {
|
||||
err := fmt.Errorf("unsupported update artifact install type %q", plan.Artifact.InstallType)
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "preflight", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
if req.DryRun {
|
||||
return result, nil
|
||||
}
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: "planned",
|
||||
Status: "accepted",
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ObservedAt: time.Now().UTC(),
|
||||
Payload: map[string]any{"strategy": plan.Strategy, "reason": plan.Reason, "task": req.WindowsTaskName},
|
||||
})
|
||||
urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL)
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: "download",
|
||||
Status: "started",
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ObservedAt: time.Now().UTC(),
|
||||
Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": req.BinaryPath},
|
||||
})
|
||||
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
|
||||
if err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "download", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
defer os.Remove(path)
|
||||
m.stopExistingNodeAgent(ctx, req.WindowsTaskName, req.BinaryPath)
|
||||
if err := copyFile(path, req.BinaryPath, 0o755); err != nil {
|
||||
m.stopExistingNodeAgent(ctx, req.WindowsTaskName, req.BinaryPath)
|
||||
if retryErr := copyFile(path, req.BinaryPath, 0o755); retryErr != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "apply", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
}
|
||||
result.Replaced = true
|
||||
if _, err := runner.Run(ctx, "schtasks", "/Run", "/TN", req.WindowsTaskName); err != nil {
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, statusFromError(req, plan, "restart", "failed", err))
|
||||
return result, err
|
||||
}
|
||||
_ = ReportNodeUpdateStatus(ctx, req.BackendURL, req.ClusterID, req.NodeID, NodeUpdateStatusRequest{
|
||||
Product: req.Product,
|
||||
CurrentVersion: req.CurrentVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Phase: "health_check",
|
||||
Status: "succeeded",
|
||||
AttemptID: updateAttemptID(plan),
|
||||
ObservedAt: time.Now().UTC(),
|
||||
Payload: map[string]any{"task": req.WindowsTaskName, "binary_path": req.BinaryPath},
|
||||
})
|
||||
_ = saveUpdateState(req.StateDir, UpdateState{
|
||||
Product: req.Product,
|
||||
CurrentVersion: plan.TargetVersion,
|
||||
TargetVersion: plan.TargetVersion,
|
||||
Image: req.BinaryPath,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
})
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (m WindowsManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig) error {
|
||||
req := cfg.Request
|
||||
if strings.TrimSpace(req.InstallType) == "" || req.InstallType == DefaultUpdateInstallType {
|
||||
req.InstallType = WindowsUpdateInstallType
|
||||
}
|
||||
req.OS = firstNonEmpty(req.OS, "windows")
|
||||
req.Arch = firstNonEmpty(req.Arch, "amd64")
|
||||
req = req.Normalize()
|
||||
if err := req.Validate(); err != nil {
|
||||
return err
|
||||
}
|
||||
if cfg.Interval == 0 {
|
||||
cfg.Interval = time.Hour
|
||||
}
|
||||
if cfg.Interval < 0 {
|
||||
return errors.New("update loop interval must not be negative")
|
||||
}
|
||||
if cfg.InitialDelay < 0 {
|
||||
return errors.New("update loop initial delay must not be negative")
|
||||
}
|
||||
if cfg.Jitter < 0 || cfg.Jitter > 1 {
|
||||
return errors.New("update loop jitter must be between 0 and 1")
|
||||
}
|
||||
logf := cfg.Logf
|
||||
if logf == nil {
|
||||
logf = func(string, ...any) {}
|
||||
}
|
||||
if cfg.InitialDelay > 0 {
|
||||
if err := sleepContext(ctx, jitteredDuration(cfg.InitialDelay, cfg.Jitter)); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
runs := 0
|
||||
lastTriggerGeneration := currentUpdateTriggerGeneration(req.StateDir)
|
||||
for {
|
||||
runs++
|
||||
result, err := m.ApplyUpdate(ctx, req)
|
||||
if err != nil {
|
||||
if errors.Is(err, ErrNodeIdentityNotReady) {
|
||||
logf("windows_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, req.StateDir)
|
||||
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
|
||||
return nil
|
||||
}
|
||||
if err := sleepContext(ctx, jitteredDuration(cfg.Interval, cfg.Jitter)); err != nil {
|
||||
return err
|
||||
}
|
||||
continue
|
||||
}
|
||||
logf("windows_update_loop run=%d status=failed error=%v", runs, err)
|
||||
if cfg.StopOnError {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
logf("windows_update_loop run=%d action=%s reason=%s target=%s task=%s replaced=%t",
|
||||
runs,
|
||||
result.Action,
|
||||
result.Reason,
|
||||
result.TargetVersion,
|
||||
result.ContainerName,
|
||||
result.Replaced,
|
||||
)
|
||||
if result.Action == "update" && result.TargetVersion != "" && !result.RolledBack {
|
||||
req.CurrentVersion = result.TargetVersion
|
||||
}
|
||||
}
|
||||
if cfg.HostAgentUpdateEnabled {
|
||||
hostReq := cfg.HostAgentUpdateRequest
|
||||
hostReq.BackendURL = firstNonEmpty(hostReq.BackendURL, req.BackendURL)
|
||||
hostReq.ClusterID = firstNonEmpty(hostReq.ClusterID, req.ClusterID)
|
||||
hostReq.NodeID = firstNonEmpty(hostReq.NodeID, req.NodeID)
|
||||
hostReq.StateDir = firstNonEmpty(hostReq.StateDir, req.StateDir)
|
||||
hostReq.Channel = firstNonEmpty(hostReq.Channel, req.Channel)
|
||||
hostReq.OS = firstNonEmpty(hostReq.OS, "windows")
|
||||
hostReq.Arch = firstNonEmpty(hostReq.Arch, "amd64")
|
||||
hostReq.InstallType = firstNonEmpty(hostReq.InstallType, "windows_binary")
|
||||
hostResult, hostErr := (DockerManager{}).ApplyHostAgentUpdate(ctx, hostReq)
|
||||
if hostErr != nil {
|
||||
if errors.Is(hostErr, ErrNodeIdentityNotReady) {
|
||||
logf("windows_host_agent_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, hostReq.StateDir)
|
||||
} else {
|
||||
logf("windows_host_agent_update_loop run=%d status=failed error=%v", runs, hostErr)
|
||||
if cfg.StopOnError {
|
||||
return hostErr
|
||||
}
|
||||
}
|
||||
} else {
|
||||
logf("windows_host_agent_update_loop run=%d action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t",
|
||||
runs,
|
||||
hostResult.Action,
|
||||
hostResult.Reason,
|
||||
hostResult.TargetVersion,
|
||||
hostResult.NewImage,
|
||||
hostResult.Replaced,
|
||||
hostResult.RestartNeeded,
|
||||
)
|
||||
if hostResult.Action == "update" && hostResult.TargetVersion != "" && !hostResult.RolledBack {
|
||||
cfg.HostAgentUpdateRequest.CurrentVersion = hostResult.TargetVersion
|
||||
}
|
||||
}
|
||||
}
|
||||
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
|
||||
return nil
|
||||
}
|
||||
if err := sleepUntilUpdateIntervalOrTrigger(ctx, req.StateDir, jitteredDuration(cfg.Interval, cfg.Jitter), &lastTriggerGeneration); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func installWindowsHostAgentUpdater(ctx context.Context, m WindowsManager, result WindowsInstallResult, cfg WindowsInstallConfig) (WindowsInstallResult, error) {
|
||||
if !cfg.AutoUpdateEnabled || strings.EqualFold(result.StartupMode, "none") {
|
||||
return result, nil
|
||||
}
|
||||
if cfg.AutoUpdateCurrentVersion == "" || (cfg.Replace && !result.Downloaded) {
|
||||
cfg.AutoUpdateCurrentVersion = "0.0.0"
|
||||
}
|
||||
hostAgentPath := filepath.Join(result.InstallDir, "rap-host-agent.exe")
|
||||
if err := installHostAgentBinary(cfg.HostAgentSourcePath, hostAgentPath); err != nil {
|
||||
return result, err
|
||||
}
|
||||
wrapperPath := filepath.Join(result.InstallDir, "rap-host-agent-update.cmd")
|
||||
logPath := filepath.Join(result.StateDir, "rap-host-agent-update.log")
|
||||
taskName := "RAP Host Agent Updater " + safeUnitSlug(result.NodeName)
|
||||
script := windowsHostAgentUpdateScript(hostAgentPath, cfg, result)
|
||||
if err := os.WriteFile(wrapperPath, []byte(script), 0o755); err != nil {
|
||||
return result, err
|
||||
}
|
||||
started, fallback, mode, err := m.installStartupTask(ctx, taskName, wrapperPath, logPath, cfg.StartupMode)
|
||||
if err != nil {
|
||||
return result, err
|
||||
}
|
||||
result.HostAgentPath = hostAgentPath
|
||||
result.UpdaterTaskName = taskName
|
||||
result.UpdaterStarted = started
|
||||
if fallback {
|
||||
result.AdminFallback = true
|
||||
}
|
||||
if mode != "" && mode != result.StartupMode {
|
||||
result.StartupMode = mode
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func windowsHostAgentUpdateScript(hostAgentPath string, cfg WindowsInstallConfig, result WindowsInstallResult) string {
|
||||
currentVersion := firstNonEmpty(cfg.AutoUpdateCurrentVersion, "0.0.0")
|
||||
interval := cfg.AutoUpdateIntervalSeconds
|
||||
if interval == 0 {
|
||||
interval = 21600
|
||||
}
|
||||
initialDelay := cfg.AutoUpdateInitialDelaySeconds
|
||||
if initialDelay == 0 {
|
||||
initialDelay = 15
|
||||
}
|
||||
healthTimeout := cfg.AutoUpdateHealthTimeoutSeconds
|
||||
if healthTimeout == 0 {
|
||||
healthTimeout = 30
|
||||
}
|
||||
updateLoopArgs := []string{
|
||||
`"` + hostAgentPath + `"`,
|
||||
"update-loop",
|
||||
"--backend-url", `"` + cfg.RuntimeConfig.BackendURL + `"`,
|
||||
"--cluster-id", `"` + cfg.RuntimeConfig.ClusterID + `"`,
|
||||
"--state-dir", `"` + result.StateDir + `"`,
|
||||
"--current-version", currentVersion,
|
||||
"--os", "windows",
|
||||
"--arch", "amd64",
|
||||
"--install-type", WindowsUpdateInstallType,
|
||||
"--binary-path", `"` + result.NodeAgentPath + `"`,
|
||||
"--windows-task-name", `"` + result.TaskName + `"`,
|
||||
"--health-timeout-seconds", fmt.Sprintf("%d", healthTimeout),
|
||||
"--interval-seconds", fmt.Sprintf("%d", interval),
|
||||
"--initial-delay-seconds", "0",
|
||||
"--host-agent-update-status-enabled",
|
||||
"--host-agent-current-version", currentVersion,
|
||||
"--host-agent-binary-path", `"` + hostAgentPath + `"`,
|
||||
}
|
||||
if strings.TrimSpace(cfg.NodeID) != "" {
|
||||
updateLoopArgs = append(updateLoopArgs, "--node-id", `"`+strings.TrimSpace(cfg.NodeID)+`"`)
|
||||
}
|
||||
if strings.TrimSpace(cfg.AutoUpdateChannel) != "" {
|
||||
updateLoopArgs = append(updateLoopArgs, "--channel", strings.TrimSpace(cfg.AutoUpdateChannel))
|
||||
}
|
||||
lines := []string{
|
||||
"@echo off",
|
||||
"setlocal",
|
||||
"set RAP_HOST_AGENT=" + `"` + hostAgentPath + `"`,
|
||||
"set RAP_HOST_AGENT_NEXT=" + `"` + hostAgentPath + `.next"`,
|
||||
}
|
||||
if initialDelay > 0 {
|
||||
lines = append(lines, "timeout /t "+fmt.Sprintf("%d", initialDelay)+" /nobreak >NUL")
|
||||
}
|
||||
lines = append(lines, []string{
|
||||
":loop",
|
||||
"if exist %RAP_HOST_AGENT_NEXT% (",
|
||||
" copy /Y %RAP_HOST_AGENT_NEXT% %RAP_HOST_AGENT% >NUL",
|
||||
" if %ERRORLEVEL% EQU 0 del /F /Q %RAP_HOST_AGENT_NEXT%",
|
||||
")",
|
||||
strings.Join(updateLoopArgs, " "),
|
||||
"timeout /t " + fmt.Sprintf("%d", interval) + " /nobreak >NUL",
|
||||
"goto loop",
|
||||
"endlocal",
|
||||
"rem initial-delay-seconds " + fmt.Sprintf("%d", initialDelay),
|
||||
}...)
|
||||
return strings.Join(lines, "\r\n") + "\r\n"
|
||||
}
|
||||
@@ -63,10 +63,12 @@ const (
|
||||
ProductionChannelVPNPacket = "vpn_packet"
|
||||
ProductionMessageVPNPacketBatch = "vpn.packet_batch"
|
||||
FabricServiceClassVPNPackets = "vpn_packets"
|
||||
FabricServiceClassRemoteWorkspace = "remote_workspace"
|
||||
FabricServiceChannelBulk = "bulk"
|
||||
FabricServiceChannelControl = "control"
|
||||
FabricServiceChannelInteractive = "interactive"
|
||||
FabricServiceChannelReliable = "reliable"
|
||||
FabricServiceChannelDroppable = "droppable"
|
||||
MaxProductionEnvelopePayloadBytes = 4096
|
||||
MaxProductionVPNPacketPayloadBytes = 256 * 1024
|
||||
MaxProductionEnvelopeFutureSkew = time.Minute
|
||||
|
||||
@@ -59,9 +59,9 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
|
||||
reasons := []string{"base"}
|
||||
|
||||
switch candidate.Transport {
|
||||
case "direct_tcp_tls":
|
||||
case "direct_tcp_tls", "direct_http", "direct_https":
|
||||
score += 35
|
||||
reasons = append(reasons, "transport:direct_tcp_tls")
|
||||
reasons = append(reasons, "transport:direct")
|
||||
case "wss":
|
||||
score += 25
|
||||
reasons = append(reasons, "transport:wss")
|
||||
|
||||
@@ -37,27 +37,28 @@ type PeerCacheSnapshot struct {
|
||||
}
|
||||
|
||||
type PeerCacheEntry struct {
|
||||
NodeID string `json:"node_id"`
|
||||
RouteIDs []string `json:"route_ids,omitempty"`
|
||||
Endpoint string `json:"endpoint,omitempty"`
|
||||
EndpointCount int `json:"endpoint_count"`
|
||||
CandidateCount int `json:"candidate_count"`
|
||||
ConnectivityModes []string `json:"connectivity_modes,omitempty"`
|
||||
RecoverySeed bool `json:"recovery_seed"`
|
||||
Warm bool `json:"warm"`
|
||||
WarmReason string `json:"warm_reason,omitempty"`
|
||||
BestCandidateID string `json:"best_candidate_id,omitempty"`
|
||||
BestCandidateAddr string `json:"best_candidate_addr,omitempty"`
|
||||
BestTransport string `json:"best_transport,omitempty"`
|
||||
BestReachability string `json:"best_reachability,omitempty"`
|
||||
BestConnectivity string `json:"best_connectivity,omitempty"`
|
||||
BestNATType string `json:"best_nat_type,omitempty"`
|
||||
BestPolicyTags []string `json:"best_policy_tags,omitempty"`
|
||||
BestCandidateScore int `json:"best_candidate_score,omitempty"`
|
||||
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
|
||||
RelayNodeID string `json:"relay_node_id,omitempty"`
|
||||
RelayEndpoint string `json:"relay_endpoint,omitempty"`
|
||||
RelayControl bool `json:"relay_control"`
|
||||
NodeID string `json:"node_id"`
|
||||
RouteIDs []string `json:"route_ids,omitempty"`
|
||||
Endpoint string `json:"endpoint,omitempty"`
|
||||
EndpointCount int `json:"endpoint_count"`
|
||||
CandidateCount int `json:"candidate_count"`
|
||||
ConnectivityModes []string `json:"connectivity_modes,omitempty"`
|
||||
RecoverySeed bool `json:"recovery_seed"`
|
||||
Warm bool `json:"warm"`
|
||||
WarmReason string `json:"warm_reason,omitempty"`
|
||||
BestCandidateID string `json:"best_candidate_id,omitempty"`
|
||||
BestCandidateAddr string `json:"best_candidate_addr,omitempty"`
|
||||
BestTransport string `json:"best_transport,omitempty"`
|
||||
BestReachability string `json:"best_reachability,omitempty"`
|
||||
BestConnectivity string `json:"best_connectivity,omitempty"`
|
||||
BestNATType string `json:"best_nat_type,omitempty"`
|
||||
BestPolicyTags []string `json:"best_policy_tags,omitempty"`
|
||||
BestCandidateScore int `json:"best_candidate_score,omitempty"`
|
||||
EndpointCandidates []PeerEndpointCandidate `json:"endpoint_candidates,omitempty"`
|
||||
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
|
||||
RelayNodeID string `json:"relay_node_id,omitempty"`
|
||||
RelayEndpoint string `json:"relay_endpoint,omitempty"`
|
||||
RelayControl bool `json:"relay_control"`
|
||||
}
|
||||
|
||||
type peerCacheBuildEntry struct {
|
||||
@@ -117,6 +118,10 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
|
||||
MaxVerificationAge: time.Hour,
|
||||
})
|
||||
if len(scored) > 0 {
|
||||
entry.EndpointCandidates = make([]PeerEndpointCandidate, 0, len(scored))
|
||||
for _, scoredCandidate := range scored {
|
||||
entry.EndpointCandidates = append(entry.EndpointCandidates, scoredCandidate.Candidate)
|
||||
}
|
||||
entry.BestCandidateID = scored[0].Candidate.EndpointID
|
||||
entry.BestCandidateAddr = scored[0].Candidate.Address
|
||||
entry.BestTransport = scored[0].Candidate.Transport
|
||||
|
||||
@@ -66,24 +66,44 @@ type PeerConnectionManagerSnapshot struct {
|
||||
}
|
||||
|
||||
type PeerConnectionProbeResult struct {
|
||||
NodeID string `json:"node_id"`
|
||||
LinkStatus string `json:"link_status"`
|
||||
Action string `json:"action"`
|
||||
Reason string `json:"reason"`
|
||||
Endpoint string `json:"endpoint,omitempty"`
|
||||
ConnectionState PeerConnectionState `json:"connection_state"`
|
||||
TransportMode string `json:"transport_mode"`
|
||||
RequiresRendezvous bool `json:"requires_rendezvous"`
|
||||
RendezvousResolved bool `json:"rendezvous_resolved"`
|
||||
DirectCandidate bool `json:"direct_candidate"`
|
||||
RelayCandidate bool `json:"relay_candidate"`
|
||||
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
|
||||
RelayNodeID string `json:"relay_node_id,omitempty"`
|
||||
RelayEndpoint string `json:"relay_endpoint,omitempty"`
|
||||
LatencyMs int `json:"latency_ms,omitempty"`
|
||||
FailureReason string `json:"failure_reason,omitempty"`
|
||||
StartedAt time.Time `json:"started_at"`
|
||||
CompletedAt time.Time `json:"completed_at"`
|
||||
NodeID string `json:"node_id"`
|
||||
LinkStatus string `json:"link_status"`
|
||||
Action string `json:"action"`
|
||||
Reason string `json:"reason"`
|
||||
Endpoint string `json:"endpoint,omitempty"`
|
||||
SelectedCandidateID string `json:"selected_candidate_id,omitempty"`
|
||||
SelectedEndpoint string `json:"selected_endpoint,omitempty"`
|
||||
ConnectionState PeerConnectionState `json:"connection_state"`
|
||||
TransportMode string `json:"transport_mode"`
|
||||
RequiresRendezvous bool `json:"requires_rendezvous"`
|
||||
RendezvousResolved bool `json:"rendezvous_resolved"`
|
||||
DirectCandidate bool `json:"direct_candidate"`
|
||||
RelayCandidate bool `json:"relay_candidate"`
|
||||
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
|
||||
RelayNodeID string `json:"relay_node_id,omitempty"`
|
||||
RelayEndpoint string `json:"relay_endpoint,omitempty"`
|
||||
LatencyMs int `json:"latency_ms,omitempty"`
|
||||
FailureReason string `json:"failure_reason,omitempty"`
|
||||
CandidateResults []PeerConnectionCandidateProbeResult `json:"candidate_results,omitempty"`
|
||||
StartedAt time.Time `json:"started_at"`
|
||||
CompletedAt time.Time `json:"completed_at"`
|
||||
}
|
||||
|
||||
// PeerConnectionCandidateProbeResult describes one attempt against a single
// endpoint candidate made while probing a peer.
type PeerConnectionCandidateProbeResult struct {
	// CandidateID identifies the candidate that was tried; it is left out of
	// the JSON encoding when empty.
	CandidateID string `json:"candidate_id,omitempty"`
	// Endpoint is the address the attempt was sent to.
	Endpoint string `json:"endpoint"`
	// Transport names the candidate's transport; omitted when empty.
	Transport string `json:"transport,omitempty"`
	// LinkStatus is the outcome of this attempt.
	LinkStatus string `json:"link_status"`
	// LatencyMs is the attempt's latency; omitted from JSON when zero.
	LatencyMs int `json:"latency_ms,omitempty"`
	// FailureReason carries the error text of a failed attempt; omitted when
	// empty.
	FailureReason string `json:"failure_reason,omitempty"`
	// StartedAt and CompletedAt bracket the attempt.
	StartedAt   time.Time `json:"started_at"`
	CompletedAt time.Time `json:"completed_at"`
}
|
||||
|
||||
// peerConnectionProbeTarget is a single endpoint to try during a peer probe:
// the candidate's ID, the address to contact and the transport it uses.
type peerConnectionProbeTarget struct {
	CandidateID, Endpoint, Transport string
}
|
||||
|
||||
func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionManager {
|
||||
@@ -137,6 +157,10 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
|
||||
RendezvousLeases: rendezvousLeases,
|
||||
Now: startedAt,
|
||||
})
|
||||
entriesByNode := map[string]PeerCacheEntry{}
|
||||
for _, entry := range peerSnapshot.Entries {
|
||||
entriesByNode[entry.NodeID] = entry
|
||||
}
|
||||
cycle := PeerConnectionManagerCycle{
|
||||
Mode: recoveryPlan.Mode,
|
||||
StartedAt: startedAt,
|
||||
@@ -150,7 +174,7 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
|
||||
Results: make([]PeerConnectionProbeResult, 0, len(intentPlan.Intents)),
|
||||
}
|
||||
for _, intent := range intentPlan.Intents {
|
||||
result := m.probeIntent(ctx, intent)
|
||||
result := m.probeIntent(ctx, intent, entriesByNode[intent.NodeID])
|
||||
cycle.Results = append(cycle.Results, result)
|
||||
switch result.LinkStatus {
|
||||
case PeerConnectionProbeReachable:
|
||||
@@ -200,7 +224,7 @@ func (m *PeerConnectionManager) peerConfigSnapshot() (*PeerCache, []PeerRendezvo
|
||||
return m.peerCache, append([]PeerRendezvousLease{}, m.rendezvousLeases...)
|
||||
}
|
||||
|
||||
func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConnectionIntent) PeerConnectionProbeResult {
|
||||
func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConnectionIntent, cacheEntry PeerCacheEntry) PeerConnectionProbeResult {
|
||||
startedAt := normalizedNow(m.now())
|
||||
result := PeerConnectionProbeResult{
|
||||
NodeID: intent.NodeID,
|
||||
@@ -254,9 +278,6 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
|
||||
result.CompletedAt = normalizedNow(m.now())
|
||||
return result
|
||||
}
|
||||
m.tracker.BeginProbe(peer, startedAt)
|
||||
probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout)
|
||||
defer cancel()
|
||||
target := PeerIdentity{
|
||||
ClusterID: m.local.ClusterID,
|
||||
NodeID: intent.NodeID,
|
||||
@@ -264,30 +285,118 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
|
||||
if intent.RelayCandidate && intent.RelayNodeID != "" {
|
||||
target.NodeID = intent.RelayNodeID
|
||||
}
|
||||
_, err := NewClient(strings.TrimRight(intent.Endpoint, "/")).withHTTPClient(m.httpClient).SendHealth(probeCtx, NewHealthMessage(m.local, target))
|
||||
completedAt := normalizedNow(m.now())
|
||||
if err != nil {
|
||||
result.LinkStatus = PeerConnectionProbeUnreachable
|
||||
result.FailureReason = err.Error()
|
||||
result.ConnectionState = m.tracker.RecordFailure(intent.NodeID, err.Error(), completedAt)
|
||||
targets := []peerConnectionProbeTarget{{
|
||||
CandidateID: intent.BestCandidateID,
|
||||
Endpoint: intent.Endpoint,
|
||||
Transport: intent.Transport,
|
||||
}}
|
||||
if intent.DirectCandidate {
|
||||
targets = peerConnectionProbeTargets(intent, cacheEntry)
|
||||
}
|
||||
var lastFailure string
|
||||
for _, probeTarget := range targets {
|
||||
probePeer := peer
|
||||
probePeer.Endpoint = strings.TrimRight(strings.TrimSpace(probeTarget.Endpoint), "/")
|
||||
probePeer.BestCandidateID = strings.TrimSpace(probeTarget.CandidateID)
|
||||
probePeer.BestCandidateAddr = probePeer.Endpoint
|
||||
probePeer.BestTransport = strings.TrimSpace(probeTarget.Transport)
|
||||
if probePeer.Endpoint == "" {
|
||||
continue
|
||||
}
|
||||
candidateStartedAt := normalizedNow(m.now())
|
||||
m.tracker.BeginProbe(probePeer, candidateStartedAt)
|
||||
probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout)
|
||||
_, err := NewClient(probePeer.Endpoint).withHTTPClient(m.httpClient).SendHealth(probeCtx, NewHealthMessage(m.local, target))
|
||||
cancel()
|
||||
completedAt := normalizedNow(m.now())
|
||||
candidateResult := PeerConnectionCandidateProbeResult{
|
||||
CandidateID: probePeer.BestCandidateID,
|
||||
Endpoint: probePeer.Endpoint,
|
||||
Transport: probePeer.BestTransport,
|
||||
StartedAt: candidateStartedAt,
|
||||
CompletedAt: completedAt,
|
||||
}
|
||||
if err != nil {
|
||||
lastFailure = err.Error()
|
||||
candidateResult.LinkStatus = PeerConnectionProbeUnreachable
|
||||
candidateResult.FailureReason = lastFailure
|
||||
result.CandidateResults = append(result.CandidateResults, candidateResult)
|
||||
continue
|
||||
}
|
||||
latency := int(completedAt.Sub(candidateStartedAt).Milliseconds())
|
||||
if latency < 0 {
|
||||
latency = 0
|
||||
}
|
||||
candidateResult.LinkStatus = PeerConnectionProbeReachable
|
||||
candidateResult.LatencyMs = latency
|
||||
result.CandidateResults = append(result.CandidateResults, candidateResult)
|
||||
result.LinkStatus = PeerConnectionProbeReachable
|
||||
result.Endpoint = probePeer.Endpoint
|
||||
result.SelectedCandidateID = probePeer.BestCandidateID
|
||||
result.SelectedEndpoint = probePeer.Endpoint
|
||||
result.LatencyMs = latency
|
||||
if intent.RelayCandidate {
|
||||
result.ConnectionState = m.tracker.RecordRelayReady(probePeer, latency, completedAt)
|
||||
} else {
|
||||
result.ConnectionState = m.tracker.RecordSuccessForPeer(probePeer, latency, completedAt)
|
||||
}
|
||||
result.CompletedAt = completedAt
|
||||
return result
|
||||
}
|
||||
latency := int(completedAt.Sub(startedAt).Milliseconds())
|
||||
if latency < 0 {
|
||||
latency = 0
|
||||
}
|
||||
result.LinkStatus = PeerConnectionProbeReachable
|
||||
result.LatencyMs = latency
|
||||
if intent.RelayCandidate {
|
||||
result.ConnectionState = m.tracker.RecordRelayReady(peer, latency, completedAt)
|
||||
} else {
|
||||
result.ConnectionState = m.tracker.RecordSuccess(intent.NodeID, latency, completedAt)
|
||||
completedAt := normalizedNow(m.now())
|
||||
if lastFailure == "" {
|
||||
lastFailure = "no_probe_endpoint_available"
|
||||
}
|
||||
result.LinkStatus = PeerConnectionProbeUnreachable
|
||||
result.FailureReason = lastFailure
|
||||
result.ConnectionState = m.tracker.RecordFailure(intent.NodeID, lastFailure, completedAt)
|
||||
result.CompletedAt = completedAt
|
||||
return result
|
||||
}
|
||||
|
||||
func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCacheEntry) []peerConnectionProbeTarget {
|
||||
seen := map[string]struct{}{}
|
||||
out := make([]peerConnectionProbeTarget, 0, len(cacheEntry.EndpointCandidates)+1)
|
||||
add := func(candidateID, endpoint, transport string) {
|
||||
endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/")
|
||||
if endpoint == "" {
|
||||
return
|
||||
}
|
||||
key := candidateID + "|" + endpoint
|
||||
if _, ok := seen[key]; ok {
|
||||
return
|
||||
}
|
||||
seen[key] = struct{}{}
|
||||
out = append(out, peerConnectionProbeTarget{
|
||||
CandidateID: strings.TrimSpace(candidateID),
|
||||
Endpoint: endpoint,
|
||||
Transport: strings.TrimSpace(transport),
|
||||
})
|
||||
}
|
||||
for _, candidate := range cacheEntry.EndpointCandidates {
|
||||
if !candidateUsableForDirectProbe(candidate) {
|
||||
continue
|
||||
}
|
||||
add(candidate.EndpointID, candidate.Address, candidate.Transport)
|
||||
}
|
||||
add(intent.BestCandidateID, intent.Endpoint, intent.Transport)
|
||||
return out
|
||||
}
|
||||
|
||||
func candidateUsableForDirectProbe(candidate PeerEndpointCandidate) bool {
|
||||
endpoint := strings.TrimSpace(candidate.Address)
|
||||
if endpoint == "" || strings.HasPrefix(endpoint, "relay://") || strings.HasPrefix(endpoint, "outbound://") {
|
||||
return false
|
||||
}
|
||||
connectivity := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
|
||||
reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability))
|
||||
transport := strings.ToLower(strings.TrimSpace(candidate.Transport))
|
||||
if connectivity == "outbound_only" || connectivity == "relay_required" || reachability == "outbound_only" || reachability == "relay" {
|
||||
return false
|
||||
}
|
||||
return transport == "" || strings.Contains(transport, "direct") || transport == "wss" || strings.HasPrefix(endpoint, "http://") || strings.HasPrefix(endpoint, "https://")
|
||||
}
|
||||
|
||||
func (m *PeerConnectionManager) connectionState(nodeID string) PeerConnectionState {
|
||||
snapshot := m.tracker.Snapshot()
|
||||
for _, entry := range snapshot.Entries {
|
||||
|
||||
@@ -188,3 +188,71 @@ func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
|
||||
t.Fatalf("unexpected tracker snapshot: %+v", snapshot)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
|
||||
now := time.Date(2026, 4, 30, 12, 0, 0, 0, time.UTC)
|
||||
current := now
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
|
||||
cache := NewPeerCache(PeerCacheConfig{
|
||||
Local: local,
|
||||
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
|
||||
"node-b": {
|
||||
{
|
||||
EndpointID: "node-b-dead",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_http",
|
||||
Address: "http://127.0.0.1:1",
|
||||
Reachability: "private",
|
||||
ConnectivityMode: "private_lan",
|
||||
Priority: 1,
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-live",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_http",
|
||||
Address: server.URL,
|
||||
Reachability: "private",
|
||||
ConnectivityMode: "private_lan",
|
||||
Priority: 2,
|
||||
},
|
||||
},
|
||||
},
|
||||
WarmPeerLimit: 1,
|
||||
Now: now,
|
||||
})
|
||||
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
|
||||
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
|
||||
Local: local,
|
||||
PeerCache: cache,
|
||||
Tracker: tracker,
|
||||
HTTPClient: &http.Client{Timeout: 100 * time.Millisecond},
|
||||
ProbeTimeout: 100 * time.Millisecond,
|
||||
Now: func() time.Time {
|
||||
current = current.Add(10 * time.Millisecond)
|
||||
return current
|
||||
},
|
||||
})
|
||||
|
||||
cycle := manager.ProbeOnce(context.Background())
|
||||
if cycle.Attempted != 1 || cycle.Succeeded != 1 || cycle.Failed != 0 || len(cycle.Results) != 1 {
|
||||
t.Fatalf("unexpected cycle: %+v", cycle)
|
||||
}
|
||||
result := cycle.Results[0]
|
||||
if result.LinkStatus != PeerConnectionProbeReachable || result.SelectedCandidateID != "node-b-live" || result.SelectedEndpoint != server.URL {
|
||||
t.Fatalf("fallback did not select live candidate: %+v", result)
|
||||
}
|
||||
if len(result.CandidateResults) != 2 ||
|
||||
result.CandidateResults[0].LinkStatus != PeerConnectionProbeUnreachable ||
|
||||
result.CandidateResults[1].LinkStatus != PeerConnectionProbeReachable {
|
||||
t.Fatalf("candidate probe trail mismatch: %+v", result.CandidateResults)
|
||||
}
|
||||
snapshot := tracker.Snapshot()
|
||||
if snapshot.Ready != 1 || len(snapshot.Entries) != 1 || snapshot.Entries[0].BestCandidateID != "node-b-live" || snapshot.Entries[0].Endpoint != server.URL {
|
||||
t.Fatalf("tracker did not retain selected candidate: %+v", snapshot)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -138,6 +138,32 @@ func (t *PeerConnectionTracker) RecordSuccess(nodeID string, latencyMs int, now
|
||||
return entry
|
||||
}
|
||||
|
||||
func (t *PeerConnectionTracker) RecordSuccessForPeer(peer PeerCacheEntry, latencyMs int, now time.Time) PeerConnectionState {
|
||||
if t == nil {
|
||||
return PeerConnectionState{}
|
||||
}
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
now = normalizedNow(now)
|
||||
entry := t.entry(peer, now)
|
||||
entry.ConsecutiveSuccesses++
|
||||
entry.ConsecutiveFailures = 0
|
||||
entry.LastLatencyMs = latencyMs
|
||||
entry.LastFailureReason = ""
|
||||
entry.LastProbeAt = now
|
||||
entry.BackoffUntil = time.Time{}
|
||||
nextState := PeerConnectionReady
|
||||
if latencyMs >= 500 {
|
||||
nextState = PeerConnectionDegraded
|
||||
}
|
||||
if entry.State != nextState {
|
||||
entry.State = nextState
|
||||
entry.LastTransitionAt = now
|
||||
}
|
||||
t.entries[peer.NodeID] = entry
|
||||
return entry
|
||||
}
|
||||
|
||||
func (t *PeerConnectionTracker) RecordRelayReady(peer PeerCacheEntry, latencyMs int, now time.Time) PeerConnectionState {
|
||||
if t == nil {
|
||||
return PeerConnectionState{}
|
||||
|
||||
@@ -34,12 +34,20 @@ func ValidateProductionEnvelope(local PeerIdentity, envelope ProductionEnvelope,
|
||||
return err
|
||||
}
|
||||
}
|
||||
if envelope.ChannelClass != ProductionChannelFabricControl {
|
||||
maxPayloadBytes := MaxProductionEnvelopePayloadBytes
|
||||
switch envelope.ChannelClass {
|
||||
case ProductionChannelFabricControl:
|
||||
if envelope.MessageType != ProductionMessageFabricControl {
|
||||
return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
case ProductionChannelVPNPacket:
|
||||
if envelope.MessageType != ProductionMessageVPNPacketBatch {
|
||||
return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
maxPayloadBytes = MaxProductionVPNPacketPayloadBytes
|
||||
default:
|
||||
return ErrUnauthorizedChannel
|
||||
}
|
||||
if envelope.MessageType != ProductionMessageFabricControl {
|
||||
return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
if envelope.TTL <= 0 {
|
||||
return ErrTTLExhausted
|
||||
}
|
||||
@@ -58,8 +66,8 @@ func ValidateProductionEnvelope(local PeerIdentity, envelope ProductionEnvelope,
|
||||
if envelope.PayloadLength != len(envelope.Payload) {
|
||||
return fmt.Errorf("%w: payload_length mismatch", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
if envelope.PayloadLength > MaxProductionEnvelopePayloadBytes {
|
||||
return fmt.Errorf("%w: payload exceeds fabric-control limit", ErrForwardEnvelopeInvalid)
|
||||
if envelope.PayloadLength > maxPayloadBytes {
|
||||
return fmt.Errorf("%w: payload exceeds channel limit", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
if envelope.PayloadHash == "" {
|
||||
return fmt.Errorf("%w: payload_hash is required", ErrForwardEnvelopeInvalid)
|
||||
|
||||
@@ -22,7 +22,7 @@ func ValidateProductionEnvelopeRouteConfig(local PeerIdentity, envelope Producti
|
||||
if route.ExpiresAt.IsZero() || !route.ExpiresAt.After(now.UTC()) || envelope.ExpiresAt.After(route.ExpiresAt) {
|
||||
return ErrRouteExpired
|
||||
}
|
||||
if !contains(route.AllowedChannels, ProductionChannelFabricControl) {
|
||||
if !contains(route.AllowedChannels, envelope.ChannelClass) {
|
||||
return ErrUnauthorizedChannel
|
||||
}
|
||||
path := routePath(route)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -2,6 +2,8 @@ package supervisor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
|
||||
)
|
||||
@@ -17,24 +19,146 @@ type StubSupervisor struct {
|
||||
func (s StubSupervisor) Apply(_ context.Context, desired []client.DesiredWorkload) ([]client.WorkloadStatusRequest, error) {
|
||||
statuses := make([]client.WorkloadStatusRequest, 0, len(desired))
|
||||
for _, workload := range desired {
|
||||
state := "degraded"
|
||||
if workload.DesiredState == "disabled" {
|
||||
state = "stopped"
|
||||
}
|
||||
version := workload.Version
|
||||
if version == "" {
|
||||
version = s.Version
|
||||
}
|
||||
statuses = append(statuses, client.WorkloadStatusRequest{
|
||||
ReportedState: state,
|
||||
RuntimeMode: workload.RuntimeMode,
|
||||
Version: version,
|
||||
StatusPayload: map[string]any{
|
||||
"supervisor": "stub",
|
||||
"desired_state": workload.DesiredState,
|
||||
"service_type": workload.ServiceType,
|
||||
},
|
||||
})
|
||||
statuses = append(statuses, s.applyOne(workload))
|
||||
}
|
||||
return statuses, nil
|
||||
}
|
||||
|
||||
func (s StubSupervisor) applyOne(workload client.DesiredWorkload) client.WorkloadStatusRequest {
|
||||
serviceType := strings.TrimSpace(workload.ServiceType)
|
||||
desiredState := strings.TrimSpace(strings.ToLower(workload.DesiredState))
|
||||
if desiredState == "" {
|
||||
desiredState = "disabled"
|
||||
}
|
||||
runtimeMode := strings.TrimSpace(strings.ToLower(workload.RuntimeMode))
|
||||
if runtimeMode == "" {
|
||||
runtimeMode = "native"
|
||||
}
|
||||
version := strings.TrimSpace(workload.Version)
|
||||
if version == "" {
|
||||
version = s.Version
|
||||
}
|
||||
payload := map[string]any{
|
||||
"schema_version": "rap.node_agent.workload_supervision.v1",
|
||||
"supervisor": "node-agent-local",
|
||||
"desired_state": desiredState,
|
||||
"service_type": serviceType,
|
||||
"runtime_mode": runtimeMode,
|
||||
"observed_at": time.Now().UTC().Format(time.RFC3339Nano),
|
||||
}
|
||||
if desiredState != "enabled" {
|
||||
payload["reason"] = "desired_state_not_enabled"
|
||||
return client.WorkloadStatusRequest{
|
||||
ReportedState: "stopped",
|
||||
RuntimeMode: runtimeMode,
|
||||
Version: version,
|
||||
StatusPayload: payload,
|
||||
}
|
||||
}
|
||||
if serviceType == "core-mesh" || serviceType == "mesh-listener" {
|
||||
payload["reason"] = "builtin_node_agent_service_ready"
|
||||
payload["execution_mode"] = "builtin"
|
||||
payload["traffic"] = serviceTrafficMode(serviceType)
|
||||
return client.WorkloadStatusRequest{
|
||||
ReportedState: "running",
|
||||
RuntimeMode: runtimeMode,
|
||||
Version: version,
|
||||
StatusPayload: payload,
|
||||
}
|
||||
}
|
||||
if serviceType == "synthetic.echo" && runtimeMode == "native" {
|
||||
payload["reason"] = "internal_synthetic_echo_ready"
|
||||
payload["execution_mode"] = "builtin"
|
||||
payload["traffic"] = "test_service_only"
|
||||
return client.WorkloadStatusRequest{
|
||||
ReportedState: "running",
|
||||
RuntimeMode: runtimeMode,
|
||||
Version: version,
|
||||
StatusPayload: payload,
|
||||
}
|
||||
}
|
||||
if serviceType == "rdp-worker" && runtimeMode == "native" && boolConfig(workload.Config, "adapter_contract_probe") {
|
||||
payload["reason"] = "remote_workspace_adapter_contract_probe_ready"
|
||||
payload["execution_mode"] = "contract_probe"
|
||||
payload["service_class"] = "remote_workspace"
|
||||
payload["fabric_service_channel_required"] = true
|
||||
payload["backend_relay_steady_state"] = false
|
||||
payload["channels"] = remoteWorkspaceAdapterChannels()
|
||||
payload["frame_batch_contract"] = remoteWorkspaceFrameBatchContract()
|
||||
payload["traffic"] = "none"
|
||||
return client.WorkloadStatusRequest{
|
||||
ReportedState: "running",
|
||||
RuntimeMode: runtimeMode,
|
||||
Version: version,
|
||||
StatusPayload: payload,
|
||||
}
|
||||
}
|
||||
payload["reason"] = "service_runtime_not_implemented"
|
||||
payload["traffic"] = "blocked"
|
||||
return client.WorkloadStatusRequest{
|
||||
ReportedState: "degraded",
|
||||
RuntimeMode: runtimeMode,
|
||||
Version: version,
|
||||
StatusPayload: payload,
|
||||
}
|
||||
}
|
||||
|
||||
// boolConfig reads key from values as a boolean flag.
//
// A bool value is returned as is. A string value counts as true only when,
// after trimming surrounding whitespace, it equals "true" ignoring case.
// A nil map, a missing key, or a value of any other type yields false.
func boolConfig(values map[string]any, key string) bool {
	// Indexing a nil map or a missing key yields a nil interface, which
	// matches neither assertion below and so falls through to false.
	raw := values[key]
	if flag, isBool := raw.(bool); isBool {
		return flag
	}
	if text, isString := raw.(string); isString {
		return strings.EqualFold(strings.TrimSpace(text), "true")
	}
	return false
}
|
||||
|
||||
// remoteWorkspaceAdapterChannels returns a freshly built description of the
// nine remote-workspace adapter channels, in a fixed order. Each entry has
// the keys "name", "direction", "reliability", "priority", "droppable" and
// "may_block_input"; the last is false for every channel.
func remoteWorkspaceAdapterChannels() []map[string]any {
	specs := []struct {
		name, direction, reliability, priority string
		droppable                              bool
	}{
		// NOTE(review): "input" is reliable_ordered yet marked droppable,
		// unlike the other reliable channels; the package test expects this
		// value, so confirm the intent before changing it.
		{"input", "client_to_adapter", "reliable_ordered", "critical", true},
		{"control", "bidirectional", "reliable_ordered", "high", false},
		{"display", "adapter_to_client", "droppable_latest", "high", true},
		{"cursor", "adapter_to_client", "droppable_latest", "high", true},
		{"clipboard", "bidirectional", "reliable_ordered", "medium", false},
		{"file_transfer", "bidirectional", "reliable_chunked", "medium", false},
		{"audio", "adapter_to_client", "adaptive_droppable", "medium", true},
		{"device", "bidirectional", "reliable_ordered", "medium", false},
		{"telemetry", "adapter_to_client", "sampled_droppable", "low", true},
	}
	channels := make([]map[string]any, 0, len(specs))
	for _, spec := range specs {
		channels = append(channels, map[string]any{
			"name":            spec.name,
			"direction":       spec.direction,
			"reliability":     spec.reliability,
			"priority":        spec.priority,
			"droppable":       spec.droppable,
			"may_block_input": false,
		})
	}
	return channels
}
|
||||
|
||||
func remoteWorkspaceFrameBatchContract() map[string]any {
|
||||
return map[string]any{
|
||||
"schema_version": "rap.remote_workspace_frame_batch.v1",
|
||||
"adapter_contract_id": "rap.rdp_worker.remote_workspace_adapter_contract_probe.v1",
|
||||
"probe_only": true,
|
||||
"payload_forwarding": "not_implemented",
|
||||
"service_class": "remote_workspace",
|
||||
"allowed_flow_classes": []string{"control", "interactive", "reliable", "bulk", "droppable"},
|
||||
"allowed_payload_encodings": []string{
|
||||
"none",
|
||||
"base64",
|
||||
},
|
||||
"max_probe_frames": 32,
|
||||
"channels": remoteWorkspaceAdapterChannels(),
|
||||
}
|
||||
}
|
||||
|
||||
// serviceTrafficMode names the traffic mode reported for a built-in service:
// "fabric_control" for "core-mesh", "entry_listener" for "mesh-listener" and
// "unknown" for any other service type. The match is exact.
func serviceTrafficMode(serviceType string) string {
	if serviceType == "core-mesh" {
		return "fabric_control"
	}
	if serviceType == "mesh-listener" {
		return "entry_listener"
	}
	return "unknown"
}
|
||||
|
||||
@@ -33,3 +33,101 @@ func TestStubSupervisorReportsStoppedForDisabledWorkload(t *testing.T) {
|
||||
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
|
||||
}
|
||||
}
|
||||
|
||||
func TestStubSupervisorRunsInternalSyntheticEchoWorkload(t *testing.T) {
|
||||
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
|
||||
{ServiceType: "synthetic.echo", DesiredState: "enabled", RuntimeMode: "native"},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("apply desired workload: %v", err)
|
||||
}
|
||||
if statuses[0].ReportedState != "running" {
|
||||
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
|
||||
}
|
||||
if statuses[0].StatusPayload["reason"] != "internal_synthetic_echo_ready" {
|
||||
t.Fatalf("reason = %v", statuses[0].StatusPayload["reason"])
|
||||
}
|
||||
if statuses[0].StatusPayload["execution_mode"] != "builtin" {
|
||||
t.Fatalf("execution_mode = %v", statuses[0].StatusPayload["execution_mode"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestStubSupervisorReportsBuiltinFabricServicesRunning(t *testing.T) {
|
||||
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
|
||||
{ServiceType: "core-mesh", DesiredState: "enabled", RuntimeMode: "container"},
|
||||
{ServiceType: "mesh-listener", DesiredState: "enabled", RuntimeMode: "container"},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("apply desired workload: %v", err)
|
||||
}
|
||||
if len(statuses) != 2 {
|
||||
t.Fatalf("statuses length = %d", len(statuses))
|
||||
}
|
||||
for _, status := range statuses {
|
||||
if status.ReportedState != "running" {
|
||||
t.Fatalf("ReportedState = %q", status.ReportedState)
|
||||
}
|
||||
if status.StatusPayload["reason"] != "builtin_node_agent_service_ready" {
|
||||
t.Fatalf("reason = %v", status.StatusPayload["reason"])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestStubSupervisorKeepsUnsupportedEnabledWorkloadDegraded(t *testing.T) {
|
||||
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
|
||||
{ServiceType: "rdp-worker", DesiredState: "enabled", RuntimeMode: "container"},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("apply desired workload: %v", err)
|
||||
}
|
||||
if statuses[0].ReportedState != "degraded" {
|
||||
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
|
||||
}
|
||||
if statuses[0].StatusPayload["reason"] != "service_runtime_not_implemented" {
|
||||
t.Fatalf("reason = %v", statuses[0].StatusPayload["reason"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestStubSupervisorRunsRDPWorkerAdapterContractProbeOnly(t *testing.T) {
|
||||
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
|
||||
{
|
||||
ServiceType: "rdp-worker",
|
||||
DesiredState: "enabled",
|
||||
RuntimeMode: "native",
|
||||
Config: map[string]any{
|
||||
"adapter_contract_probe": true,
|
||||
},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("apply desired workload: %v", err)
|
||||
}
|
||||
if statuses[0].ReportedState != "running" {
|
||||
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
|
||||
}
|
||||
if statuses[0].StatusPayload["reason"] != "remote_workspace_adapter_contract_probe_ready" {
|
||||
t.Fatalf("reason = %v", statuses[0].StatusPayload["reason"])
|
||||
}
|
||||
if statuses[0].StatusPayload["service_class"] != "remote_workspace" {
|
||||
t.Fatalf("service_class = %v", statuses[0].StatusPayload["service_class"])
|
||||
}
|
||||
if statuses[0].StatusPayload["backend_relay_steady_state"] != false {
|
||||
t.Fatalf("backend_relay_steady_state = %v", statuses[0].StatusPayload["backend_relay_steady_state"])
|
||||
}
|
||||
channels, ok := statuses[0].StatusPayload["channels"].([]map[string]any)
|
||||
if !ok || len(channels) != 9 {
|
||||
t.Fatalf("channels = %#v", statuses[0].StatusPayload["channels"])
|
||||
}
|
||||
if channels[0]["name"] != "input" || channels[0]["priority"] != "critical" || channels[0]["droppable"] != true || channels[0]["may_block_input"] != false {
|
||||
t.Fatalf("unexpected input channel: %#v", channels[0])
|
||||
}
|
||||
frameBatch, ok := statuses[0].StatusPayload["frame_batch_contract"].(map[string]any)
|
||||
if !ok {
|
||||
t.Fatalf("frame_batch_contract = %#v", statuses[0].StatusPayload["frame_batch_contract"])
|
||||
}
|
||||
if frameBatch["schema_version"] != "rap.remote_workspace_frame_batch.v1" ||
|
||||
frameBatch["payload_forwarding"] != "not_implemented" ||
|
||||
frameBatch["service_class"] != "remote_workspace" {
|
||||
t.Fatalf("unexpected frame batch contract: %#v", frameBatch)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -385,32 +385,37 @@ func (s *FabricFlowScheduler) ConfigureAdaptivePolicy(policy FabricServiceChanne
|
||||
}
|
||||
|
||||
func (s *FabricFlowScheduler) ScheduleClientPackets(packets [][]byte) []FabricScheduledPacketBatch {
|
||||
return s.scheduleClientPackets("", "", packets)
|
||||
scheduled, _ := s.scheduleClientPackets("", "", packets)
|
||||
return scheduled
|
||||
}
|
||||
|
||||
func (s *FabricFlowScheduler) ScheduleClientPacketsForConnection(vpnConnectionID string, packets [][]byte) []FabricScheduledPacketBatch {
|
||||
return s.scheduleClientPackets(vpnConnectionID, "", packets)
|
||||
scheduled, _ := s.scheduleClientPackets(vpnConnectionID, "", packets)
|
||||
return scheduled
|
||||
}
|
||||
|
||||
func (s *FabricFlowScheduler) ScheduleClientPacketsForConnectionClass(vpnConnectionID string, trafficClass string, packets [][]byte) []FabricScheduledPacketBatch {
|
||||
return s.scheduleClientPackets(vpnConnectionID, trafficClass, packets)
|
||||
scheduled, _ := s.scheduleClientPackets(vpnConnectionID, trafficClass, packets)
|
||||
return scheduled
|
||||
}
|
||||
|
||||
func (s *FabricFlowScheduler) scheduleClientPackets(vpnConnectionID string, trafficClass string, packets [][]byte) []FabricScheduledPacketBatch {
|
||||
func (s *FabricFlowScheduler) scheduleClientPackets(vpnConnectionID string, trafficClass string, packets [][]byte) ([]FabricScheduledPacketBatch, uint64) {
|
||||
packets = cleanPacketBatch(packets)
|
||||
if len(packets) == 0 {
|
||||
return nil
|
||||
return nil, 0
|
||||
}
|
||||
if s == nil {
|
||||
s = NewFabricFlowScheduler(0, 0)
|
||||
}
|
||||
trafficClass = normalizeFabricTrafficClass(trafficClass)
|
||||
grouped := map[string]*FabricScheduledPacketBatch{}
|
||||
var droppedCount uint64
|
||||
for _, packet := range packets {
|
||||
flowID, shard := classifyPacketFlow(packet, s.shardCountValue())
|
||||
channelID := fabricFlowChannelIDForClass(vpnConnectionID, trafficClass, shard)
|
||||
queueDepth, dropped := s.enqueue(channelID, trafficClass)
|
||||
if dropped {
|
||||
droppedCount++
|
||||
continue
|
||||
}
|
||||
batch := grouped[channelID]
|
||||
@@ -433,7 +438,7 @@ func (s *FabricFlowScheduler) scheduleClientPackets(vpnConnectionID string, traf
|
||||
out = append(out, *batch)
|
||||
}
|
||||
s.sortScheduledBatches(out)
|
||||
return out
|
||||
return out, droppedCount
|
||||
}
|
||||
|
||||
func fabricFlowChannelID(vpnConnectionID string, shard int) string {
|
||||
@@ -1441,11 +1446,9 @@ func (i *FabricClientPacketIngress) SendClientPacketBatchWithTrafficClass(ctx co
|
||||
}
|
||||
i.recordSendBatch(len(packets))
|
||||
scheduler := i.flowScheduler()
|
||||
droppedBefore := scheduler.Dropped()
|
||||
scheduled := scheduler.ScheduleClientPacketsForConnectionClass(vpnConnectionID, trafficClass, packets)
|
||||
droppedAfter := scheduler.Dropped()
|
||||
if droppedAfter > droppedBefore {
|
||||
i.recordFlowDropped(droppedAfter - droppedBefore)
|
||||
scheduled, droppedCount := scheduler.scheduleClientPackets(vpnConnectionID, trafficClass, packets)
|
||||
if droppedCount > 0 {
|
||||
i.recordFlowDropped(droppedCount)
|
||||
}
|
||||
if len(scheduled) == 0 {
|
||||
i.recordError(mesh.ErrSyntheticRelayQueueFull)
|
||||
@@ -1657,8 +1660,10 @@ func (i *FabricClientPacketIngress) routeCandidatesWithPreference(clusterID stri
|
||||
if i == nil || routesFunc == nil {
|
||||
return nil
|
||||
}
|
||||
localClusterID := i.clusterID()
|
||||
localNodeID := i.localNodeID()
|
||||
if clusterID == "" {
|
||||
clusterID = i.ClusterID
|
||||
clusterID = localClusterID
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
var preferred []fabricClientRouteCandidate
|
||||
@@ -1676,7 +1681,7 @@ func (i *FabricClientPacketIngress) routeCandidatesWithPreference(clusterID stri
|
||||
}
|
||||
}
|
||||
for _, route := range routesFunc() {
|
||||
if route.ClusterID != clusterID || route.SourceNodeID != i.LocalNodeID || !containsString(route.AllowedChannels, mesh.ProductionChannelVPNPacket) {
|
||||
if route.ClusterID != clusterID || route.SourceNodeID != localNodeID || !containsString(route.AllowedChannels, mesh.ProductionChannelVPNPacket) {
|
||||
continue
|
||||
}
|
||||
if manager.isWithdrawn(route.RouteID) {
|
||||
@@ -1685,8 +1690,8 @@ func (i *FabricClientPacketIngress) routeCandidatesWithPreference(clusterID stri
|
||||
if !route.ExpiresAt.IsZero() && !route.ExpiresAt.After(now) {
|
||||
continue
|
||||
}
|
||||
nextHop := nextHopAfter(route.Hops, i.LocalNodeID, route.DestinationNodeID)
|
||||
if nextHop == "" || nextHop == i.LocalNodeID {
|
||||
nextHop := nextHopAfter(route.Hops, localNodeID, route.DestinationNodeID)
|
||||
if nextHop == "" || nextHop == localNodeID {
|
||||
continue
|
||||
}
|
||||
candidate := fabricClientRouteCandidate{Route: route, NextHop: nextHop}
|
||||
@@ -2024,7 +2029,7 @@ func (i *FabricClientPacketIngress) routeProvenance(clusterID string) map[string
|
||||
if i == nil || routesFunc == nil {
|
||||
return out
|
||||
}
|
||||
localNodeID := strings.TrimSpace(i.LocalNodeID)
|
||||
localNodeID := i.localNodeID()
|
||||
for _, route := range routesFunc() {
|
||||
if strings.TrimSpace(route.RouteID) == "" {
|
||||
continue
|
||||
@@ -2322,6 +2327,24 @@ func (i *FabricClientPacketIngress) routesFunc() func() []mesh.SyntheticRoute {
|
||||
return i.Routes
|
||||
}
|
||||
|
||||
func (i *FabricClientPacketIngress) clusterID() string {
|
||||
if i == nil {
|
||||
return ""
|
||||
}
|
||||
i.mu.Lock()
|
||||
defer i.mu.Unlock()
|
||||
return strings.TrimSpace(i.ClusterID)
|
||||
}
|
||||
|
||||
func (i *FabricClientPacketIngress) localNodeID() string {
|
||||
if i == nil {
|
||||
return ""
|
||||
}
|
||||
i.mu.Lock()
|
||||
defer i.mu.Unlock()
|
||||
return strings.TrimSpace(i.LocalNodeID)
|
||||
}
|
||||
|
||||
func (i *FabricClientPacketIngress) flowScheduler() *FabricFlowScheduler {
|
||||
if i == nil {
|
||||
return NewFabricFlowScheduler(0, 0)
|
||||
|
||||
@@ -324,10 +324,13 @@ func TestFabricFlowSchedulerDropsWhenChannelQueueIsFull(t *testing.T) {
|
||||
packetA := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 3389)
|
||||
packetB := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 3389)
|
||||
|
||||
batches := scheduler.ScheduleClientPackets([][]byte{packetA, packetB})
|
||||
batches, dropped := scheduler.scheduleClientPackets("", "", [][]byte{packetA, packetB})
|
||||
if len(batches) != 1 || len(batches[0].Packets) != 1 {
|
||||
t.Fatalf("batches = %#v, want one accepted packet", batches)
|
||||
}
|
||||
if dropped != 1 {
|
||||
t.Fatalf("dropped = %d, want per-call drop count 1", dropped)
|
||||
}
|
||||
snapshot := scheduler.Snapshot()
|
||||
if snapshot.Dropped != 1 || !snapshot.BackpressureActive {
|
||||
t.Fatalf("snapshot = %+v, want one dropped packet and active backpressure", snapshot)
|
||||
@@ -1069,6 +1072,60 @@ func TestFabricClientPacketIngressIsolatesRouteMemoryPerVPNConnection(t *testing
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricClientPacketIngressRouteSelectionUsesUpdatedRuntimeIdentity(t *testing.T) {
|
||||
transport := &captureManyProductionTransport{}
|
||||
ingress := &FabricClientPacketIngress{
|
||||
ForwardTransport: transport,
|
||||
Inbox: NewFabricPacketInbox(8),
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "entry-1",
|
||||
Routes: func() []mesh.SyntheticRoute {
|
||||
return []mesh.SyntheticRoute{{
|
||||
RouteID: "route-entry-1",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "entry-1",
|
||||
DestinationNodeID: "exit-1",
|
||||
Hops: []string{"entry-1", "relay-1", "exit-1"},
|
||||
AllowedChannels: []string{mesh.ProductionChannelVPNPacket},
|
||||
ExpiresAt: time.Now().UTC().Add(time.Minute),
|
||||
MaxTTL: 8,
|
||||
}}
|
||||
},
|
||||
}
|
||||
ingress.UpdateRuntime(
|
||||
transport,
|
||||
NewFabricPacketInbox(8),
|
||||
"cluster-1",
|
||||
"entry-2",
|
||||
nil,
|
||||
func() []mesh.SyntheticRoute {
|
||||
return []mesh.SyntheticRoute{{
|
||||
RouteID: "route-entry-2",
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: "entry-2",
|
||||
DestinationNodeID: "exit-2",
|
||||
Hops: []string{"entry-2", "relay-2", "exit-2"},
|
||||
AllowedChannels: []string{mesh.ProductionChannelVPNPacket},
|
||||
ExpiresAt: time.Now().UTC().Add(time.Minute),
|
||||
MaxTTL: 8,
|
||||
}}
|
||||
},
|
||||
"policy-updated",
|
||||
)
|
||||
|
||||
packet := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 443)
|
||||
if err := ingress.SendClientPacketBatch(context.Background(), "", "vpn-1", [][]byte{packet}); err != nil {
|
||||
t.Fatalf("send after runtime update: %v", err)
|
||||
}
|
||||
if len(transport.envelopes) != 1 {
|
||||
t.Fatalf("envelopes = %d, want one send", len(transport.envelopes))
|
||||
}
|
||||
envelope := transport.envelopes[0]
|
||||
if envelope.RouteID != "route-entry-2" || envelope.SourceNodeID != "entry-2" || transport.calls[0] != "relay-2" {
|
||||
t.Fatalf("envelope route/source/next-hop = %s/%s/%s, want updated entry-2 route", envelope.RouteID, envelope.SourceNodeID, transport.calls[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestFabricClientPacketIngressParallelFlowWindowDoesNotBlockIndependentChannel(t *testing.T) {
|
||||
scheduler := NewFabricFlowScheduler(8, 16)
|
||||
slowPacket, fastPacket := packetsForOrderedDistinctChannels(scheduler.shardCountValue())
|
||||
|
||||
@@ -0,0 +1,170 @@
|
||||
//go:build windows && rap_vpn_windows_tun
|
||||
|
||||
package vpnruntime
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
_ "embed"
|
||||
"fmt"
|
||||
"net"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
wgtun "golang.zx2c4.com/wireguard/tun"
|
||||
)
|
||||
|
||||
// windowsGatewayMTU is the MTU passed to wgtun.CreateTUN when openGatewayTun
// creates the gateway interface.
const windowsGatewayMTU = 1420
|
||||
|
||||
// embeddedWintunDLL holds the contents of assets/windows/amd64/wintun.dll,
// embedded at build time. ensureWintunDLL writes it next to the executable
// when the file there is missing or has a different SHA-256.
//go:embed assets/windows/amd64/wintun.dll
|
||||
var embeddedWintunDLL []byte
|
||||
|
||||
// tunDevice wraps the TUN device created by openGatewayTun and exposes
// single-packet Read and Write plus Close on top of it.
type tunDevice struct {
|
||||
// dev is the underlying device returned by wgtun.CreateTUN.
dev wgtun.Device
|
||||
// name is the interface name the device was created with.
name string
|
||||
}
|
||||
|
||||
func openGatewayTun(name, addressCIDR, routeCIDR string) (*tunDevice, error) {
|
||||
if _, _, err := net.ParseCIDR(addressCIDR); err != nil {
|
||||
return nil, fmt.Errorf("invalid vpn gateway address %q: %w", addressCIDR, err)
|
||||
}
|
||||
if _, _, err := net.ParseCIDR(routeCIDR); err != nil {
|
||||
return nil, fmt.Errorf("invalid vpn gateway route %q: %w", routeCIDR, err)
|
||||
}
|
||||
if err := ensureWintunDLL(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
dev, err := wgtun.CreateTUN(name, windowsGatewayMTU)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create wintun interface %s: %w", name, err)
|
||||
}
|
||||
if err := configureGatewayInterface(name, addressCIDR, routeCIDR); err != nil {
|
||||
_ = dev.Close()
|
||||
return nil, err
|
||||
}
|
||||
return &tunDevice{dev: dev, name: name}, nil
|
||||
}
|
||||
|
||||
func (d *tunDevice) Read(packet []byte) (int, error) {
|
||||
bufs := [][]byte{packet}
|
||||
sizes := []int{0}
|
||||
n, err := d.dev.Read(bufs, sizes, 0)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if n <= 0 {
|
||||
return 0, nil
|
||||
}
|
||||
return sizes[0], nil
|
||||
}
|
||||
|
||||
func (d *tunDevice) Write(packet []byte) (int, error) {
|
||||
n, err := d.dev.Write([][]byte{packet}, 0)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if n <= 0 {
|
||||
return 0, nil
|
||||
}
|
||||
return len(packet), nil
|
||||
}
|
||||
|
||||
func (d *tunDevice) Close() error {
|
||||
_ = removeWindowsGatewayNat()
|
||||
return d.dev.Close()
|
||||
}
|
||||
|
||||
func configureGatewayInterface(name, addressCIDR, routeCIDR string) error {
|
||||
ip, network, err := net.ParseCIDR(addressCIDR)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid vpn gateway address %q: %w", addressCIDR, err)
|
||||
}
|
||||
ones, bits := network.Mask.Size()
|
||||
if bits != 32 || ones <= 0 {
|
||||
return fmt.Errorf("invalid vpn gateway prefix %q", addressCIDR)
|
||||
}
|
||||
_, route, err := net.ParseCIDR(routeCIDR)
|
||||
if err != nil {
|
||||
return fmt.Errorf("invalid vpn gateway route %q: %w", routeCIDR, err)
|
||||
}
|
||||
|
||||
script := fmt.Sprintf(`
|
||||
$ErrorActionPreference = 'Stop'
|
||||
$alias = %s
|
||||
$address = %s
|
||||
$prefixLength = %d
|
||||
$natPrefix = %s
|
||||
$natName = 'RAPVPN'
|
||||
$adapter = Get-NetAdapter -Name $alias -ErrorAction Stop
|
||||
$adapter | Enable-NetAdapter -Confirm:$false -ErrorAction SilentlyContinue | Out-Null
|
||||
$existing = Get-NetIPAddress -InterfaceAlias $alias -AddressFamily IPv4 -ErrorAction SilentlyContinue
|
||||
foreach ($addr in $existing) {
|
||||
if ($addr.IPAddress -ne $address -or $addr.PrefixLength -ne $prefixLength) {
|
||||
Remove-NetIPAddress -InterfaceAlias $alias -IPAddress $addr.IPAddress -Confirm:$false -ErrorAction SilentlyContinue
|
||||
}
|
||||
}
|
||||
if (-not (Get-NetIPAddress -InterfaceAlias $alias -IPAddress $address -AddressFamily IPv4 -ErrorAction SilentlyContinue)) {
|
||||
New-NetIPAddress -InterfaceAlias $alias -IPAddress $address -PrefixLength $prefixLength -Type Unicast | Out-Null
|
||||
}
|
||||
Set-NetIPInterface -InterfaceAlias $alias -AddressFamily IPv4 -Forwarding Enabled
|
||||
Get-NetIPInterface -AddressFamily IPv4 | Where-Object { $_.ConnectionState -eq 'Connected' -and $_.InterfaceAlias -ne 'Loopback Pseudo-Interface 1' } | Set-NetIPInterface -Forwarding Enabled
|
||||
$existingNat = Get-NetNat -Name $natName -ErrorAction SilentlyContinue
|
||||
if ($existingNat -and $existingNat.InternalIPInterfaceAddressPrefix -ne $natPrefix) {
|
||||
$existingNat | Remove-NetNat -Confirm:$false
|
||||
$existingNat = $null
|
||||
}
|
||||
if (-not $existingNat) {
|
||||
New-NetNat -Name $natName -InternalIPInterfaceAddressPrefix $natPrefix | Out-Null
|
||||
}
|
||||
`, psQuote(name), psQuote(ip.String()), ones, psQuote(route.String()))
|
||||
|
||||
if err := runPowerShell(script); err != nil {
|
||||
return fmt.Errorf("configure windows vpn gateway interface %s: %w", name, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func removeWindowsGatewayNat() error {
|
||||
return runPowerShell(`Get-NetNat -Name 'RAPVPN' -ErrorAction SilentlyContinue | Remove-NetNat -Confirm:$false -ErrorAction SilentlyContinue`)
|
||||
}
|
||||
|
||||
// runPowerShell executes script with powershell.exe (-NoProfile,
// -ExecutionPolicy Bypass, -Command). On failure the returned error wraps the
// process error and appends the trimmed combined stdout/stderr output.
func runPowerShell(script string) error {
	args := []string{"-NoProfile", "-ExecutionPolicy", "Bypass", "-Command", script}
	output, err := exec.Command("powershell.exe", args...).CombinedOutput()
	if err == nil {
		return nil
	}
	return fmt.Errorf("powershell failed: %w: %s", err, strings.TrimSpace(string(output)))
}
|
||||
|
||||
// psSingleQuoteEscaper doubles every character that PowerShell accepts as a
// single-quote delimiter: the ASCII apostrophe and the typographic quotes
// U+2018, U+2019, U+201A and U+201B.
var psSingleQuoteEscaper = strings.NewReplacer(
	"'", "''",
	"\u2018", "\u2018\u2018",
	"\u2019", "\u2019\u2019",
	"\u201a", "\u201a\u201a",
	"\u201b", "\u201b\u201b",
)

// psQuote returns value as a PowerShell single-quoted string literal.
//
// Every single-quote character inside value is doubled so it cannot end the
// literal early. This covers the typographic quote variants as well as the
// ASCII apostrophe, because PowerShell treats them as quote delimiters too.
// All other bytes are copied through unchanged.
func psQuote(value string) string {
	return "'" + psSingleQuoteEscaper.Replace(value) + "'"
}
|
||||
|
||||
func ensureWintunDLL() error {
|
||||
exePath, err := os.Executable()
|
||||
if err != nil {
|
||||
return fmt.Errorf("locate node-agent executable for wintun.dll: %w", err)
|
||||
}
|
||||
target := filepath.Join(filepath.Dir(exePath), "wintun.dll")
|
||||
if payload, err := os.ReadFile(target); err == nil && sameSHA256(payload, embeddedWintunDLL) {
|
||||
return nil
|
||||
}
|
||||
tmp := target + ".tmp"
|
||||
if err := os.WriteFile(tmp, embeddedWintunDLL, 0o644); err != nil {
|
||||
return fmt.Errorf("write embedded wintun.dll: %w", err)
|
||||
}
|
||||
_ = os.Remove(target)
|
||||
if err := os.Rename(tmp, target); err != nil {
|
||||
_ = os.Remove(tmp)
|
||||
return fmt.Errorf("install embedded wintun.dll: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// sameSHA256 reports whether a and b have identical SHA-256 digests.
func sameSHA256(a, b []byte) bool {
	return sha256.Sum256(a) == sha256.Sum256(b)
}
|
||||
Reference in New Issue
Block a user