рабочий вариант, но скорость 10 МБит
build / backend (push) Has been cancelled
build / node-agent (push) Has been cancelled
build / worker (push) Has been cancelled

This commit is contained in:
2026-05-22 21:46:49 +03:00
parent 469fa0e860
commit 20d361a886
280 changed files with 954890 additions and 18524 deletions
+400
View File
@@ -0,0 +1,400 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package go;
import android.content.Context;
import java.lang.ref.PhantomReference;
import java.lang.ref.Reference;
import java.lang.ref.ReferenceQueue;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.IdentityHashMap;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Logger;
import go.Universe;
// Seq is a sequence of machine-dependent encoded values.
// Used by automatically generated language bindings to talk to Go.
public class Seq {
// Logger for reference-tracking anomalies. It is never reassigned, so it is final.
private static final Logger log = Logger.getLogger("GoSeq");

// Refnum reserved for null.
// also known to bind/seq/ref.go and bind/objc/seq_darwin.m
private static final int NULL_REFNUM = 41;

// A single shared Ref stands in for every null Object.
public static final Ref nullRef = new Ref(NULL_REFNUM, null);

// The singleton GoRefQueue. Constructing it starts its daemon thread.
private static final GoRefQueue goRefQueue = new GoRefQueue();

static {
    // The native library must be loaded before any native method is used:
    // init() is native, and Universe's own initializer calls a native _init().
    System.loadLibrary("gojni");
    init();
    Universe.touch();
}
// setContext sets the context in the go-library to be used in RunOnJvm.
// This typed overload forwards to the native setContext(java.lang.Object);
// the explicit cast selects that overload instead of recursing into this one.
public static void setContext(Context context) {
setContext((java.lang.Object)context);
}
// Native one-time initialization; invoked from the static initializer right
// after the native library is loaded.
private static native void init();
// Empty method to run class initializer
public static void touch() {}
// Not instantiable: Seq only exposes static members.
private Seq() {
}
// ctx is an android.context.Context.
static native void setContext(java.lang.Object ctx);
// incRefnum bumps the count of the Java object already registered under
// refnum. The tracker throws a RuntimeException if no such object exists.
public static void incRefnum(int refnum) {
    final RefTracker refs = tracker;
    refs.incRefnum(refnum);
}

// incRef increments the reference count of a Java object and returns its
// refnum. When the object is a proxy for a Go object, the tracker instead
// calls the proxy's incRefnum() so that the Go reference count stays
// positive even if the Proxy is garbage collected and its Ref is finalized.
public static int incRef(Object o) {
    final int refnum = tracker.inc(o);
    return refnum;
}

// incGoObjectRef asks the GoObject itself to bump its count and returns
// the refnum it reports.
public static int incGoObjectRef(GoObject o) {
    final int refnum = o.incRefnum();
    return refnum;
}
// trackGoRef registers a Go reference with the GoRefQueue so that the Go
// side is told to drop it once the given GoObject wrapper has been
// garbage collected. Only Go refnums (zero or negative) are accepted; a
// positive (Java) refnum results in a RuntimeException.
//
// TODO(crawshaw): Allocations for frequently sent Go objects could be
// reduced by keeping a map to weak references. That would cost two
// objects per reference instead of one and would bring in weak
// references, which make Java debugging painful.
//
// Revisit the tradeoff once there is real code to measure.
public static void trackGoRef(int refnum, GoObject obj) {
    if (refnum <= 0) {
        goRefQueue.track(refnum, obj);
        return;
    }
    throw new RuntimeException("trackGoRef called with Java refnum " + refnum);
}
// getRef returns the Ref registered for a Java refnum. The tracker returns
// nullRef for NULL_REFNUM and throws a RuntimeException for a negative (Go)
// refnum or for a refnum it does not know.
public static Ref getRef(int refnum) {
return tracker.get(refnum);
}
// Increment the Go reference count before sending over a refnum.
// The ref parameter is only used to make sure the referenced
// object is not garbage collected before Go increments the
// count. It's the equivalent of Go's runtime.KeepAlive.
public static native void incGoRef(int refnum, GoObject ref);
// Informs the Go ref tracker that Java is done with this refnum.
// Called by the GoRefQueue thread once a tracked wrapper has been collected.
static native void destroyRef(int refnum);
// decRef is called from seq.FinalizeRef
// The native support code also calls it (via the cached seq_decRef method ID)
// after resolving a Java refnum. It drops one count; the tracker forgets the
// object once the count reaches zero.
static void decRef(int refnum) {
tracker.dec(refnum);
}
// A GoObject is a Java class implemented in Go. When a GoObject
// is passed to Go, it is wrapped in a Go proxy, to make it behave
// the same as passing a regular Java class.
public interface GoObject {
// Increment refcount and return the refnum of the proxy.
//
// The Go reference count need to be bumped while the
// refnum is passed to Go, to avoid finalizing and
// invalidating it before being translated on the Go side.
//
// The generated proxies in this change implement it by calling
// Seq.incGoRef(refnum, this) and returning their refnum.
int incRefnum();
}
// A Proxy is a Java object that proxies a Go object. Proxies, unlike
// GoObjects, are unwrapped to their Go counterpart when deserialized
// in Go.
// It is a marker interface: it adds no methods of its own, and
// RefTracker.inc tests for it with instanceof.
public interface Proxy extends GoObject {}
// A Ref pairs a Java object with the refnum under which it crosses the
// Java/Go boundary, and counts how often it has been handed to Go.
public static final class Ref {
    // Java-side refnum; never negative (negative numbers belong to Go).
    public final int refnum;

    // Number of times this object has been sent to Go.
    private int refcnt = 0;

    // The referenced Java object.
    public final Object obj;

    // Creates a Ref with a zero count. A negative refnum is rejected with a
    // RuntimeException because it would denote a Go object.
    Ref(int refnum, Object o) {
        final boolean isGoRefnum = refnum < 0;
        if (isGoRefnum) {
            throw new RuntimeException("Ref instantiated with a Go refnum " + refnum);
        }
        this.obj = o;
        this.refnum = refnum;
    }

    // Records one more transfer of this object to Go. Throws a
    // RuntimeException rather than letting the counter wrap around.
    void inc() {
        if (refcnt != Integer.MAX_VALUE) {
            refcnt += 1;
            return;
        }
        throw new RuntimeException("refnum " + refnum + " overflow");
    }
}
static final RefTracker tracker = new RefTracker();

// RefTracker hands out refnums for Java objects that are passed to Go and
// keeps those objects reachable for as long as their count is positive.
// All methods are synchronized on the tracker instance.
static final class RefTracker {
    private static final int REF_OFFSET = 42;

    // Next Java object reference number.
    //
    // Java refnums are positive. They start at an arbitrary offset that
    // differs from Go's so that Seq hex dumps are easier to read.
    private int next = REF_OFFSET;

    // refnum -> Ref for every Java object currently held by Go. The Ref's
    // obj field is non-null. This map is what pins the objects against GC
    // while Go holds the only reference.
    private final RefMap javaObjs = new RefMap();

    // Reverse lookup, by identity: Java object -> refnum.
    private final IdentityHashMap<Object, Integer> javaRefs = new IdentityHashMap<>();

    // inc counts one more transfer of o to Go and returns o's refnum.
    // null maps to NULL_REFNUM. A Proxy is asked to bump its own (Go)
    // refnum instead of being registered here.
    synchronized int inc(Object o) {
        if (o == null) {
            return NULL_REFNUM;
        }
        if (o instanceof Proxy) {
            Proxy proxy = (Proxy) o;
            return proxy.incRefnum();
        }

        Integer assigned = javaRefs.get(o);
        if (assigned == null) {
            // First time this object is seen: allocate a fresh refnum.
            if (next == Integer.MAX_VALUE) {
                throw new RuntimeException("createRef overflow for " + o);
            }
            assigned = next++;
            javaRefs.put(o, assigned);
        }

        final int refnum = assigned;
        Ref entry = javaObjs.get(refnum);
        if (entry == null) {
            entry = new Ref(refnum, o);
            javaObjs.put(refnum, entry);
        }
        entry.inc();
        return refnum;
    }

    // incRefnum counts one more transfer for an already registered
    // refnum; an unknown refnum is a RuntimeException.
    synchronized void incRefnum(int refnum) {
        final Ref entry = javaObjs.get(refnum);
        if (entry == null) {
            throw new RuntimeException("referenced Java object is not found: refnum="+refnum);
        }
        entry.inc();
    }

    // dec drops one count from a Java object when Go reports that the
    // corresponding proxy was finalized. Once the count is no longer
    // positive the object is removed from both maps.
    synchronized void dec(int refnum) {
        if (refnum <= 0) {
            // Go objects are not tracked here, so this must not happen.
            log.severe("dec request for Go object "+ refnum);
            return;
        }
        if (refnum == Seq.nullRef.refnum) {
            // The null ref is never counted.
            return;
        }

        // Java objects are removed on request of Go.
        final Ref entry = javaObjs.get(refnum);
        if (entry == null) {
            throw new RuntimeException("referenced Java object is not found: refnum="+refnum);
        }
        entry.refcnt--;
        if (entry.refcnt > 0) {
            return;
        }
        javaObjs.remove(refnum);
        javaRefs.remove(entry.obj);
    }

    // get returns the existing Ref for a Java refnum: nullRef for
    // NULL_REFNUM, a RuntimeException for negative or unknown refnums.
    synchronized Ref get(int refnum) {
        if (refnum < 0) {
            throw new RuntimeException("ref called with Go refnum " + refnum);
        }
        if (refnum == NULL_REFNUM) {
            return nullRef;
        }
        final Ref entry = javaObjs.get(refnum);
        if (entry != null) {
            return entry;
        }
        throw new RuntimeException("unknown java Ref: "+refnum);
    }
}
// GoRefQueue receives GoRefs whose wrappers are no longer live. A daemon
// thread started by the constructor drains the queue and tells the Go side
// to release each corresponding reference.
static class GoRefQueue extends ReferenceQueue<GoObject> {
    // Every tracked GoRef is held here. Without a strong reference to the
    // GoRef instances the Java GC would not enqueue them when their
    // referents are reclaimed.
    private final Collection<GoRef> refs = Collections.synchronizedCollection(new HashSet<GoRef>());

    GoRefQueue() {
        Thread finalizer = new Thread(new Runnable() {
            @Override public void run() {
                for (;;) {
                    try {
                        releaseNext();
                    } catch (InterruptedException e) {
                        // Ignore
                    }
                }
            }
        });
        finalizer.setName("GoRefQueue Finalizer Thread");
        finalizer.setDaemon(true);
        finalizer.start();
    }

    // track starts watching obj; when it is collected, refnum is released.
    void track(int refnum, GoObject obj) {
        GoRef tracked = new GoRef(refnum, obj, this);
        refs.add(tracked);
    }

    // Blocks until one GoRef is enqueued, forgets it, notifies Go through
    // destroyRef and clears the reference.
    private void releaseNext() throws InterruptedException {
        GoRef dead = (GoRef) remove();
        refs.remove(dead);
        destroyRef(dead.refnum);
        dead.clear();
    }
}
// A GoRef is a PhantomReference to the Java proxy of a Go object. When the
// proxy is no longer live the GoRef lands on the singleton GoRefQueue, which
// then decrements the matching Go reference count.
static class GoRef extends PhantomReference<GoObject> {
    // The Go refnum (zero or negative) released when the referent is collected.
    final int refnum;

    // Registers obj with queue q. A positive refnum is rejected with a
    // RuntimeException because positive numbers denote Java objects.
    GoRef(int refnum, GoObject obj, GoRefQueue q) {
        super(obj, q);
        final boolean goOwned = refnum <= 0;
        if (!goOwned) {
            throw new RuntimeException("GoRef instantiated with a Java refnum " + refnum);
        }
        this.refnum = refnum;
    }
}
// RefMap maps sparse integer keys to Ref objects; in Go this would be a
// map[int]*Ref.
//
// Keys are kept sorted in the first `next` slots of `keys`, with the
// matching Ref in `objs`. Removing an entry only nulls its Ref and leaves
// the key in place; such dead slots are squeezed out by grow().
static final class RefMap {
    private int next = 0;   // number of used slots, dead ones included
    private int live = 0;   // number of slots holding a non-null Ref
    private int[] keys = new int[16];
    private Ref[] objs = new Ref[16];

    RefMap() {}

    // Returns the Ref stored under key, or null when absent or removed.
    Ref get(int key) {
        final int slot = Arrays.binarySearch(keys, 0, next, key);
        return slot >= 0 ? objs[slot] : null;
    }

    // Drops the Ref stored under key, if any. The key slot itself stays.
    void remove(int key) {
        final int slot = Arrays.binarySearch(keys, 0, next, key);
        if (slot < 0 || objs[slot] == null) {
            return;
        }
        objs[slot] = null;
        live--;
    }

    // Stores obj under key. Re-putting the same Ref is a no-op; a null obj
    // or an attempt to replace a different live Ref throws a RuntimeException.
    void put(int key, Ref obj) {
        if (obj == null) {
            throw new RuntimeException("put a null ref (with key "+key+")");
        }
        int slot = Arrays.binarySearch(keys, 0, next, key);
        if (slot >= 0) {
            final Ref current = objs[slot];
            if (current == null) {
                // Revive a dead slot that still carries this key.
                objs[slot] = obj;
                live++;
                return;
            }
            if (current != obj) {
                throw new RuntimeException("replacing an existing ref (with key "+key+")");
            }
            return;
        }

        if (next >= keys.length) {
            // No free slot left: compact or enlarge, then search again
            // because positions may have moved.
            grow();
            slot = Arrays.binarySearch(keys, 0, next, key);
        }
        final int at = ~slot;
        final int tail = next - at;
        if (tail > 0) {
            // Open a gap at the insertion point.
            System.arraycopy(keys, at, keys, at + 1, tail);
            System.arraycopy(objs, at, objs, at + 1, tail);
        }
        keys[at] = key;
        objs[at] = obj;
        live++;
        next++;
    }

    // Squeezes out dead slots and doubles the backing arrays when the live
    // entries would fill more than half of the current capacity.
    private void grow() {
        final int wanted = 2 * roundPow2(live);
        final boolean expand = wanted > keys.length;
        final int[] dstKeys = expand ? new int[keys.length * 2] : keys;
        final Ref[] dstObjs = expand ? new Ref[objs.length * 2] : objs;

        int kept = 0;
        for (int src = 0; src < keys.length; src++) {
            final Ref r = objs[src];
            if (r == null) {
                continue;
            }
            dstKeys[kept] = keys[src];
            dstObjs[kept] = r;
            kept++;
        }
        // Reset everything behind the compacted prefix.
        Arrays.fill(dstKeys, kept, dstKeys.length, 0);
        Arrays.fill(dstObjs, kept, dstObjs.length, null);

        keys = dstKeys;
        objs = dstObjs;
        next = kept;

        if (live != next) {
            throw new RuntimeException("bad state: live="+live+", next="+next);
        }
    }

    // Smallest power of two that is >= x (1 for x <= 1).
    private static int roundPow2(int x) {
        int p;
        for (p = 1; p < x; p <<= 1) {
            // doubling happens in the loop header
        }
        return p;
    }
}
}
@@ -0,0 +1,39 @@
// Code generated by gobind. DO NOT EDIT.
// Java class go.Universe is a proxy for talking to a Go program.
//
// autogenerated by gobind -lang=java -javapkg=su.cin.rapvpn.fabric
package go;
import go.Seq;
// Universe is a static-only holder: its initializer loads the native library
// through Seq and runs the native _init(). It also hosts the proxy class
// used for errors.
public abstract class Universe {
static {
Seq.touch(); // for loading the native library
_init();
}
private Universe() {} // uninstantiable
// touch is called from other bound packages to initialize this package
public static void touch() {}
// Native initialization hook; its implementation is not part of this file.
private static native void _init();
// proxyerror is an Exception whose message comes from the native error()
// method. It is a Seq.Proxy identified by a refnum.
private static final class proxyerror extends Exception implements Seq.Proxy, error {
private final int refnum;
// Bumps the Go reference count for refnum and returns the refnum.
@Override public final int incRefnum() {
Seq.incGoRef(refnum, this);
return refnum;
}
// Stores the refnum and registers this proxy with Seq.trackGoRef, so the
// reference is released after the proxy is garbage collected.
proxyerror(int refnum) { this.refnum = refnum; Seq.trackGoRef(refnum, this); }
// The exception message is whatever the native error() returns.
@Override public String getMessage() { return error(); }
public native String error();
}
}
@@ -0,0 +1,14 @@
// Code generated by gobind. DO NOT EDIT.
// Java class go.error is a proxy for talking to a Go program.
//
// autogenerated by gobind -lang=java -javapkg=su.cin.rapvpn.fabric
package go;
import go.Seq;
// error is the Java interface of a bound error value; Universe.proxyerror
// implements it for errors that live on the Go side.
public interface error {
// Returns the error text. Universe.proxyerror uses it as its exception message.
public String error();
}
@@ -0,0 +1,38 @@
// Code generated by gobind. DO NOT EDIT.
// Java class su.cin.rapvpn.fabric.fabricvpn.Fabricvpn is a proxy for talking to a Go program.
//
// autogenerated by gobind -lang=java -javapkg=su.cin.rapvpn.fabric github.com/example/remote-access-platform/agents/rap-node-agent/mobile/fabricvpn
package su.cin.rapvpn.fabric.fabricvpn;
import go.Seq;
// Fabricvpn is the static-only entry point of the fabricvpn binding. Its
// initializer loads the native library through Seq and runs the native
// _init(), which caches the JNI class and method handles for this package.
public abstract class Fabricvpn {
static {
Seq.touch(); // for loading the native library
_init();
}
private Fabricvpn() {} // uninstantiable
// touch is called from other bound packages to initialize this package
public static void touch() {}
private static native void _init();
// proxySocketProtector wraps a SocketProtector that is identified by a Go
// refnum; protect() is implemented natively.
private static final class proxySocketProtector implements Seq.Proxy, SocketProtector {
private final int refnum;
// Bumps the Go reference count for refnum and returns the refnum.
@Override public final int incRefnum() {
Seq.incGoRef(refnum, this);
return refnum;
}
// Stores the refnum and registers this proxy with Seq.trackGoRef, so the
// reference is released after the proxy is garbage collected.
proxySocketProtector(int refnum) { this.refnum = refnum; Seq.trackGoRef(refnum, this); }
public native boolean protect(long fd);
}
// Creates a new Manager on the Go side and returns its Java proxy.
public static native Manager newManager();
}
@@ -0,0 +1,58 @@
// Code generated by gobind. DO NOT EDIT.
// Java class su.cin.rapvpn.fabric.fabricvpn.Manager is a proxy for talking to a Go program.
//
// autogenerated by gobind -lang=java -javapkg=su.cin.rapvpn.fabric github.com/example/remote-access-platform/agents/rap-node-agent/mobile/fabricvpn
package su.cin.rapvpn.fabric.fabricvpn;
import go.Seq;
// Manager is the Java proxy of the Go Manager object. An instance only
// carries the Go refnum; every operation is a native call into Go.
public final class Manager implements Seq.Proxy {
    static { Fabricvpn.touch(); }

    // Go refnum of the proxied object.
    private final int refnum;

    // Creates a new Go Manager and registers this proxy for release once
    // it has been garbage collected.
    public Manager() {
        this.refnum = __NewManager();
        Seq.trackGoRef(refnum, this);
    }

    // Wraps an existing Go refnum; the JNI glue looks this constructor up
    // by its (int) signature.
    Manager(int refnum) {
        this.refnum = refnum;
        Seq.trackGoRef(refnum, this);
    }

    private static native int __NewManager();

    // Bumps the Go reference count for refnum and returns the refnum.
    @Override public final int incRefnum() {
        Seq.incGoRef(refnum, this);
        return refnum;
    }

    public native String controlRequest(String payloadJSON) throws Exception;
    public native byte[] receivePacket(long timeoutMillis) throws Exception;
    public native byte[] receivePacketBatchPayload(long timeoutMillis) throws Exception;
    public native void reconnect() throws Exception;
    public native void sendPacket(byte[] packet) throws Exception;
    public native void sendPacketBatchPayload(byte[] payload) throws Exception;
    public native void setSocketProtector(SocketProtector protector);
    public native String snapshotJSON();
    public native void start(String configJSON) throws Exception;
    public native void stop();
    public native void updateRuntimeConfig(String configJSON) throws Exception;

    // No state is compared: any other Manager instance is considered
    // equal, anything else (including null) is not.
    @Override public boolean equals(Object o) {
        return o instanceof Manager;
    }

    // Hashes an empty array, so every Manager shares one hash code, which
    // is consistent with equals.
    @Override public int hashCode() {
        final Object[] nothing = new Object[] {};
        return java.util.Arrays.hashCode(nothing);
    }

    @Override public String toString() {
        return "Manager{}";
    }
}
@@ -0,0 +1,14 @@
// Code generated by gobind. DO NOT EDIT.
// Java class su.cin.rapvpn.fabric.fabricvpn.SocketProtector is a proxy for talking to a Go program.
//
// autogenerated by gobind -lang=java -javapkg=su.cin.rapvpn.fabric github.com/example/remote-access-platform/agents/rap-node-agent/mobile/fabricvpn
package su.cin.rapvpn.fabric.fabricvpn;
import go.Seq;
// SocketProtector is a callback interface handed to Manager.setSocketProtector.
// Go reaches Java implementations through the generated
// cproxyfabricvpn_SocketProtector_Protect bridge.
public interface SocketProtector {
// Called with a file descriptor and returns a success flag.
// NOTE(review): presumably asks the platform to keep this socket outside the
// VPN tunnel (cf. Android VpnService.protect) — confirm against the Go caller.
public boolean protect(long fd);
}
@@ -0,0 +1,164 @@
// Code generated by gobind. DO NOT EDIT.
// JNI functions for the Go <=> Java bridge.
//
// autogenerated by gobind -lang=java -javapkg=su.cin.rapvpn.fabric github.com/example/remote-access-platform/agents/rap-node-agent/mobile/fabricvpn
#include <android/log.h>
#include <stdint.h>
#include "seq.h"
#include "_cgo_export.h"
#include "fabricvpn.h"
// Cached JNI handles, all filled in by Fabricvpn._init below.
// Global ref to Fabricvpn$proxySocketProtector and its (I)V constructor.
jclass proxy_class_fabricvpn_SocketProtector;
jmethodID proxy_class_fabricvpn_SocketProtector_cons;
// Method ID of SocketProtector.protect(long), used for Go -> Java calls.
static jmethodID mid_SocketProtector_Protect;
// Global ref to the Manager proxy class and its (I)V constructor.
jclass proxy_class_fabricvpn_Manager;
jmethodID proxy_class_fabricvpn_Manager_cons;
// Fabricvpn._init: looks up and caches the classes, constructors and method
// IDs that the bindings in this file use.
// NOTE(review): the FindClass/GetMethodID results are not NULL-checked here.
JNIEXPORT void JNICALL
Java_su_cin_rapvpn_fabric_fabricvpn_Fabricvpn__1init(JNIEnv *env, jclass _unused) {
jclass clazz;
// Manager proxy: keep a global ref to the class and cache its (int refnum) constructor.
clazz = (*env)->FindClass(env, "su/cin/rapvpn/fabric/fabricvpn/Manager");
proxy_class_fabricvpn_Manager = (*env)->NewGlobalRef(env, clazz);
proxy_class_fabricvpn_Manager_cons = (*env)->GetMethodID(env, clazz, "<init>", "(I)V");
// SocketProtector proxy (nested class of Fabricvpn): same pattern.
clazz = (*env)->FindClass(env, "su/cin/rapvpn/fabric/fabricvpn/Fabricvpn$proxySocketProtector");
proxy_class_fabricvpn_SocketProtector = (*env)->NewGlobalRef(env, clazz);
proxy_class_fabricvpn_SocketProtector_cons = (*env)->GetMethodID(env, clazz, "<init>", "(I)V");
// Interface method used when Go calls back into a Java SocketProtector.
clazz = (*env)->FindClass(env, "su/cin/rapvpn/fabric/fabricvpn/SocketProtector");
mid_SocketProtector_Protect = (*env)->GetMethodID(env, clazz, "protect", "(J)Z");
}
// Fabricvpn.newManager(): creates a Manager in Go and converts the returned
// refnum into a Java object via go_seq_from_refnum, using the cached Manager
// proxy class and constructor.
JNIEXPORT jobject JNICALL
Java_su_cin_rapvpn_fabric_fabricvpn_Fabricvpn_newManager(JNIEnv* env, jclass _clazz) {
int32_t r0 = proxyfabricvpn__NewManager();
jobject _r0 = go_seq_from_refnum(env, r0, proxy_class_fabricvpn_Manager, proxy_class_fabricvpn_Manager_cons);
return _r0;
}
// Manager.__NewManager(): creates a Manager in Go and returns its raw refnum;
// the public Manager() constructor stores it and registers it for tracking.
JNIEXPORT jint JNICALL
Java_su_cin_rapvpn_fabric_fabricvpn_Manager__1_1NewManager(JNIEnv *env, jclass clazz) {
int32_t refnum = proxyfabricvpn__NewManager();
return refnum;
}
// Manager.controlRequest(String): forwards the request to Go and returns the
// reply string; a non-null Go error is thrown as a Java exception.
JNIEXPORT jstring JNICALL
Java_su_cin_rapvpn_fabric_fabricvpn_Manager_controlRequest(JNIEnv* env, jobject __this__, jstring payloadJSON) {
// Resolve the receiver to its Go refnum (this goes through Seq.incGoObjectRef).
int32_t o = go_seq_to_refnum_go(env, __this__);
// Convert the Java string to a UTF-8 nstring for Go.
nstring _payloadJSON = go_seq_from_java_string(env, payloadJSON);
struct proxyfabricvpn_Manager_ControlRequest_return res = proxyfabricvpn_Manager_ControlRequest(o, _payloadJSON);
// go_seq_to_java_string also frees the buffer returned by Go.
jstring _r0 = go_seq_to_java_string(env, res.r0);
// r1 is the error refnum; go_seq_from_refnum returns NULL for the null refnum.
jobject _r1 = go_seq_from_refnum(env, res.r1, proxy_class__error, proxy_class__error_cons);
go_seq_maybe_throw_exception(env, _r1);
return _r0;
}
// Manager.receivePacket(long): asks Go for one packet with the given timeout
// and returns it as a new byte[] (NULL when Go returns no buffer); a non-null
// Go error is thrown as a Java exception.
JNIEXPORT jbyteArray JNICALL
Java_su_cin_rapvpn_fabric_fabricvpn_Manager_receivePacket(JNIEnv* env, jobject __this__, jlong timeoutMillis) {
int32_t o = go_seq_to_refnum_go(env, __this__);
nint _timeoutMillis = (nint)timeoutMillis;
struct proxyfabricvpn_Manager_ReceivePacket_return res = proxyfabricvpn_Manager_ReceivePacket(o, _timeoutMillis);
// copy=1: the Go-provided buffer is freed after its bytes are copied into the Java array.
jbyteArray _r0 = go_seq_to_java_bytearray(env, res.r0, 1);
jobject _r1 = go_seq_from_refnum(env, res.r1, proxy_class__error, proxy_class__error_cons);
go_seq_maybe_throw_exception(env, _r1);
return _r0;
}
// Manager.receivePacketBatchPayload(long): same shape as receivePacket, but
// calls the batch variant on the Go side. The layout of the returned payload
// is defined by Go and not visible here.
JNIEXPORT jbyteArray JNICALL
Java_su_cin_rapvpn_fabric_fabricvpn_Manager_receivePacketBatchPayload(JNIEnv* env, jobject __this__, jlong timeoutMillis) {
int32_t o = go_seq_to_refnum_go(env, __this__);
nint _timeoutMillis = (nint)timeoutMillis;
struct proxyfabricvpn_Manager_ReceivePacketBatchPayload_return res = proxyfabricvpn_Manager_ReceivePacketBatchPayload(o, _timeoutMillis);
// copy=1: the Go-provided buffer is freed after its bytes are copied into the Java array.
jbyteArray _r0 = go_seq_to_java_bytearray(env, res.r0, 1);
jobject _r1 = go_seq_from_refnum(env, res.r1, proxy_class__error, proxy_class__error_cons);
go_seq_maybe_throw_exception(env, _r1);
return _r0;
}
// Manager.reconnect(): calls the Go method and throws the returned error, if any.
JNIEXPORT void JNICALL
Java_su_cin_rapvpn_fabric_fabricvpn_Manager_reconnect(JNIEnv* env, jobject __this__) {
int32_t o = go_seq_to_refnum_go(env, __this__);
// r0 is the refnum of the Go error; the null refnum means success.
int32_t r0 = proxyfabricvpn_Manager_Reconnect(o);
jobject _r0 = go_seq_from_refnum(env, r0, proxy_class__error, proxy_class__error_cons);
go_seq_maybe_throw_exception(env, _r0);
}
// Manager.sendPacket(byte[]): hands the packet bytes to Go and throws the
// returned error, if any.
JNIEXPORT void JNICALL
Java_su_cin_rapvpn_fabric_fabricvpn_Manager_sendPacket(JNIEnv* env, jobject __this__, jbyteArray packet) {
int32_t o = go_seq_to_refnum_go(env, __this__);
// copy=0: Go receives the pointer obtained from GetByteArrayElements directly.
nbyteslice _packet = go_seq_from_java_bytearray(env, packet, 0);
int32_t r0 = proxyfabricvpn_Manager_SendPacket(o, _packet);
// Give the array elements back only after the Go call has returned.
go_seq_release_byte_array(env, packet, _packet.ptr);
jobject _r0 = go_seq_from_refnum(env, r0, proxy_class__error, proxy_class__error_cons);
go_seq_maybe_throw_exception(env, _r0);
}
// Manager.sendPacketBatchPayload(byte[]): same shape as sendPacket, but calls
// the batch variant on the Go side. The payload layout is defined by Go and
// not visible here.
JNIEXPORT void JNICALL
Java_su_cin_rapvpn_fabric_fabricvpn_Manager_sendPacketBatchPayload(JNIEnv* env, jobject __this__, jbyteArray payload) {
int32_t o = go_seq_to_refnum_go(env, __this__);
// copy=0: Go receives the pointer obtained from GetByteArrayElements directly.
nbyteslice _payload = go_seq_from_java_bytearray(env, payload, 0);
int32_t r0 = proxyfabricvpn_Manager_SendPacketBatchPayload(o, _payload);
// Give the array elements back only after the Go call has returned.
go_seq_release_byte_array(env, payload, _payload.ptr);
jobject _r0 = go_seq_from_refnum(env, r0, proxy_class__error, proxy_class__error_cons);
go_seq_maybe_throw_exception(env, _r0);
}
// Manager.setSocketProtector(SocketProtector): passes the protector to Go by refnum.
JNIEXPORT void JNICALL
Java_su_cin_rapvpn_fabric_fabricvpn_Manager_setSocketProtector(JNIEnv* env, jobject __this__, jobject protector) {
int32_t o = go_seq_to_refnum_go(env, __this__);
// go_seq_to_refnum goes through Seq.incRef, which registers a plain Java
// object under a Java refnum (or yields the null refnum for NULL).
int32_t _protector = go_seq_to_refnum(env, protector);
proxyfabricvpn_Manager_SetSocketProtector(o, _protector);
}
// Manager.snapshotJSON(): returns the string produced by the Go method.
JNIEXPORT jstring JNICALL
Java_su_cin_rapvpn_fabric_fabricvpn_Manager_snapshotJSON(JNIEnv* env, jobject __this__) {
int32_t o = go_seq_to_refnum_go(env, __this__);
nstring r0 = proxyfabricvpn_Manager_SnapshotJSON(o);
// go_seq_to_java_string also frees the buffer returned by Go.
jstring _r0 = go_seq_to_java_string(env, r0);
return _r0;
}
// Manager.start(String): passes the configuration string to Go and throws the
// returned error, if any.
JNIEXPORT void JNICALL
Java_su_cin_rapvpn_fabric_fabricvpn_Manager_start(JNIEnv* env, jobject __this__, jstring configJSON) {
int32_t o = go_seq_to_refnum_go(env, __this__);
// Convert the Java string to a UTF-8 nstring for Go.
nstring _configJSON = go_seq_from_java_string(env, configJSON);
int32_t r0 = proxyfabricvpn_Manager_Start(o, _configJSON);
jobject _r0 = go_seq_from_refnum(env, r0, proxy_class__error, proxy_class__error_cons);
go_seq_maybe_throw_exception(env, _r0);
}
// Manager.stop(): calls the Go method; there is no result and no error to report.
JNIEXPORT void JNICALL
Java_su_cin_rapvpn_fabric_fabricvpn_Manager_stop(JNIEnv* env, jobject __this__) {
int32_t o = go_seq_to_refnum_go(env, __this__);
proxyfabricvpn_Manager_Stop(o);
}
// Manager.updateRuntimeConfig(String): passes the configuration string to Go
// and throws the returned error, if any.
JNIEXPORT void JNICALL
Java_su_cin_rapvpn_fabric_fabricvpn_Manager_updateRuntimeConfig(JNIEnv* env, jobject __this__, jstring configJSON) {
int32_t o = go_seq_to_refnum_go(env, __this__);
// Convert the Java string to a UTF-8 nstring for Go.
nstring _configJSON = go_seq_from_java_string(env, configJSON);
int32_t r0 = proxyfabricvpn_Manager_UpdateRuntimeConfig(o, _configJSON);
jobject _r0 = go_seq_from_refnum(env, r0, proxy_class__error, proxy_class__error_cons);
go_seq_maybe_throw_exception(env, _r0);
}
// Fabricvpn.proxySocketProtector.protect(long): Java -> Go direction. Calls the
// Go SocketProtector behind this proxy and maps its result to a jboolean.
JNIEXPORT jboolean JNICALL
Java_su_cin_rapvpn_fabric_fabricvpn_Fabricvpn_00024proxySocketProtector_protect(JNIEnv* env, jobject __this__, jlong fd) {
int32_t o = go_seq_to_refnum_go(env, __this__);
int64_t _fd = (int64_t)fd;
char r0 = proxyfabricvpn_SocketProtector_Protect(o, _fd);
// Any non-zero result counts as true.
jboolean _r0 = r0 ? JNI_TRUE : JNI_FALSE;
return _r0;
}
// Go -> Java direction: invokes SocketProtector.protect(fd) on the Java object
// identified by refnum and returns the boolean result as a char.
// NOTE(review): a Java exception thrown by protect() is neither checked nor
// cleared in this function.
char cproxyfabricvpn_SocketProtector_Protect(int32_t refnum, int64_t fd) {
// Get this thread's JNIEnv and open a local reference frame (one argument).
JNIEnv *env = go_seq_push_local_frame(1);
jobject o = go_seq_from_refnum(env, refnum, proxy_class_fabricvpn_SocketProtector, proxy_class_fabricvpn_SocketProtector_cons);
jlong _fd = (jlong)fd;
jboolean res = (*env)->CallBooleanMethod(env, o, mid_SocketProtector_Protect, _fd);
char _res = (char)res;
// Drop every local reference created since the push.
go_seq_pop_local_frame(env);
return _res;
}
@@ -0,0 +1,19 @@
// Code generated by gobind. DO NOT EDIT.
// JNI function headers for the Go <=> Java bridge.
//
// autogenerated by gobind -lang=java -javapkg=su.cin.rapvpn.fabric github.com/example/remote-access-platform/agents/rap-node-agent/mobile/fabricvpn
// Include guard for the generated fabricvpn JNI declarations.
#ifndef __Fabricvpn_H__
#define __Fabricvpn_H__
#include <jni.h>
// Class and (I)V constructor of the SocketProtector proxy; defined and
// initialized in the generated JNI source.
extern jclass proxy_class_fabricvpn_SocketProtector;
extern jmethodID proxy_class_fabricvpn_SocketProtector_cons;
// Calls SocketProtector.protect(fd) on the Java object identified by refnum.
char cproxyfabricvpn_SocketProtector_Protect(int32_t refnum, int64_t fd);
// Class and (I)V constructor of the Manager proxy.
extern jclass proxy_class_fabricvpn_Manager;
extern jmethodID proxy_class_fabricvpn_Manager_cons;
#endif
@@ -0,0 +1,401 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// C support functions for bindings. This file is copied into the
// generated gomobile_bind package and compiled along with the
// generated binding files.
#include <android/log.h>
#include <errno.h>
#include <jni.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include "seq.h"
#include "_cgo_export.h"
// Refnum that represents a null object; Seq.java declares the same value.
#define NULL_REFNUM 41
// initClasses are only exported from Go if reverse bindings are used.
// If they're not, weakly define a no-op function.
__attribute__((weak)) void initClasses(void) { }
// Process-wide JavaVM handle, stored by Java_go_Seq_init through GetJavaVM.
static JavaVM *jvm;
// jnienvs holds the per-thread JNIEnv* for Go threads where we called AttachCurrentThread.
// A pthread key destructor is supplied to call DetachCurrentThread on exit. This trick is
// documented in http://developer.android.com/training/articles/perf-jni.html under "Threads".
static pthread_key_t jnienvs;
// Global ref to go.Seq and the IDs of its static methods; all assigned in
// Java_go_Seq_init.
static jclass seq_class;
static jmethodID seq_getRef;
static jmethodID seq_decRef;
static jmethodID seq_incRef;
static jmethodID seq_incGoObjectRef;
static jmethodID seq_incRefnum;
// Field ID of Seq.Ref.obj, assigned in Java_go_Seq_init.
static jfieldID ref_objField;
// NOTE(review): not assigned or read in the visible part of this file — possibly unused; confirm.
static jclass throwable_class;
// env_destructor is registered as a thread data key destructor to
// clean up a Go thread that is attached to the JVM.
static void env_destructor(void *env) {
if ((*jvm)->DetachCurrentThread(jvm) != JNI_OK) {
LOG_INFO("failed to detach current thread");
}
}
// go_seq_get_thread_env returns the JNIEnv of the calling thread. A thread
// that is not yet attached to the JVM is attached first, and its env is
// stored under the jnienvs key so that env_destructor detaches it on exit.
static JNIEnv *go_seq_get_thread_env(void) {
    JNIEnv *env;
    jint status = (*jvm)->GetEnv(jvm, (void **)&env, JNI_VERSION_1_6);
    if (status == JNI_OK) {
        // Already attached: nothing to register.
        return env;
    }
    if (status != JNI_EDETACHED) {
        LOG_FATAL("failed to get thread env");
    }
    if ((*jvm)->AttachCurrentThread(jvm, &env, NULL) != JNI_OK) {
        LOG_FATAL("failed to attach current thread");
    }
    pthread_setspecific(jnienvs, env);
    return env;
}
// go_seq_maybe_throw_exception raises exc as the pending Java exception;
// a NULL exc means "no error" and nothing happens.
void go_seq_maybe_throw_exception(JNIEnv *env, jobject exc) {
    if (exc == NULL) {
        return;
    }
    (*env)->Throw(env, exc);
}
// go_seq_get_exception returns the pending Java exception and clears it, or
// returns NULL when no exception is pending.
jobject go_seq_get_exception(JNIEnv *env) {
    jthrowable pending = (*env)->ExceptionOccurred(env);
    if (pending) {
        (*env)->ExceptionClear(env);
        return pending;
    }
    return NULL;
}
// go_seq_to_java_bytearray copies the bytes of s into a newly allocated Java
// byte[]. A NULL s.ptr yields NULL. When copy is non-zero, s.ptr is freed
// after its contents have been transferred.
jbyteArray go_seq_to_java_bytearray(JNIEnv *env, nbyteslice s, int copy) {
    if (s.ptr == NULL) {
        return NULL;
    }

    jbyteArray arr = (*env)->NewByteArray(env, s.len);
    if (arr == NULL) {
        LOG_FATAL("NewByteArray failed");
    }
    (*env)->SetByteArrayRegion(env, arr, 0, s.len, s.ptr);

    if (copy) {
        free(s.ptr);
    }
    return arr;
}
// UTF-16 surrogate boundaries: [surr1, surr2) are high surrogates,
// [surr2, surr3) are low surrogates.
#define surr1 0xd800
#define surr2 0xdc00
#define surr3 0xe000

// Unicode replacement character
#define replacementChar 0xFFFD

// Largest rune that fits in a 1-, 2- and 3-byte UTF-8 sequence.
#define rune1Max ((1<<7) - 1)
#define rune2Max ((1<<11) - 1)
#define rune3Max ((1<<16) - 1)

// Maximum valid Unicode code point.
#define MaxRune 0x0010FFFF

// Code points in this range are surrogates and not valid runes.
#define surrogateMin 0xD800
#define surrogateMax 0xDFFF

// 0011 1111 - payload bits of a continuation byte
#define maskx 0x3F
// 1000 0000 - continuation byte marker
#define tx 0x80
// 1100 0000 - lead byte of a 2-byte sequence
#define t2 0xC0
// 1110 0000 - lead byte of a 3-byte sequence
#define t3 0xE0
// 1111 0000 - lead byte of a 4-byte sequence
#define t4 0xF0

// encode_rune stores the UTF-8 encoding of rune r at p and returns how many
// bytes (1 to 4) were written; p must have room for them. Runes above
// MaxRune and surrogate code points are encoded as the replacement character.
static int encode_rune(uint8_t *p, uint32_t r) {
    if (r <= rune1Max) {
        p[0] = (uint8_t)r;
        return 1;
    }
    if (r <= rune2Max) {
        p[0] = t2 | (uint8_t)(r>>6);
        p[1] = tx | (((uint8_t)(r))&maskx);
        return 2;
    }

    // Substitute anything that is not a valid code point.
    if (r > MaxRune || (surrogateMin <= r && r <= surrogateMax)) {
        r = replacementChar;
    }

    if (r > rune3Max) {
        p[0] = t4 | (uint8_t)(r>>18);
        p[1] = tx | (((uint8_t)(r>>12))&maskx);
        p[2] = tx | (((uint8_t)(r>>6))&maskx);
        p[3] = tx | (((uint8_t)(r))&maskx);
        return 4;
    }

    p[0] = t3 | (uint8_t)(r>>12);
    p[1] = tx | (((uint8_t)(r>>6))&maskx);
    p[2] = tx | (((uint8_t)(r))&maskx);
    return 3;
}
// utf16_decode converts len UTF-16 code units into a freshly malloc'ed,
// UTF-8 encoded nstring. A high surrogate followed by a low surrogate is
// combined into one rune; an unpaired surrogate is passed to encode_rune,
// which replaces it. The logic follows Go's unicode/utf8 and unicode/utf16
// packages.
static nstring utf16_decode(jchar *chars, jsize len) {
    // The output buffer is sized at four bytes per input code unit.
    jsize capacity = 4*len;
    uint8_t *out = malloc(capacity);
    if (out == NULL) {
        LOG_FATAL("utf16Decode: malloc failed");
    }

    jsize written = 0;
    for (jsize i = 0; i < len; ) {
        uint32_t rune = chars[i++];
        int high = surr1 <= rune && rune < surr2;
        if (high && i < len) {
            uint32_t low = chars[i];
            if (surr2 <= low && low < surr3) {
                // Valid pair: consume the low half and rebuild the code point.
                i++;
                rune = (((rune-surr1)<<10) | (low - surr2)) + 0x10000;
            }
        }
        if (written + 4 > capacity) {
            LOG_FATAL("utf16Decode: buffer overflow");
        }
        written += encode_rune(out + written, rune);
    }

    struct nstring res = {.chars = out, .len = written};
    return res;
}
// go_seq_from_java_string converts a Java string into a UTF-8 nstring whose
// buffer is allocated by utf16_decode. NULL and empty strings both yield
// {NULL, 0}.
nstring go_seq_from_java_string(JNIEnv *env, jstring str) {
    struct nstring empty = {NULL, 0};
    if (str == NULL) {
        return empty;
    }
    jsize units = (*env)->GetStringLength(env, str);
    if (units == 0) {
        return empty;
    }

    jchar *utf16 = (jchar *)(*env)->GetStringChars(env, str, NULL);
    if (utf16 == NULL) {
        LOG_FATAL("GetStringChars failed");
    }
    nstring utf8 = utf16_decode(utf16, units);
    // The decoded copy is independent of the JVM's characters.
    (*env)->ReleaseStringChars(env, str, utf16);
    return utf8;
}
// go_seq_from_java_bytearray exposes the contents of a Java byte[] as an
// nbyteslice. NULL and empty arrays yield {NULL, 0}. With copy set, the bytes
// are duplicated into malloc'ed memory and the JVM elements are released
// right away; otherwise the pointer obtained from GetByteArrayElements is
// returned and stays unreleased in this function.
nbyteslice go_seq_from_java_bytearray(JNIEnv *env, jbyteArray arr, int copy) {
    struct nbyteslice out = {NULL, 0};
    if (arr == NULL) {
        return out;
    }
    jsize n = (*env)->GetArrayLength(env, arr);
    if (n == 0) {
        return out;
    }

    jbyte *elems = (*env)->GetByteArrayElements(env, arr, NULL);
    if (elems == NULL) {
        LOG_FATAL("GetByteArrayElements failed");
    }
    if (copy) {
        void *owned = (void *)malloc(n);
        if (owned == NULL) {
            LOG_FATAL("malloc failed");
        }
        memcpy(owned, elems, n);
        // JNI_ABORT: nothing was modified, so nothing needs copying back.
        (*env)->ReleaseByteArrayElements(env, arr, elems, JNI_ABORT);
        elems = (jbyte *)owned;
    }

    out.ptr = elems;
    out.len = n;
    return out;
}
// go_seq_to_refnum_go returns the refnum of a GoObject by calling
// Seq.incGoObjectRef on it; a NULL object maps to NULL_REFNUM.
int32_t go_seq_to_refnum_go(JNIEnv *env, jobject o) {
    return o == NULL
        ? NULL_REFNUM
        : (int32_t)(*env)->CallStaticIntMethod(env, seq_class, seq_incGoObjectRef, o);
}
// go_seq_to_refnum returns the refnum of an arbitrary Java object by calling
// Seq.incRef on it; a NULL object maps to NULL_REFNUM.
int32_t go_seq_to_refnum(JNIEnv *env, jobject o) {
    return o == NULL
        ? NULL_REFNUM
        : (int32_t)(*env)->CallStaticIntMethod(env, seq_class, seq_incRef, o);
}
// go_seq_unwrap resolves refnum to its Java object and then asks that object,
// as a GoObject, for its own refnum. The work happens inside a temporary JNI
// local frame.
int32_t go_seq_unwrap(jint refnum) {
    JNIEnv *env = go_seq_push_local_frame(0);
    jobject wrapped = go_seq_from_refnum(env, refnum, NULL, NULL);
    int32_t unwrapped = go_seq_to_refnum_go(env, wrapped);
    go_seq_pop_local_frame(env);
    return unwrapped;
}
// go_seq_from_refnum turns a refnum into a Java object:
//   - NULL_REFNUM gives NULL;
//   - a negative (Go) refnum gives a new proxy built with proxy_cons;
//   - a positive (Java) refnum gives the object stored in its Seq.Ref.
jobject go_seq_from_refnum(JNIEnv *env, int32_t refnum, jclass proxy_class, jmethodID proxy_cons) {
    if (refnum == NULL_REFNUM) {
        return NULL;
    }
    if (refnum < 0) {
        // Go object: return new <Proxy>(refnum)
        return (*env)->NewObject(env, proxy_class, proxy_cons, refnum);
    }

    // Seq.Ref holder = Seq.getRef(refnum)
    jint javaRefnum = (jint)refnum;
    jobject holder = (*env)->CallStaticObjectMethod(env, seq_class, seq_getRef, javaRefnum);
    if (holder == NULL) {
        LOG_FATAL("Unknown reference: %d", refnum);
    }
    // Go bumped the count right before handing over the refnum; undo that here.
    (*env)->CallStaticVoidMethod(env, seq_class, seq_decRef, javaRefnum);
    // return holder.obj
    return (*env)->GetObjectField(env, holder, ref_objField);
}
// go_seq_to_java_string builds a jstring from str and frees str.chars.
// NewString is given str.len/2 characters.
// NOTE(review): that implies str.len is a byte count of UTF-16 data —
// confirm against the Go side that produces it.
jstring go_seq_to_java_string(JNIEnv *env, nstring str) {
    jstring result = (*env)->NewString(env, str.chars, str.len/2);
    if (str.chars == NULL) {
        return result;
    }
    free(str.chars);
    return result;
}
// go_seq_push_local_frame fetches (or creates) the JNIEnv* of the current
// thread and opens a JNI local reference frame on it. Every call must be
// paired with go_seq_pop_local_frame.
JNIEnv *go_seq_push_local_frame(jint nargs) {
    JNIEnv *env = go_seq_get_thread_env();
    // Conservative frame size: two slots per argument (Seq.Ref and
    // Seq.Object) plus room for the receiver, the return value and a
    // possible exception.
    const jint slots = 2*nargs + 10;
    jint rc = (*env)->PushLocalFrame(env, slots);
    if (rc < 0) {
        LOG_FATAL("PushLocalFrame failed");
    }
    return env;
}
// Pop the current local frame, freeing all JNI local references in it
// Counterpart of go_seq_push_local_frame; no reference is carried over to
// the previous frame (result argument is NULL).
void go_seq_pop_local_frame(JNIEnv *env) {
(*env)->PopLocalFrame(env, NULL);
}
// go_seq_inc_ref calls Seq.incRefnum(ref) on the current thread's JNIEnv,
// attaching the thread to the JVM first if necessary.
void go_seq_inc_ref(int32_t ref) {
JNIEnv *env = go_seq_get_thread_env();
(*env)->CallStaticVoidMethod(env, seq_class, seq_incRefnum, (jint)ref);
}
// go_seq_dec_ref calls Seq.decRef(ref) on the current thread's JNIEnv,
// attaching the thread to the JVM first if necessary.
void go_seq_dec_ref(int32_t ref) {
JNIEnv *env = go_seq_get_thread_env();
(*env)->CallStaticVoidMethod(env, seq_class, seq_decRef, (jint)ref);
}
JNIEXPORT void JNICALL
Java_go_Seq_init(JNIEnv *env, jclass clazz) {
if ((*env)->GetJavaVM(env, &jvm) != 0) {
LOG_FATAL("failed to get JVM");
}
if (pthread_key_create(&jnienvs, env_destructor) != 0) {
LOG_FATAL("failed to initialize jnienvs thread local storage");
}
seq_class = (*env)->NewGlobalRef(env, clazz);
seq_getRef = (*env)->GetStaticMethodID(env, seq_class, "getRef", "(I)Lgo/Seq$Ref;");
if (seq_getRef == NULL) {
LOG_FATAL("failed to find method Seq.getRef");
}
seq_decRef = (*env)->GetStaticMethodID(env, seq_class, "decRef", "(I)V");
if (seq_decRef == NULL) {
LOG_FATAL("failed to find method Seq.decRef");
}
seq_incRefnum = (*env)->GetStaticMethodID(env, seq_class, "incRefnum", "(I)V");
if (seq_incRefnum == NULL) {
LOG_FATAL("failed to find method Seq.incRefnum");
}
seq_incRef = (*env)->GetStaticMethodID(env, seq_class, "incRef", "(Ljava/lang/Object;)I");
if (seq_incRef == NULL) {
LOG_FATAL("failed to find method Seq.incRef");
}
seq_incGoObjectRef = (*env)->GetStaticMethodID(env, seq_class, "incGoObjectRef", "(Lgo/Seq$GoObject;)I");
if (seq_incGoObjectRef == NULL) {
LOG_FATAL("failed to find method Seq.incGoObjectRef");
}
jclass ref_class = (*env)->FindClass(env, "go/Seq$Ref");
if (ref_class == NULL) {
LOG_FATAL("failed to find the Seq.Ref class");
}
ref_objField = (*env)->GetFieldID(env, ref_class, "obj", "Ljava/lang/Object;");
if (ref_objField == NULL) {
LOG_FATAL("failed to find the Seq.Ref.obj field");
}
initClasses();
}
// Java_go_Seq_destroyRef is the native implementation of go.Seq.destroyRef.
// It forwards the refnum to the Go function DestroyRef.
JNIEXPORT void JNICALL
Java_go_Seq_destroyRef(JNIEnv *env, jclass clazz, jint refnum) {
	// The JNI environment and class are not needed for the hand-off to Go.
	(void)env;
	(void)clazz;
	DestroyRef(refnum);
}
// Java_go_Seq_incGoRef is the native implementation of go.Seq.incGoRef.
// It forwards the refnum to the Go function IncGoRef; the ref object argument
// is not used on the native side.
JNIEXPORT void JNICALL
Java_go_Seq_incGoRef(JNIEnv *env, jclass clazz, jint refnum, jobject ref) {
	(void)env;
	(void)clazz;
	(void)ref;
	IncGoRef(refnum);
}
// go_seq_find_class returns a global reference to the class called name.
// When the class cannot be found the pending exception is cleared and NULL is
// returned.
jclass go_seq_find_class(const char *name) {
	JNIEnv *env = go_seq_push_local_frame(0);
	jclass global = NULL;
	jclass local = (*env)->FindClass(env, name);
	if (local != NULL) {
		// Promote to a global reference so it outlives the local frame popped below.
		global = (*env)->NewGlobalRef(env, local);
	} else {
		(*env)->ExceptionClear(env);
	}
	go_seq_pop_local_frame(env);
	return global;
}
// go_seq_get_static_method_id looks up the static method name with signature
// sig on clazz. On failure the exception state is cleared and NULL is returned.
jmethodID go_seq_get_static_method_id(jclass clazz, const char *name, const char *sig) {
	JNIEnv *env = go_seq_push_local_frame(0);
	jmethodID method = (*env)->GetStaticMethodID(env, clazz, name, sig);
	if (!method) {
		// Callers only see the NULL result; drop the exception from the failed lookup.
		(*env)->ExceptionClear(env);
	}
	go_seq_pop_local_frame(env);
	return method;
}
// go_seq_get_method_id looks up the instance method name with signature sig
// on clazz. On failure the exception state is cleared and NULL is returned.
jmethodID go_seq_get_method_id(jclass clazz, const char *name, const char *sig) {
	JNIEnv *env = go_seq_push_local_frame(0);
	jmethodID method = (*env)->GetMethodID(env, clazz, name, sig);
	if (!method) {
		// Callers only see the NULL result; drop the exception from the failed lookup.
		(*env)->ExceptionClear(env);
	}
	go_seq_pop_local_frame(env);
	return method;
}
// go_seq_release_byte_array releases the elements pointer ptr previously
// obtained for arr. A NULL ptr is ignored.
void go_seq_release_byte_array(JNIEnv *env, jbyteArray arr, jbyte* ptr) {
	if (ptr == NULL) {
		return;
	}
	// Release mode 0 is passed to ReleaseByteArrayElements.
	(*env)->ReleaseByteArrayElements(env, arr, ptr, 0);
}
// go_seq_isinstanceof reports whether the object referenced by refnum is an
// instance of clazz. The object is resolved with go_seq_from_refnum without a
// proxy class, inside a temporary local frame.
int go_seq_isinstanceof(jint refnum, jclass clazz) {
	JNIEnv *env = go_seq_push_local_frame(0);
	jobject target = go_seq_from_refnum(env, refnum, NULL, NULL);
	int result = (*env)->IsInstanceOf(env, target, clazz);
	go_seq_pop_local_frame(env);
	return result;
}
@@ -0,0 +1,98 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package main
// Go support functions for bindings. This file is copied into the
// generated main package and compiled along with the generated binding
// files.
//#cgo CFLAGS: -Werror
//#cgo LDFLAGS: -llog
//#include <jni.h>
//#include <stdint.h>
//#include <stdlib.h>
//#include "seq_android.h"
import "C"
import (
"unsafe"
"golang.org/x/mobile/bind/seq"
)
// DestroyRef is called by Java to tell Go that it is done with a reference.
// The refnum is handed to seq.Delete.
//
//export DestroyRef
func DestroyRef(refnum C.int32_t) {
	goRef := int32(refnum)
	seq.Delete(goRef)
}
// encodeString returns a copy of a Go string as a UTF16 encoded nstring.
// The buffer is allocated with C.malloc; the returned data is freed in
// go_seq_to_java_string. The empty string yields a zero nstring without
// allocating.
//
// UTF16 is used as the intermediate format. UTF8 would be the obvious
// alternative, but JNI only supports a C-safe variant of UTF8 (modified UTF8).
func encodeString(s string) C.nstring {
	if C.int(len(s)) == 0 {
		return C.nstring{}
	}
	// Worst case estimate: reserve four output bytes for every input byte.
	bufBytes := 4 * len(s)
	buf := C.malloc(C.size_t(bufBytes))
	if buf == nil {
		panic("encodeString: malloc failed")
	}
	// View the C buffer as a []uint16 whose length and capacity are both
	// capped at the allocation size.
	units := bufBytes / 2
	dst := (*[1<<30 - 1]uint16)(unsafe.Pointer(buf))[:units:units]
	written := seq.UTF16Encode(s, dst)
	// nstring.len is expressed in bytes, hence two bytes per uint16 written.
	return C.nstring{chars: unsafe.Pointer(buf), len: C.jsize(written * 2)}
}
// decodeString decodes a UTF8 encoded nstring to a Go string. The bytes are
// copied into the Go string and the C buffer in str is freed afterwards.
// A nil buffer decodes to the empty string.
func decodeString(str C.nstring) string {
	if str.chars == nil {
		return ""
	}
	// The string conversion below copies the bytes, so releasing the buffer
	// on return is safe.
	defer C.free(str.chars)
	raw := (*[1<<31 - 1]byte)(str.chars)[:str.len]
	return string(raw)
}
// fromSlice describes a Go byte slice as a nbyteslice. A nil or empty slice
// yields a zero nbyteslice. If cpy is set, the data is copied into a buffer
// allocated with C.malloc (to be freed by go_seq_to_java_bytearray);
// otherwise the result points directly at the slice's backing array.
func fromSlice(s []byte, cpy bool) C.nbyteslice {
	// len of a nil slice is 0, so this covers both nil and empty input.
	if len(s) == 0 {
		return C.nbyteslice{}
	}
	size := C.jsize(len(s))
	if !cpy {
		direct := (*C.jbyte)(unsafe.Pointer(&s[0]))
		return C.nbyteslice{ptr: unsafe.Pointer(direct), len: size}
	}
	buf := (*C.jbyte)(C.malloc(C.size_t(size)))
	if buf == nil {
		panic("fromSlice: malloc failed")
	}
	dst := (*[1<<31 - 1]byte)(unsafe.Pointer(buf))[:size]
	copy(dst, s)
	return C.nbyteslice{ptr: unsafe.Pointer(buf), len: size}
}
// toSlice takes a nbyteslice (jbyteArray) and returns a byte slice with the
// data. A nil pointer or zero length yields nil. If cpy is set, the result is
// a Go-owned copy and the C buffer is freed; otherwise the result aliases the
// memory behind s.ptr.
func toSlice(s C.nbyteslice, cpy bool) []byte {
	if s.ptr == nil || s.len == 0 {
		return nil
	}
	if !cpy {
		// Length and capacity are both capped at s.len.
		return (*[1<<31 - 1]byte)(unsafe.Pointer(s.ptr))[:s.len:s.len]
	}
	// C.GoBytes copies the data, so the source buffer can be released on return.
	defer C.free(s.ptr)
	return C.GoBytes(s.ptr, C.int(s.len))
}
@@ -0,0 +1,67 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef __GO_SEQ_ANDROID_HDR__
#define __GO_SEQ_ANDROID_HDR__
#include <stdint.h>
#include <android/log.h>
// For abort()
#include <stdlib.h>
#include <jni.h>
// LOG_INFO writes an info-level message to the Android log under the
// "go/Seq" tag. Arguments are a printf-style format string and its values.
#define LOG_INFO(...) __android_log_print(ANDROID_LOG_INFO, "go/Seq", __VA_ARGS__)
// LOG_FATAL writes a fatal-level message to the Android log under the
// "go/Seq" tag and then calls abort().
// NOTE(review): the macro expands to a bare brace block rather than
// do { ... } while (0), so `if (c) LOG_FATAL("..."); else ...` does not
// compile. Use it only inside braced bodies.
#define LOG_FATAL(...) \
{ \
__android_log_print(ANDROID_LOG_FATAL, "go/Seq", __VA_ARGS__); \
abort(); \
}
// Platform specific types
// nstring carries string data across the Go <=> Java boundary.
typedef struct nstring {
// UTF16 or UTF8 encoded string. When converting from a Go string to a Java
// string, UTF16 (go_seq_to_java_string passes len/2 UTF-16 units to
// NewString). When converting from Java to Go, UTF8.
void *chars;
// length in bytes, regardless of encoding
jsize len;
} nstring;
// nbyteslice carries a byte buffer across the Go <=> Java boundary.
typedef struct nbyteslice {
// start of the byte data; NULL for an empty slice
void *ptr;
// number of bytes at ptr
jsize len;
} nbyteslice;
typedef jlong nint;
extern void go_seq_dec_ref(int32_t ref);
extern void go_seq_inc_ref(int32_t ref);
// go_seq_unwrap takes a reference number to a Java wrapper and returns
// a reference number to its wrapped Go object.
extern int32_t go_seq_unwrap(jint refnum);
extern int32_t go_seq_to_refnum(JNIEnv *env, jobject o);
extern int32_t go_seq_to_refnum_go(JNIEnv *env, jobject o);
extern jobject go_seq_from_refnum(JNIEnv *env, int32_t refnum, jclass proxy_class, jmethodID proxy_cons);
extern void go_seq_maybe_throw_exception(JNIEnv *env, jobject msg);
// go_seq_get_exception returns any pending exception and clears the exception status.
extern jobject go_seq_get_exception(JNIEnv *env);
extern jbyteArray go_seq_to_java_bytearray(JNIEnv *env, nbyteslice s, int copy);
extern nbyteslice go_seq_from_java_bytearray(JNIEnv *env, jbyteArray s, int copy);
extern void go_seq_release_byte_array(JNIEnv *env, jbyteArray arr, jbyte* ptr);
extern jstring go_seq_to_java_string(JNIEnv *env, nstring str);
extern nstring go_seq_from_java_string(JNIEnv *env, jstring s);
// push_local_frame retrieves or creates the JNIEnv* for the current thread
// and pushes a JNI reference frame. Must be matched with call to pop_local_frame.
extern JNIEnv *go_seq_push_local_frame(jint cap);
// Pop the current local frame, releasing all JNI local references in it
extern void go_seq_pop_local_frame(JNIEnv *env);
// Return a global reference to the given class. Return NULL and clear exception if not found.
extern jclass go_seq_find_class(const char *name);
extern jmethodID go_seq_get_static_method_id(jclass clazz, const char *name, const char *sig);
extern jmethodID go_seq_get_method_id(jclass clazz, const char *name, const char *sig);
extern int go_seq_isinstanceof(jint refnum, jclass clazz);
#endif // __GO_SEQ_ANDROID_HDR__
@@ -0,0 +1,43 @@
// Code generated by gobind. DO NOT EDIT.
// JNI functions for the Go <=> Java bridge.
//
// autogenerated by gobind -lang=java -javapkg=su.cin.rapvpn.fabric
#include <android/log.h>
#include <stdint.h>
#include "seq.h"
#include "_cgo_export.h"
#include "universe.h"
jclass proxy_class__error;
jmethodID proxy_class__error_cons;
static jmethodID mid_error_Error;
// Java_go_Universe__1init caches the JNI handles used by the error proxy:
// a global reference to go.Universe$proxyerror, its (int) constructor, and
// Throwable.getMessage().
// NOTE(review): the FindClass/GetMethodID results are not checked here. This
// file is generated by gobind, so any change belongs in the generator.
JNIEXPORT void JNICALL
Java_go_Universe__1init(JNIEnv *env, jclass _unused) {
jclass clazz;
clazz = (*env)->FindClass(env, "go/Universe$proxyerror");
proxy_class__error = (*env)->NewGlobalRef(env, clazz);
proxy_class__error_cons = (*env)->GetMethodID(env, clazz, "<init>", "(I)V");
clazz = (*env)->FindClass(env, "java/lang/Throwable");
mid_error_Error = (*env)->GetMethodID(env, clazz, "getMessage", "()Ljava/lang/String;");
}
// Java_go_Universe_00024proxyerror_error is the native side of
// go.Universe$proxyerror.error(). It maps the Java proxy to its Go refnum,
// calls proxy_error_Error with it, and converts the resulting nstring to a
// Java string (which also frees the nstring buffer).
JNIEXPORT jstring JNICALL
Java_go_Universe_00024proxyerror_error(JNIEnv* env, jobject __this__) {
int32_t o = go_seq_to_refnum_go(env, __this__);
nstring r0 = proxy_error_Error(o);
jstring _r0 = go_seq_to_java_string(env, r0);
return _r0;
}
// cproxy_error_Error resolves refnum to a Java object (creating an error
// proxy for Go refnums), calls Throwable.getMessage() on it, and returns the
// message as a nstring. All JNI local references are confined to a temporary
// local frame.
nstring cproxy_error_Error(int32_t refnum) {
JNIEnv *env = go_seq_push_local_frame(0);
jobject o = go_seq_from_refnum(env, refnum, proxy_class__error, proxy_class__error_cons);
jstring res = (*env)->CallObjectMethod(env, o, mid_error_Error);
// The conversion must happen before the frame is popped, while res is still valid.
nstring _res = go_seq_from_java_string(env, res);
go_seq_pop_local_frame(env);
return _res;
}
@@ -0,0 +1,17 @@
// Code generated by gobind. DO NOT EDIT.
// JNI function headers for the Go <=> Java bridge.
//
// autogenerated by gobind -lang=java -javapkg=su.cin.rapvpn.fabric
#ifndef __Universe_H__
#define __Universe_H__
#include <jni.h>
extern jclass proxy_class__error;
extern jmethodID proxy_class__error_cons;
nstring cproxy_error_Error(int32_t refnum);
#endif
+65 -77
View File
@@ -34,10 +34,10 @@ Implemented:
- reliable fabric/control queue rejection when full
- bounded non-production `synthetic.echo` test-service path
- direct, single-relay, and forced-fallback test-service proofs
- live HTTP peer transport for synthetic mesh envelopes
- disabled-by-default synthetic mesh HTTP endpoint in `rap-node-agent`
- live QUIC peer transport for synthetic mesh envelopes
- disabled-by-default synthetic mesh QUIC endpoint in `rap-node-agent`
- `mesh-live-smoke` harness proving direct and single-relay synthetic traffic
over real local HTTP endpoints
over real local QUIC endpoints
- scoped synthetic mesh config file loading for peer endpoints and routes
- Control Plane synthetic mesh config read fallback when no local scoped config
file is set
@@ -46,7 +46,7 @@ Implemented:
- explicit production mesh forwarding gate config; production forwarding still
has no runtime implementation and remains unavailable
- route-bound production mesh envelope contract and fail-closed validation on
`/mesh/v1/forward`
the QUIC production-forward path
- metadata-only production envelope observation hook for valid envelopes, still
without forwarding payloads
- bounded metadata-only production envelope observation sink for accepted
@@ -93,7 +93,7 @@ Implemented:
- bounded peer recovery planner over peer cache and connection states
- peer connection intent planner with transport readiness classification
- peer connection manager for real control-plane health over reusable
HTTP keep-alive transport
QUIC fabric transport
- route-health effective-path runtime through replacement relay control paths
Not implemented yet:
@@ -125,35 +125,30 @@ state directory. On Linux it also installs a systemd `update-loop` service by
default, so nodes continue to update from Control Plane policy without operator
commands on each host.
Preferred profile-based install:
Preferred fabric-native install:
```bash
rap-host-agent install \
--profile-url https://control.example.com/api/v1 \
--cluster-id <cluster_id> \
--install-token <one_time_install_token> \
--node-name docker-node-1
--bootstrap-bundle ./docker-node-1.bootstrap.json
```
The host-agent exchanges the install token for a signed control-plane install
profile, then applies Docker image, container, state-dir, mesh listen,
advertise, NAT/connectivity, and region settings from that profile. The same
token is then used by the node-agent for first enrollment, so the operator does
not need to manually pass cluster/runtime flags.
Offline/import bootstrap is also supported:
```bash
rap-host-agent install \
--bootstrap-bundle ./docker-node-1.bootstrap.json
```
The bootstrap bundle carries the signed install profile, pinned cluster
authority key, and QUIC fabric registry seeds. The host-agent applies Docker
image, container, state-dir, mesh listen, advertise, NAT/connectivity, and
region settings locally, then the node-agent enrolls through QUIC fabric.
Manual install is still supported:
```bash
rap-host-agent install \
--backend-url http://192.168.200.61:18080/api/v1 \
--cluster-id <cluster_id> \
--join-token <raw_join_token> \
--node-name docker-node-1 \
--image rap-node-agent:dev-enrollment-bootstrap-smoke \
--container-name rap-node-agent-docker-node-1 \
--state-dir /var/lib/rap/nodes/docker-node-1 \
--network host \
--replace
--bootstrap-bundle ./docker-node-1.bootstrap.json
```
The command creates or replaces only the local Docker container. The running
@@ -175,8 +170,6 @@ local updater service without recreating the node-agent container:
```bash
rap-host-agent install-updater \
--backend-url http://192.168.200.61:18080/api/v1 \
--cluster-id <cluster_id> \
--state-dir /var/lib/rap/nodes/docker-node-1 \
--container-name rap-node-agent-docker-node-1
```
@@ -191,7 +184,6 @@ container is running, and reports update phases back to the Control Plane.
```bash
rap-host-agent update \
--backend-url http://192.168.200.61:18080/api/v1 \
--cluster-id <cluster_id> \
--node-id <node_id> \
--container-name rap-node-agent-docker-node-1 \
@@ -215,7 +207,6 @@ already-installed release.
```bash
rap-host-agent update-loop \
--backend-url http://192.168.200.61:18080/api/v1 \
--cluster-id <cluster_id> \
--node-id <node_id> \
--container-name rap-node-agent-docker-node-1 \
@@ -241,7 +232,6 @@ the new binary.
```bash
rap-host-agent update-host-agent-loop \
--backend-url http://192.168.200.61:18080/api/v1 \
--cluster-id <cluster_id> \
--state-dir /var/lib/rap/nodes/docker-node-1 \
--binary-path /usr/local/bin/rap-host-agent
@@ -249,16 +239,21 @@ rap-host-agent update-host-agent-loop \
## Windows Host Agent Bootstrap And Updates
Windows uses the same Control Plane install profile, but the local placement is
a Scheduled Task instead of Docker. In `--startup-mode auto` the installer first
Windows uses the same bootstrap bundle model, but the local placement is a
Scheduled Task instead of Docker. In `--startup-mode auto` the installer first
tries an elevated `ONSTART` task running as `SYSTEM`; without admin rights it
falls back to a per-user `ONLOGON` task. The `ONSTART` mode starts after reboot
without an interactive user session. The `ONLOGON` fallback can only start after
that Windows user signs in.
```cmd
powershell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -UseBasicParsing 'http://control.example.com/downloads/rap-host-agent-windows-amd64.exe' -OutFile $env:TEMP\rap-host-agent.exe"
%TEMP%\rap-host-agent.exe install-windows --profile-url "http://control.example.com/api/v1" --cluster-id "<cluster_id>" --install-token "<one_time_install_token>" --node-name "office-win-1" --startup-mode "auto"
%TEMP%\rap-host-agent.exe install-windows --bootstrap-bundle "C:\bootstrap\office-win-1.bootstrap.json" --startup-mode "auto"
```
Offline/import bootstrap is also supported:
```cmd
%TEMP%\rap-host-agent.exe install-windows --bootstrap-bundle "C:\bootstrap\office-win-1.bootstrap.json" --startup-mode "auto"
```
`install-windows` installs two tasks:
@@ -275,9 +270,8 @@ independent from the local identity file location and is required for repair of
older Windows installs where the node is already heartbeat-healthy but the
host-agent updater has no usable identity file.
```cmd
%TEMP%\rap-host-agent.exe install-windows --backend-url "http://control.example.com/api/v1" --cluster-id "<cluster_id>" --node-id "<node_id>" --node-name "office-win-1" --replace --startup-mode "auto" --auto-update-current-version "<current_version>"
```
The repair path also reuses the local signed bootstrap/runtime state; it does
not require any backend URL.
The admin UI node details page generates a downloadable
`rap-repair-updater-<node>.cmd` for this repair path. It performs these steps:
@@ -347,14 +341,8 @@ Control Plane release artifacts for Windows must use:
Create a join token from the platform control plane, then run:
```powershell
.\bin\rap-node-agent.exe `
-backend-url http://192.168.200.61:8080/api/v1 `
-cluster-id <cluster_id> `
-join-token <raw_join_token> `
-node-name test-node-1 `
-state-dir C:\ProgramData\RapNodeAgent
```
Use a signed bootstrap bundle plus QUIC fabric registry seeds. The node
enrolls only through QUIC fabric inside the farm.
The agent submits a pending join request and exits. It does not self-activate.
A platform admin must approve the join request.
@@ -375,19 +363,18 @@ Then run the agent again:
```powershell
.\bin\rap-node-agent.exe `
-backend-url http://192.168.200.61:8080/api/v1 `
-state-dir C:\ProgramData\RapNodeAgent
```
It sends periodic heartbeats to:
It sends periodic heartbeats through the signed `control-api` service over QUIC
fabric:
```text
/api/v1/clusters/{clusterID}/nodes/{nodeID}/heartbeats
fabric control path /clusters/{clusterID}/nodes/{nodeID}/heartbeats
```
## Environment Variables
- `RAP_BACKEND_URL`
- `RAP_CLUSTER_ID`
- `RAP_CLUSTER_AUTHORITY_PUBLIC_KEY`
- `RAP_CLUSTER_AUTHORITY_FINGERPRINT`
@@ -398,8 +385,8 @@ It sends periodic heartbeats to:
- `RAP_HEARTBEAT_INTERVAL_SECONDS`
- `RAP_ENROLLMENT_POLL_INTERVAL_SECONDS`
- `RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS`
- `RAP_MESH_SYNTHETIC_RUNTIME_ENABLED`
- `RAP_MESH_LISTEN_ADDR`
- `RAP_FABRIC_RUNTIME_ENABLED`
- `RAP_FABRIC_LISTEN_ADDR`
- `RAP_MESH_ADVERTISE_ENDPOINT`
- `RAP_MESH_ADVERTISE_ENDPOINTS_JSON`
- `RAP_MESH_ADVERTISE_TRANSPORT`
@@ -412,15 +399,15 @@ It sends periodic heartbeats to:
- `RAP_MESH_PRODUCTION_FORWARDING_ENABLED`
- `RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY`
`RAP_MESH_SYNTHETIC_RUNTIME_ENABLED` defaults to `false`. It gates only the
`RAP_FABRIC_RUNTIME_ENABLED` defaults to `false`. It gates only the
C17A/C17B/C17C/C17D/C17E synthetic probe, route-health, relay scheduling,
bounded `synthetic.echo` test-service runtime, and live synthetic HTTP endpoint.
bounded `synthetic.echo` test-service runtime, and live synthetic QUIC endpoint.
It must not be used for RDP, VPN, file, video, or other production service
traffic.
`RAP_WORKLOAD_SUPERVISION_ENABLED` defaults to `false`. When enabled, the agent
polls node-scoped desired workloads and reports status. The current bounded
runtime reports built-in `core-mesh` and `mesh-listener` services as running
runtime reports built-in `core-mesh` and `fabric-listener` services as running
when enabled, supports the native built-in `synthetic.echo` test workload, and
keeps unsupported production workloads such as RDP workers degraded until their
supervisors are implemented.
@@ -431,8 +418,9 @@ reports the remote-workspace adapter channel contract and requires Fabric
Service Channel as the future data plane; it does not start FreeRDP, create a
remote session, or carry production RDP payloads.
`RAP_MESH_LISTEN_ADDR` starts the C17E/C17F/C17G synthetic HTTP endpoint only when
`RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true`. `RAP_MESH_SYNTHETIC_CONFIG` points to
`RAP_FABRIC_LISTEN_ADDR` names the historical synthetic listener address, but the
current runtime is QUIC-fabric-only and does not start an HTTP listener.
`RAP_MESH_SYNTHETIC_CONFIG` points to
a scoped synthetic mesh config snapshot and is preferred over debug JSON.
`RAP_MESH_PEER_ENDPOINTS_JSON` is a JSON object mapping peer node IDs to
endpoint URLs. `RAP_MESH_SYNTHETIC_ROUTES_JSON` is a JSON array of synthetic
@@ -454,10 +442,9 @@ same fields in `identity.json` are set.
`RAP_MESH_PRODUCTION_FORWARDING_ENABLED` defaults to `false`. It is a future
production-forwarding gate only. Turning it on does not enable production mesh
payload forwarding; `/mesh/v1/forward` still returns an unavailable runtime
response after validating the route-bound production envelope contract, until
a later approved production mesh stage implements route-bound, policy-bound
forwarding.
payload forwarding; the runtime still refuses service traffic after validating
the route-bound production envelope contract, until a later approved
production mesh stage implements route-bound, policy-bound forwarding.
The production envelope contract requires route, hop, TTL, expiry, payload
length, and SHA-256 payload hash fields. C17J accepts only the
@@ -522,11 +509,11 @@ recent failure reason, reliability score, and freshness/staleness signals.
The score remains advisory only and is not wired into production forwarding.
C17Z adds the first narrow production forwarding runtime. When
`RAP_MESH_PRODUCTION_FORWARDING_ENABLED=true`, `/mesh/v1/forward` can deliver
route-bound `fabric.control` envelopes at the local destination or forward them
to a direct next hop from explicit peer endpoint config. Service channels,
RDP/VPN/file/video payloads, arbitrary relay forwarding, and multi-hop
production route execution remain unavailable.
`RAP_MESH_PRODUCTION_FORWARDING_ENABLED=true`, the QUIC production-forward
handler can deliver route-bound `fabric.control` envelopes at the local
destination or forward them to a direct next hop from explicit peer endpoint
config. Service channels, RDP/VPN/file/video payloads, arbitrary relay
forwarding, and multi-hop production route execution remain unavailable.
C17Z1 adds route-path-bound multi-hop forwarding for production
`fabric.control` only. Envelopes may carry `route_path` and
@@ -559,7 +546,7 @@ C17Z5 turns scoped peer directory and recovery seed config into node-local
runtime `PeerCache` state. The cache builds a bounded warm peer set from
route-adjacent peers, recovery seeds, peer endpoints, and endpoint candidates.
When synthetic mesh testing is enabled, the node-agent probes warm peers with
`/mesh/v1/health` and reports metadata-only mesh-link observations. This is not
QUIC fabric live probes and reports metadata-only mesh-link observations. This is not
a persistent connection manager and does not forward service payloads.
C17Z6 adds advertised mesh endpoint reporting. When
@@ -602,7 +589,7 @@ persistent connection transport, STUN/TURN/ICE, NAT traversal, relay runtime,
or service payload forwarding.
C17Z11 adds the first real node-local peer connection manager for mesh
control-plane health. It uses a reusable HTTP keep-alive client to probe
control-plane health. It uses a reusable QUIC fabric transport to probe
direct/private/corporate peer endpoints selected by C17Z10 intents, updates
the shared peer connection tracker, and records `waiting_rendezvous` for
outbound-only or relay-required peers. Heartbeats include metadata-only
@@ -612,8 +599,8 @@ payload forwarding.
C17Z12 adds a node-scoped rendezvous/relay control-plane lease contract for
peers that would otherwise remain `waiting_rendezvous`. The agent consumes
`rendezvous_leases`, resolves matching intents into `relay_control`, probes the
relay node `/mesh/v1/health`, and records `relay_ready` for the peer control
`rendezvous_leases`, resolves matching intents into `relay_quic`, probes the
relay node with a QUIC fabric live probe, and records `relay_ready` for the peer control
path. This remains control-plane health only and does not enable RDP/VPN/file/
video/service payload forwarding, arbitrary relay packet forwarding,
STUN/TURN/ICE, or host networking changes.
@@ -668,17 +655,17 @@ enable service payload forwarding.
C17Z21 defines the portable inbound listener contract for Docker, Linux
service, Windows service, and future OS-specific node packages. The node-agent
does not stop when the mesh listen port cannot be bound. It keeps the outbound
Control Plane session alive and emits `c17z21.mesh_listener_report.v1` in
Control Plane session alive and emits `c17z21.fabric_listener_report.v1` in
heartbeat metadata with configured address, effective address, listen mode,
listener status, inbound reachability, one-way connectivity, failure reason,
and port-conflict diagnostics.
`RAP_MESH_LISTEN_PORT_MODE` controls behavior:
`RAP_FABRIC_LISTEN_PORT_MODE` controls behavior:
- `manual`: bind exactly `RAP_MESH_LISTEN_ADDR`; on conflict report
- `manual`: bind exactly `RAP_FABRIC_LISTEN_ADDR`; on conflict report
`listen_failed` and wait for an operator/config change.
- `auto`: try `RAP_MESH_LISTEN_ADDR`; on conflict scan
`RAP_MESH_LISTEN_AUTO_PORT_START..RAP_MESH_LISTEN_AUTO_PORT_END` and report
- `auto`: try `RAP_FABRIC_LISTEN_ADDR`; on conflict scan
`RAP_FABRIC_LISTEN_AUTO_PORT_START..RAP_FABRIC_LISTEN_AUTO_PORT_END` and report
`auto_rebound` when a free port is selected.
- `disabled`: do not open an inbound listener; the node is expected to be
outbound-only, relay/rendezvous, or Control Plane only.
@@ -694,7 +681,7 @@ C17Z22 separates outbound Control Plane presence from inbound mesh
reachability. When synthetic mesh testing is enabled, every heartbeat includes
`c17z22.mesh_outbound_session_report.v1` with node-to-control-plane direction,
keepalive transport, listener conflict state, rendezvous/relay counters, and a
flag showing whether the current outbound session can be used as a reverse
`fabric_control_endpoint` plus a flag showing whether the current outbound session can be used as a reverse
control-channel contract. This is the portable basis for Docker, Linux service,
Windows service, and future packages where a node may be behind NAT or have no
stable inbound address. It is still control-plane telemetry only and does not
@@ -715,7 +702,7 @@ and is ranked ahead of auto-discovered addresses.
C17Z25 adds per-peer endpoint fallback probing to the control-plane mesh
manager. A node no longer treats the top-ranked endpoint candidate as the only
possible address for a peer. For each warm direct/private/corporate peer, the
manager probes the ranked candidate list until one `/mesh/v1/health` endpoint
manager probes the ranked candidate list until one QUIC fabric endpoint
responds or all direct candidates fail. Heartbeat metadata includes
`c17z25.mesh_peer_connection_manager_report.v1` with `probe_results`,
`selected_candidate_id`, `selected_endpoint`, and per-candidate success/failure
@@ -733,14 +720,14 @@ Scoped synthetic config shape:
"peer_directory_version": "peers-v1",
"policy_version": "policy-v1",
"peer_endpoints": {
"node-b": "http://127.0.0.1:19002"
"node-b": "quic://127.0.0.1:19002"
},
"peer_endpoint_candidates": {
"node-b": [
{
"endpoint_id": "node-b-public",
"node_id": "node-b",
"transport": "direct_tcp_tls",
"transport": "direct_quic",
"address": "203.0.113.20:443",
"reachability": "public",
"nat_type": "restricted",
@@ -784,3 +771,4 @@ Expected:
- Production forwarding remains disabled by default and limited to
`fabric.control` when explicitly enabled.
- No privileged operations are performed by the current agent.
@@ -1621,7 +1621,7 @@ func verdict(report loadtestReport) (string, []string) {
reasons = append(reasons, targetAckVerdictReasons(report)...)
reasons = append(reasons, routePressureDistributionVerdictReasons(report)...)
reasons = append(reasons, targetEndpointPolicyVerdictReasons(report)...)
reasons = append(reasons, legacyRouteModeVerdictReasons(report)...)
reasons = append(reasons, disallowedRouteModeVerdictReasons(report)...)
reasons = append(reasons, routeModeCoverageVerdictReasons(report)...)
if len(reasons) > 0 {
return "fail", reasons
@@ -1846,25 +1846,22 @@ func targetEndpointPolicyVerdictReasons(report loadtestReport) []string {
return []string{fmt.Sprintf("non_quic_targets=%s", strings.Join(invalid, ","))}
}
func legacyRouteModeVerdictReasons(report loadtestReport) []string {
func disallowedRouteModeVerdictReasons(report loadtestReport) []string {
if len(report.TargetStats) == 0 {
return nil
}
legacyModes := map[string]struct{}{
"relay": {},
"outbound_reverse": {},
"websocket": {},
"ws": {},
"wss": {},
"direct_http": {},
"direct_https": {},
"direct_tcp_tls": {},
supportedModes := map[string]struct{}{
string(mesh.FabricRouteDirect): {},
string(mesh.FabricRouteLAN): {},
string(mesh.FabricRouteICE): {},
string(mesh.FabricRouteReverse): {},
string(mesh.FabricRouteRelay): {},
}
found := map[string]int{}
for _, stats := range report.TargetStats {
for mode, count := range stats.RouteModes {
mode = strings.ToLower(strings.TrimSpace(mode))
if _, legacy := legacyModes[mode]; legacy && count > 0 {
if _, supported := supportedModes[mode]; !supported && count > 0 {
found[mode] += count
}
}
@@ -1877,7 +1874,7 @@ func legacyRouteModeVerdictReasons(report loadtestReport) []string {
modes = append(modes, fmt.Sprintf("%s:%d", mode, count))
}
sort.Strings(modes)
return []string{fmt.Sprintf("legacy_route_modes_observed=%s", strings.Join(modes, ","))}
return []string{fmt.Sprintf("compat_route_modes_observed=%s", strings.Join(modes, ","))}
}
func routeModeCoverageVerdictReasons(report loadtestReport) []string {
@@ -38,7 +38,7 @@ func TestRouteModeCoverageVerdictRequiresMixedModes(t *testing.T) {
}
}
func TestLegacyRouteModeVerdictRejectsNonQUICModes(t *testing.T) {
func TestDisallowedRouteModeVerdictRejectsNonQUICModes(t *testing.T) {
report := loadtestReport{
TargetStats: map[string]targetStats{
"a": {RouteModes: map[string]int{
@@ -49,12 +49,12 @@ func TestLegacyRouteModeVerdictRejectsNonQUICModes(t *testing.T) {
}},
},
}
reasons := legacyRouteModeVerdictReasons(report)
reasons := disallowedRouteModeVerdictReasons(report)
if len(reasons) != 1 ||
!strings.Contains(reasons[0], "relay:1") ||
!strings.Contains(reasons[0], "outbound_reverse:2") ||
!strings.Contains(reasons[0], "wss:3") {
t.Fatalf("reasons = %v, want legacy route mode failure", reasons)
t.Fatalf("reasons = %v, want compat route mode failure", reasons)
}
report.TargetStats["a"] = targetStats{RouteModes: map[string]int{
@@ -64,7 +64,7 @@ func TestLegacyRouteModeVerdictRejectsNonQUICModes(t *testing.T) {
string(mesh.FabricRouteReverse): 1,
string(mesh.FabricRouteRelay): 1,
}}
if reasons := legacyRouteModeVerdictReasons(report); len(reasons) != 0 {
if reasons := disallowedRouteModeVerdictReasons(report); len(reasons) != 0 {
t.Fatalf("reasons = %v, want QUIC modes accepted", reasons)
}
}
@@ -10,8 +10,6 @@ import (
"encoding/json"
"fmt"
"math/big"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"time"
@@ -22,22 +20,21 @@ import (
)
type smokeNode struct {
Local mesh.PeerIdentity
Runtime *mesh.SyntheticRuntime
URL string
server *httptest.Server
Local mesh.PeerIdentity
Runtime *mesh.SyntheticRuntime
Endpoint string
}
type smokeSyntheticTransport struct {
peers map[string]string
peers map[string]*mesh.SyntheticRuntime
}
func (t smokeSyntheticTransport) SendSynthetic(ctx context.Context, nextNodeID string, envelope mesh.SyntheticEnvelope) (mesh.SyntheticEnvelope, error) {
baseURL := t.peers[nextNodeID]
if baseURL == "" {
runtime := t.peers[nextNodeID]
if runtime == nil {
return mesh.SyntheticEnvelope{}, mesh.ErrSyntheticPeerUnavailable
}
return mesh.NewClient(baseURL).SendSynthetic(ctx, envelope)
return runtime.Receive(ctx, envelope)
}
type smokeReport struct {
@@ -104,8 +101,8 @@ func run(ctx context.Context) (smokeReport, error) {
relayRoute := smokeRoute("route-relay", []string{"node-a", "node-r", "node-b"})
routes := []mesh.SyntheticRoute{directRoute, relayRoute}
nodeAConfigPath, err := writeSmokeScopedConfig(nodeA.Local, map[string]string{
"node-r": nodeR.URL,
"node-b": nodeB.URL,
"node-r": nodeR.Endpoint,
"node-b": nodeB.Endpoint,
}, routes)
if err != nil {
return smokeReport{}, err
@@ -117,10 +114,19 @@ func run(ctx context.Context) (smokeReport, error) {
nodeA.Runtime = smokeRuntime(nodeA.Local, nodeAConfig.Routes, nodeAConfig.PeerEndpoints)
nodeR.Runtime = smokeRuntime(nodeR.Local, routes, map[string]string{
"node-b": nodeB.URL,
"node-b": nodeB.Endpoint,
})
nodeB.Runtime = smokeRuntime(nodeB.Local, routes, map[string]string{})
nodeA.Runtime = smokeRuntimeWithPeers(nodeA.Local, nodeAConfig.Routes, map[string]*mesh.SyntheticRuntime{
"node-r": nodeR.Runtime,
"node-b": nodeB.Runtime,
})
nodeR.Runtime = smokeRuntimeWithPeers(nodeR.Local, routes, map[string]*mesh.SyntheticRuntime{
"node-b": nodeB.Runtime,
})
nodeB.Runtime = smokeRuntimeWithPeers(nodeB.Local, routes, map[string]*mesh.SyntheticRuntime{})
directAck, err := nodeA.Runtime.SendProbe(ctx, directRoute.RouteID, mesh.SyntheticChannelFabricControl, "smoke-direct")
if err != nil {
return smokeReport{}, fmt.Errorf("direct probe: %w", err)
@@ -209,9 +215,9 @@ func run(ctx context.Context) (smokeReport, error) {
FabricSessionLatencyMS: fabricSessionLatency.Milliseconds(),
FabricSessionEndpoint: "quic://" + fabricQUICEndpoint,
PeerEndpoints: map[string]any{
"node-a": nodeA.URL,
"node-r": nodeR.URL,
"node-b": nodeB.URL,
"node-a": nodeA.Endpoint,
"node-r": nodeR.Endpoint,
"node-b": nodeB.Endpoint,
},
}, nil
}
@@ -472,21 +478,21 @@ func writeSmokeScopedConfig(local mesh.PeerIdentity, peers map[string]string, ro
}
func newSmokeNode(local mesh.PeerIdentity) *smokeNode {
node := &smokeNode{Local: local}
node.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
mesh.Server{Local: node.Local, SyntheticRuntime: node.Runtime}.Handler().ServeHTTP(w, r)
}))
node.URL = node.server.URL
return node
}
func (n *smokeNode) Close() {
if n.server != nil {
n.server.Close()
return &smokeNode{
Local: local,
Endpoint: "quic://smoke-" + local.NodeID,
}
}
// Close does nothing: the method body is empty, so no resources are released
// here.
// NOTE(review): presumably kept so existing callers that invoke Close still
// compile — confirm against the call sites.
func (n *smokeNode) Close() {
}
// smokeRuntime builds a synthetic runtime for local with the given routes by
// delegating to smokeRuntimeWithPeers with an empty peer-runtime map.
//
// The peers argument (node ID -> endpoint string) is accepted but deliberately
// ignored; the returned runtime is created without any peer runtimes attached.
// NOTE(review): presumably the parameter is kept only for signature
// compatibility with existing callers — confirm.
func smokeRuntime(local mesh.PeerIdentity, routes []mesh.SyntheticRoute, peers map[string]string) *mesh.SyntheticRuntime {
// Explicitly discard peers so the unused parameter is visibly intentional.
_ = peers
return smokeRuntimeWithPeers(local, routes, map[string]*mesh.SyntheticRuntime{})
}
func smokeRuntimeWithPeers(local mesh.PeerIdentity, routes []mesh.SyntheticRoute, peers map[string]*mesh.SyntheticRuntime) *mesh.SyntheticRuntime {
return mesh.NewSyntheticRuntime(mesh.SyntheticRuntimeConfig{
Enabled: true,
Local: local,
@@ -113,14 +113,13 @@ func applyStagedSelfUpdate() {
func runInstallLinux(ctx context.Context, args []string) error {
fs := flag.NewFlagSet("install-linux", flag.ContinueOnError)
cfg := hostagent.LinuxInstallConfig{}
var profileURL string
var installToken string
fs.StringVar(&cfg.RuntimeConfig.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
var joinBundle string
fs.StringVar(&cfg.RuntimeConfig.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
fs.StringVar(&cfg.RuntimeConfig.ClusterAuthorityPublicKey, "cluster-authority-public-key", getenv("RAP_CLUSTER_AUTHORITY_PUBLIC_KEY", ""), "Pinned Ed25519 cluster authority public key for signed fabric registry records.")
fs.StringVar(&cfg.RuntimeConfig.FabricRegistryRecordsJSON, "fabric-registry-records-json", getenv("RAP_FABRIC_REGISTRY_RECORDS_JSON", ""), "JSON array of signed QUIC-only fabric registry records used as bootstrap seeds.")
fs.StringVar(&cfg.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID used by updater repair mode.")
fs.StringVar(&cfg.RuntimeConfig.JoinToken, "join-token", getenv("RAP_JOIN_TOKEN", ""), "One-time join token for first enrollment.")
fs.StringVar(&profileURL, "profile-url", getenv("RAP_INSTALL_PROFILE_URL", ""), "Control Plane API base URL or /node-agents/linux-install-profile URL for profile-based install.")
fs.StringVar(&installToken, "install-token", getenv("RAP_INSTALL_TOKEN", ""), "One-time install token used to fetch Linux install profile.")
fs.StringVar(&joinBundle, "join-bundle", getenv("RAP_JOIN_BUNDLE", ""), "Preferred local join bundle JSON with Linux install profile and QUIC fabric bootstrap seeds.")
fs.StringVar(&cfg.RuntimeConfig.NodeName, "node-name", getenv("RAP_NODE_NAME", ""), "Node display name.")
fs.StringVar(&cfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Node state directory.")
fs.StringVar(&cfg.InstallDir, "install-dir", getenv("RAP_LINUX_INSTALL_DIR", ""), "Directory for rap-node-agent and rap-host-agent.")
@@ -131,28 +130,31 @@ func runInstallLinux(ctx context.Context, args []string) error {
fs.BoolVar(&cfg.AutoUpdateEnabled, "auto-update-enabled", getenvBool("RAP_AUTO_UPDATE_ENABLED", true), "Install and start the Linux host-agent update service.")
fs.StringVar(&cfg.AutoUpdateCurrentVersion, "auto-update-current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version used by update-loop before the first successful update.")
fs.StringVar(&cfg.AutoUpdateChannel, "auto-update-channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override for update-loop.")
fs.IntVar(&cfg.AutoUpdateIntervalSeconds, "auto-update-interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
fs.IntVar(&cfg.AutoUpdateIntervalSeconds, "auto-update-interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", hostagent.DefaultUpdateIntervalSec), "Emergency rescue plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
fs.IntVar(&cfg.AutoUpdateInitialDelaySeconds, "auto-update-initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.")
fs.IntVar(&cfg.AutoUpdateHealthTimeoutSeconds, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated service health timeout in seconds.")
fs.StringVar(&cfg.HostAgentSourcePath, "host-agent-source-path", getenv("RAP_HOST_AGENT_SOURCE_PATH", ""), "Source rap-host-agent path copied to the persistent updater location.")
fs.BoolVar(&cfg.RuntimeConfig.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable historical synthetic mesh runtime.")
fs.BoolVar(&cfg.RuntimeConfig.FabricRuntimeEnabled, "fabric-runtime-enabled", getenvBool("RAP_FABRIC_RUNTIME_ENABLED", false), "Enable node-local synthetic fabric control runtime.")
fs.BoolVar(&cfg.RuntimeConfig.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
fs.BoolVar(&cfg.RuntimeConfig.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getenvBool("RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric sessions.")
fs.BoolVar(&cfg.RuntimeConfig.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getenvBool("RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener.")
fs.StringVar(&cfg.RuntimeConfig.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getenv("RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "QUIC/UDP fabric listen address.")
fs.IntVar(&cfg.RuntimeConfig.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getenvInt("RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 4), "VPN fabric-session stream shards per traffic class.")
fs.IntVar(&cfg.RuntimeConfig.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getenvInt("RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 8), "VPN fabric-session stream shards per traffic class.")
fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getenvInt("RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.")
fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICIdleTTLSeconds, "vpn-fabric-quic-idle-ttl-seconds", getenvInt("RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300), "Idle TTL seconds for cached VPN QUIC carrier connections.")
fs.StringVar(&cfg.RuntimeConfig.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ""), "Historical synthetic mesh HTTP listen address.")
fs.StringVar(&cfg.RuntimeConfig.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", "auto"), "Mesh listen port behavior: manual, auto, or disabled.")
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
fs.StringVar(&cfg.RuntimeConfig.FabricListenAddr, "fabric-listen-addr", getenv("RAP_FABRIC_LISTEN_ADDR", ""), "Optional node listener address for QUIC fabric runtime.")
fs.StringVar(&cfg.RuntimeConfig.FabricListenPortMode, "fabric-listen-port-mode", getenv("RAP_FABRIC_LISTEN_PORT_MODE", "auto"), "Mesh listen port behavior: manual, auto, or disabled.")
fs.IntVar(&cfg.RuntimeConfig.FabricListenAutoPortStart, "fabric-listen-auto-port-start", getenvInt("RAP_FABRIC_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
fs.IntVar(&cfg.RuntimeConfig.FabricListenAutoPortEnd, "fabric-listen-auto-port-end", getenvInt("RAP_FABRIC_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getenv("RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint.")
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getenv("RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "Advertised endpoint candidates JSON.")
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", "quic"), "Advertised transport.")
fs.StringVar(&cfg.RuntimeConfig.MeshConnectivityMode, "mesh-connectivity-mode", getenv("RAP_MESH_CONNECTIVITY_MODE", "outbound_only"), "Connectivity mode hint.")
fs.StringVar(&cfg.RuntimeConfig.MeshNATType, "mesh-nat-type", getenv("RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint.")
fs.StringVar(&cfg.RuntimeConfig.MeshSiteID, "mesh-site-id", getenv("RAP_MESH_SITE_ID", ""), "Physical/logical site identifier advertised with QUIC endpoints.")
fs.StringVar(&cfg.RuntimeConfig.MeshLocalityGroupID, "mesh-locality-group-id", getenv("RAP_MESH_LOCALITY_GROUP_ID", ""), "Private locality group identifier used for LAN/private endpoint selection.")
fs.StringVar(&cfg.RuntimeConfig.MeshNATGroupID, "mesh-nat-group-id", getenv("RAP_MESH_NAT_GROUP_ID", ""), "Shared NAT/ingress group identifier advertised with QUIC endpoints.")
fs.StringVar(&cfg.RuntimeConfig.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", "linux"), "Region/site hint.")
fs.IntVar(&cfg.RuntimeConfig.HeartbeatIntervalSeconds, "heartbeat-interval-seconds", getenvInt("RAP_HEARTBEAT_INTERVAL_SECONDS", 15), "Heartbeat interval seconds.")
fs.IntVar(&cfg.RuntimeConfig.EnrollmentPollIntervalSeconds, "enrollment-poll-interval-seconds", getenvInt("RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5), "Enrollment poll interval seconds.")
@@ -160,7 +162,7 @@ func runInstallLinux(ctx context.Context, args []string) error {
if err := fs.Parse(args); err != nil {
return err
}
if strings.TrimSpace(profileURL) != "" || strings.TrimSpace(installToken) != "" {
if strings.TrimSpace(joinBundle) != "" {
dryRun := cfg.DryRun
startupMode := strings.TrimSpace(cfg.StartupMode)
autoUpdateEnabled := cfg.AutoUpdateEnabled
@@ -170,7 +172,7 @@ func runInstallLinux(ctx context.Context, args []string) error {
autoUpdateInitialDelaySeconds := cfg.AutoUpdateInitialDelaySeconds
autoUpdateHealthTimeoutSeconds := cfg.AutoUpdateHealthTimeoutSeconds
hostAgentSourcePath := cfg.HostAgentSourcePath
profile, err := hostagent.FetchLinuxInstallProfile(ctx, hostagent.ProfileRequest{URL: profileURL, ClusterID: cfg.RuntimeConfig.ClusterID, InstallToken: installToken, NodeName: cfg.RuntimeConfig.NodeName})
profile, err := hostagent.LoadLinuxJoinBundle(joinBundle)
if err != nil {
return err
}
@@ -201,14 +203,13 @@ func runInstallLinux(ctx context.Context, args []string) error {
func runInstallWindows(ctx context.Context, args []string) error {
fs := flag.NewFlagSet("install-windows", flag.ContinueOnError)
cfg := hostagent.WindowsInstallConfig{}
var profileURL string
var installToken string
fs.StringVar(&cfg.RuntimeConfig.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
var joinBundle string
fs.StringVar(&cfg.RuntimeConfig.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
fs.StringVar(&cfg.RuntimeConfig.ClusterAuthorityPublicKey, "cluster-authority-public-key", getenv("RAP_CLUSTER_AUTHORITY_PUBLIC_KEY", ""), "Pinned Ed25519 cluster authority public key for signed fabric registry records.")
fs.StringVar(&cfg.RuntimeConfig.FabricRegistryRecordsJSON, "fabric-registry-records-json", getenv("RAP_FABRIC_REGISTRY_RECORDS_JSON", ""), "JSON array of signed QUIC-only fabric registry records used as bootstrap seeds.")
fs.StringVar(&cfg.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID used by updater repair mode.")
fs.StringVar(&cfg.RuntimeConfig.JoinToken, "join-token", getenv("RAP_JOIN_TOKEN", ""), "One-time join token for first enrollment.")
fs.StringVar(&profileURL, "profile-url", getenv("RAP_INSTALL_PROFILE_URL", ""), "Control Plane API base URL or /node-agents/windows-install-profile URL for profile-based install.")
fs.StringVar(&installToken, "install-token", getenv("RAP_INSTALL_TOKEN", ""), "One-time install token used to fetch Windows install profile.")
fs.StringVar(&joinBundle, "join-bundle", getenv("RAP_JOIN_BUNDLE", ""), "Preferred local join bundle JSON with Windows install profile and QUIC fabric bootstrap seeds.")
fs.StringVar(&cfg.RuntimeConfig.NodeName, "node-name", getenv("RAP_NODE_NAME", ""), "Node display name.")
fs.StringVar(&cfg.RuntimeConfig.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Node state directory.")
fs.StringVar(&cfg.InstallDir, "install-dir", getenv("RAP_WINDOWS_INSTALL_DIR", ""), "Directory for rap-node-agent.exe and wrapper scripts.")
@@ -218,28 +219,31 @@ func runInstallWindows(ctx context.Context, args []string) error {
fs.BoolVar(&cfg.AutoUpdateEnabled, "auto-update-enabled", getenvBool("RAP_AUTO_UPDATE_ENABLED", true), "Install and start the Windows host-agent update task.")
fs.StringVar(&cfg.AutoUpdateCurrentVersion, "auto-update-current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version used by update-loop before the first successful update.")
fs.StringVar(&cfg.AutoUpdateChannel, "auto-update-channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override for update-loop.")
fs.IntVar(&cfg.AutoUpdateIntervalSeconds, "auto-update-interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
fs.IntVar(&cfg.AutoUpdateIntervalSeconds, "auto-update-interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", hostagent.DefaultUpdateIntervalSec), "Emergency rescue plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
fs.IntVar(&cfg.AutoUpdateInitialDelaySeconds, "auto-update-initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.")
fs.IntVar(&cfg.AutoUpdateHealthTimeoutSeconds, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated service health timeout in seconds.")
fs.StringVar(&cfg.HostAgentSourcePath, "host-agent-source-path", getenv("RAP_HOST_AGENT_SOURCE_PATH", ""), "Source rap-host-agent.exe path copied to the persistent updater location.")
fs.BoolVar(&cfg.RuntimeConfig.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
fs.BoolVar(&cfg.RuntimeConfig.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable historical synthetic mesh runtime.")
fs.BoolVar(&cfg.RuntimeConfig.FabricRuntimeEnabled, "fabric-runtime-enabled", getenvBool("RAP_FABRIC_RUNTIME_ENABLED", false), "Enable node-local synthetic fabric control runtime.")
fs.BoolVar(&cfg.RuntimeConfig.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
fs.BoolVar(&cfg.RuntimeConfig.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getenvBool("RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric sessions.")
fs.BoolVar(&cfg.RuntimeConfig.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getenvBool("RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener.")
fs.StringVar(&cfg.RuntimeConfig.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getenv("RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "QUIC/UDP fabric listen address.")
fs.IntVar(&cfg.RuntimeConfig.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getenvInt("RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 4), "VPN fabric-session stream shards per traffic class.")
fs.IntVar(&cfg.RuntimeConfig.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getenvInt("RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 8), "VPN fabric-session stream shards per traffic class.")
fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getenvInt("RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.")
fs.IntVar(&cfg.RuntimeConfig.VPNFabricQUICIdleTTLSeconds, "vpn-fabric-quic-idle-ttl-seconds", getenvInt("RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300), "Idle TTL seconds for cached VPN QUIC carrier connections.")
fs.StringVar(&cfg.RuntimeConfig.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ""), "Historical synthetic mesh HTTP listen address.")
fs.StringVar(&cfg.RuntimeConfig.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", "auto"), "Mesh listen port behavior: manual, auto, or disabled.")
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
fs.IntVar(&cfg.RuntimeConfig.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
fs.StringVar(&cfg.RuntimeConfig.FabricListenAddr, "fabric-listen-addr", getenv("RAP_FABRIC_LISTEN_ADDR", ""), "Optional node listener address for QUIC fabric runtime.")
fs.StringVar(&cfg.RuntimeConfig.FabricListenPortMode, "fabric-listen-port-mode", getenv("RAP_FABRIC_LISTEN_PORT_MODE", "auto"), "Mesh listen port behavior: manual, auto, or disabled.")
fs.IntVar(&cfg.RuntimeConfig.FabricListenAutoPortStart, "fabric-listen-auto-port-start", getenvInt("RAP_FABRIC_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
fs.IntVar(&cfg.RuntimeConfig.FabricListenAutoPortEnd, "fabric-listen-auto-port-end", getenvInt("RAP_FABRIC_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getenv("RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint.")
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getenv("RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "Advertised endpoint candidates JSON.")
fs.StringVar(&cfg.RuntimeConfig.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", "quic"), "Advertised transport.")
fs.StringVar(&cfg.RuntimeConfig.MeshConnectivityMode, "mesh-connectivity-mode", getenv("RAP_MESH_CONNECTIVITY_MODE", "outbound_only"), "Connectivity mode hint.")
fs.StringVar(&cfg.RuntimeConfig.MeshNATType, "mesh-nat-type", getenv("RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint.")
fs.StringVar(&cfg.RuntimeConfig.MeshSiteID, "mesh-site-id", getenv("RAP_MESH_SITE_ID", ""), "Physical/logical site identifier advertised with QUIC endpoints.")
fs.StringVar(&cfg.RuntimeConfig.MeshLocalityGroupID, "mesh-locality-group-id", getenv("RAP_MESH_LOCALITY_GROUP_ID", ""), "Private locality group identifier used for LAN/private endpoint selection.")
fs.StringVar(&cfg.RuntimeConfig.MeshNATGroupID, "mesh-nat-group-id", getenv("RAP_MESH_NAT_GROUP_ID", ""), "Shared NAT/ingress group identifier advertised with QUIC endpoints.")
fs.StringVar(&cfg.RuntimeConfig.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", "windows"), "Region/site hint.")
fs.IntVar(&cfg.RuntimeConfig.HeartbeatIntervalSeconds, "heartbeat-interval-seconds", getenvInt("RAP_HEARTBEAT_INTERVAL_SECONDS", 15), "Heartbeat interval seconds.")
fs.IntVar(&cfg.RuntimeConfig.EnrollmentPollIntervalSeconds, "enrollment-poll-interval-seconds", getenvInt("RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5), "Enrollment poll interval seconds.")
@@ -247,7 +251,7 @@ func runInstallWindows(ctx context.Context, args []string) error {
if err := fs.Parse(args); err != nil {
return err
}
if strings.TrimSpace(profileURL) != "" || strings.TrimSpace(installToken) != "" {
if strings.TrimSpace(joinBundle) != "" {
dryRun := cfg.DryRun
startupMode := strings.TrimSpace(cfg.StartupMode)
autoUpdateEnabled := cfg.AutoUpdateEnabled
@@ -257,12 +261,7 @@ func runInstallWindows(ctx context.Context, args []string) error {
autoUpdateInitialDelaySeconds := cfg.AutoUpdateInitialDelaySeconds
autoUpdateHealthTimeoutSeconds := cfg.AutoUpdateHealthTimeoutSeconds
hostAgentSourcePath := cfg.HostAgentSourcePath
profile, err := hostagent.FetchWindowsInstallProfile(ctx, hostagent.ProfileRequest{
URL: profileURL,
ClusterID: cfg.RuntimeConfig.ClusterID,
InstallToken: installToken,
NodeName: cfg.RuntimeConfig.NodeName,
})
profile, err := hostagent.LoadWindowsJoinBundle(joinBundle)
if err != nil {
return err
}
@@ -364,7 +363,7 @@ func runUpdate(ctx context.Context, args []string) error {
}
fmt.Printf("action=%s reason=%s target=%s production_forwarding=%t\n", plan.Action, plan.Reason, plan.TargetVersion, plan.ProductionForwarding)
if plan.Artifact != nil {
fmt.Printf("artifact=%s sha256=%s size=%d\n", plan.Artifact.URL, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
fmt.Printf("artifact_id=%s sha256=%s size=%d transport=quic_fabric\n", plan.Artifact.ID, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
}
return nil
}
@@ -407,7 +406,7 @@ func runUpdateLoop(ctx context.Context, args []string) error {
var hostAgentVersion string
var hostAgentBinaryPath string
registerUpdateFlags(fs, &req, &healthTimeoutSeconds)
fs.IntVar(&intervalSeconds, "interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Seconds between emergency fallback update plan polls. Update-service/heartbeat hints trigger normal runs.")
fs.IntVar(&intervalSeconds, "interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", hostagent.DefaultUpdateIntervalSec), "Seconds between emergency rescue update plan polls. Update-service/heartbeat hints trigger normal runs.")
fs.IntVar(&initialDelaySeconds, "initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 0), "Seconds to wait before the first poll.")
fs.Float64Var(&jitter, "jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Fractional random jitter for interval and initial delay, 0..1.")
fs.IntVar(&maxRuns, "max-runs", getenvInt("RAP_UPDATE_MAX_RUNS", 0), "Maximum loop iterations. Use 0 to run until stopped.")
@@ -432,7 +431,6 @@ func runUpdateLoop(ctx context.Context, args []string) error {
}
cfg.HostAgentUpdateEnabled = hostAgentStatusEnabled
cfg.HostAgentUpdateRequest = hostagent.HostAgentUpdateRequest{
BackendURL: req.BackendURL,
ClusterID: req.ClusterID,
NodeID: req.NodeID,
StateDir: req.StateDir,
@@ -487,7 +485,6 @@ func parseMonitor(args []string) (hostagent.MonitorConfig, error) {
var staleRestartingSeconds int
var tmpMinAgeMinutes int
watchContainers := repeatedFlag{}
fs.StringVar(&cfg.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL used for monitor status reports.")
fs.StringVar(&cfg.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
fs.StringVar(&cfg.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.")
fs.StringVar(&cfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", hostagent.DefaultStateDir), "Host path containing node-agent identity.json.")
@@ -545,13 +542,12 @@ func runInstallUpdater(ctx context.Context, args []string) error {
var selfUpdater bool
var monitorEnabled bool
monitorContainers := repeatedFlag{}
fs.StringVar(&runtimeCfg.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
fs.StringVar(&runtimeCfg.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
fs.StringVar(&runtimeCfg.ContainerName, "container-name", getenv("RAP_NODE_AGENT_CONTAINER", hostagent.DefaultContainerName), "Docker container name to update.")
fs.StringVar(&runtimeCfg.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", hostagent.DefaultStateDir), "Host path containing node-agent identity.json.")
fs.StringVar(&service.CurrentVersion, "current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version before first successful update.")
fs.StringVar(&service.Channel, "channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override.")
fs.IntVar(&service.IntervalSeconds, "interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
fs.IntVar(&service.IntervalSeconds, "interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", hostagent.DefaultUpdateIntervalSec), "Emergency rescue plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
fs.IntVar(&service.InitialDelaySeconds, "initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.")
fs.Float64Var(&service.Jitter, "jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Update-loop interval jitter, 0..1.")
fs.IntVar(&service.HealthTimeoutSec, "health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated container running-state timeout in seconds.")
@@ -637,7 +633,6 @@ func parseHostAgentUpdate(args []string) (hostagent.HostAgentUpdateRequest, int,
var maxRuns int
var jitter float64
var stopOnError bool
fs.StringVar(&req.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
fs.StringVar(&req.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
fs.StringVar(&req.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.")
fs.StringVar(&req.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Host path containing node-agent identity.json.")
@@ -651,7 +646,7 @@ func parseHostAgentUpdate(args []string) (hostagent.HostAgentUpdateRequest, int,
fs.StringVar(&req.InstallType, "install-type", getenv("RAP_HOST_AGENT_UPDATE_INSTALL_TYPE", hostagent.BinaryUpdateInstallType), "Host-agent artifact install type.")
fs.StringVar(&req.BinaryPath, "binary-path", getenv("RAP_HOST_AGENT_BINARY_PATH", hostagent.DefaultHostAgentInstallPath), "rap-host-agent binary path to replace atomically.")
fs.BoolVar(&req.DryRun, "dry-run", false, "Fetch and print the update plan without applying it.")
fs.IntVar(&intervalSeconds, "interval-seconds", getenvInt("RAP_HOST_AGENT_UPDATE_INTERVAL_SECONDS", 900), "Seconds between host-agent update plan polls.")
fs.IntVar(&intervalSeconds, "interval-seconds", getenvInt("RAP_HOST_AGENT_UPDATE_INTERVAL_SECONDS", hostagent.DefaultUpdateIntervalSec), "Seconds between host-agent update plan polls.")
fs.IntVar(&initialDelaySeconds, "initial-delay-seconds", getenvInt("RAP_HOST_AGENT_UPDATE_INITIAL_DELAY_SECONDS", 45), "Seconds to wait before the first poll.")
fs.Float64Var(&jitter, "jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Fractional random jitter for interval and initial delay, 0..1.")
fs.IntVar(&maxRuns, "max-runs", getenvInt("RAP_UPDATE_MAX_RUNS", 0), "Maximum loop iterations. Use 0 to run until stopped.")
@@ -663,7 +658,6 @@ func parseHostAgentUpdate(args []string) (hostagent.HostAgentUpdateRequest, int,
}
func registerUpdateFlags(fs *flag.FlagSet, req *hostagent.UpdateRequest, healthTimeoutSeconds *int) {
fs.StringVar(&req.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
fs.StringVar(&req.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
fs.StringVar(&req.NodeID, "node-id", getenv("RAP_NODE_ID", ""), "Already enrolled node ID.")
fs.StringVar(&req.StateDir, "state-dir", getenv("RAP_NODE_STATE_DIR", ""), "Host path containing node-agent identity.json; used when node-id is not known yet.")
@@ -688,16 +682,15 @@ func parseInstall(args []string) (installCommandConfig, error) {
fs := flag.NewFlagSet("install", flag.ContinueOnError)
cfg := hostagent.RuntimeConfig{}
var dryRun bool
var profileURL string
var installToken string
var joinBundle string
var autoUpdateEnabled bool
autoUpdate := hostagent.UpdateServiceConfig{}
monitorContainers := repeatedFlag{}
fs.StringVar(&cfg.BackendURL, "backend-url", getenv("RAP_BACKEND_URL", ""), "Control Plane API base URL.")
fs.StringVar(&cfg.ClusterID, "cluster-id", getenv("RAP_CLUSTER_ID", ""), "Cluster ID.")
fs.StringVar(&cfg.ClusterAuthorityPublicKey, "cluster-authority-public-key", getenv("RAP_CLUSTER_AUTHORITY_PUBLIC_KEY", ""), "Pinned Ed25519 cluster authority public key for signed fabric registry records.")
fs.StringVar(&cfg.FabricRegistryRecordsJSON, "fabric-registry-records-json", getenv("RAP_FABRIC_REGISTRY_RECORDS_JSON", ""), "JSON array of signed QUIC-only fabric registry records used as bootstrap seeds.")
fs.StringVar(&cfg.JoinToken, "join-token", getenv("RAP_JOIN_TOKEN", ""), "One-time join token for first enrollment.")
fs.StringVar(&profileURL, "profile-url", getenv("RAP_INSTALL_PROFILE_URL", ""), "Control Plane API base URL or /node-agents/docker-install-profile URL for profile-based install.")
fs.StringVar(&installToken, "install-token", getenv("RAP_INSTALL_TOKEN", ""), "One-time install token used to fetch Docker install profile.")
fs.StringVar(&joinBundle, "join-bundle", getenv("RAP_JOIN_BUNDLE", ""), "Preferred local join bundle JSON with Docker install profile and QUIC fabric bootstrap seeds.")
fs.StringVar(&cfg.NodeName, "node-name", getenv("RAP_NODE_NAME", ""), "Node display name.")
fs.StringVar(&cfg.Image, "image", getenv("RAP_NODE_AGENT_IMAGE", hostagent.DefaultImage), "Docker image for rap-node-agent.")
fs.StringVar(&cfg.ContainerName, "container-name", getenv("RAP_NODE_AGENT_CONTAINER", hostagent.DefaultContainerName), "Docker container name.")
@@ -716,7 +709,7 @@ func parseInstall(args []string) (installCommandConfig, error) {
fs.StringVar(&autoUpdate.CurrentVersion, "auto-update-current-version", getenv("RAP_NODE_AGENT_VERSION", agent.Version), "Initial node-agent version used by update-loop before the first successful update.")
fs.StringVar(&autoUpdate.SelfUpdateVersion, "host-agent-current-version", getenv("RAP_HOST_AGENT_VERSION", agent.Version), "Initial host-agent binary version used by the self-updater.")
fs.StringVar(&autoUpdate.Channel, "auto-update-channel", getenv("RAP_UPDATE_CHANNEL", ""), "Optional update channel override for update-loop.")
fs.IntVar(&autoUpdate.IntervalSeconds, "auto-update-interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", 21600), "Emergency fallback plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
fs.IntVar(&autoUpdate.IntervalSeconds, "auto-update-interval-seconds", getenvInt("RAP_UPDATE_INTERVAL_SECONDS", hostagent.DefaultUpdateIntervalSec), "Emergency rescue plan poll interval in seconds. Update-service/heartbeat hints trigger normal runs.")
fs.IntVar(&autoUpdate.InitialDelaySeconds, "auto-update-initial-delay-seconds", getenvInt("RAP_UPDATE_INITIAL_DELAY_SECONDS", 15), "Update-loop initial delay in seconds.")
fs.Float64Var(&autoUpdate.Jitter, "auto-update-jitter", getenvFloat("RAP_UPDATE_JITTER", 0.15), "Update-loop interval jitter, 0..1.")
fs.IntVar(&autoUpdate.HealthTimeoutSec, "auto-update-health-timeout-seconds", getenvInt("RAP_UPDATE_HEALTH_TIMEOUT_SECONDS", 30), "Updated container running-state timeout in seconds.")
@@ -728,23 +721,26 @@ func parseInstall(args []string) (installCommandConfig, error) {
fs.IntVar(&autoUpdate.MonitorDiskCritical, "monitor-disk-critical-percent", getenvInt("RAP_MONITOR_DISK_CRITICAL_PERCENT", hostagent.DefaultMonitorDiskCriticalPercent), "Disk used percent that reports failure after cleanup.")
fs.BoolVar(&autoUpdate.MonitorCleanupDocker, "monitor-cleanup-docker", getenvBool("RAP_MONITOR_CLEANUP_DOCKER", true), "Run safe docker prune cleanup when disk is above cleanup threshold.")
fs.BoolVar(&cfg.WorkloadSupervisionEnabled, "workload-supervision-enabled", getenvBool("RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable node-agent workload status reporting.")
fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getenvBool("RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable historical synthetic mesh runtime.")
fs.BoolVar(&cfg.FabricRuntimeEnabled, "fabric-runtime-enabled", getenvBool("RAP_FABRIC_RUNTIME_ENABLED", false), "Enable node-local synthetic fabric control runtime.")
fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getenvBool("RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production forwarding gate; runtime still fail-closed if unavailable.")
fs.BoolVar(&cfg.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getenvBool("RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric sessions.")
fs.BoolVar(&cfg.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getenvBool("RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener.")
fs.StringVar(&cfg.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getenv("RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "QUIC/UDP fabric listen address.")
fs.IntVar(&cfg.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getenvInt("RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 4), "VPN fabric-session stream shards per traffic class.")
fs.IntVar(&cfg.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getenvInt("RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 8), "VPN fabric-session stream shards per traffic class.")
fs.IntVar(&cfg.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getenvInt("RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.")
fs.IntVar(&cfg.VPNFabricQUICIdleTTLSeconds, "vpn-fabric-quic-idle-ttl-seconds", getenvInt("RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300), "Idle TTL seconds for cached VPN QUIC carrier connections.")
fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getenv("RAP_MESH_LISTEN_ADDR", ""), "Historical synthetic mesh HTTP listen address inside container.")
fs.StringVar(&cfg.MeshListenPortMode, "mesh-listen-port-mode", getenv("RAP_MESH_LISTEN_PORT_MODE", ""), "Mesh listen port behavior: manual, auto, or disabled.")
fs.IntVar(&cfg.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_START", 0), "First port used when mesh listen port mode is auto.")
fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getenvInt("RAP_MESH_LISTEN_AUTO_PORT_END", 0), "Last port used when mesh listen port mode is auto.")
fs.StringVar(&cfg.FabricListenAddr, "fabric-listen-addr", getenv("RAP_FABRIC_LISTEN_ADDR", ""), "Optional node listener address for QUIC fabric runtime inside container.")
fs.StringVar(&cfg.FabricListenPortMode, "fabric-listen-port-mode", getenv("RAP_FABRIC_LISTEN_PORT_MODE", ""), "Mesh listen port behavior: manual, auto, or disabled.")
fs.IntVar(&cfg.FabricListenAutoPortStart, "fabric-listen-auto-port-start", getenvInt("RAP_FABRIC_LISTEN_AUTO_PORT_START", 0), "First port used when mesh listen port mode is auto.")
fs.IntVar(&cfg.FabricListenAutoPortEnd, "fabric-listen-auto-port-end", getenvInt("RAP_FABRIC_LISTEN_AUTO_PORT_END", 0), "Last port used when mesh listen port mode is auto.")
fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getenv("RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint.")
fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getenv("RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "Advertised endpoint candidates JSON.")
fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getenv("RAP_MESH_ADVERTISE_TRANSPORT", "quic"), "Advertised transport.")
fs.StringVar(&cfg.MeshConnectivityMode, "mesh-connectivity-mode", getenv("RAP_MESH_CONNECTIVITY_MODE", ""), "Connectivity mode hint.")
fs.StringVar(&cfg.MeshNATType, "mesh-nat-type", getenv("RAP_MESH_NAT_TYPE", ""), "NAT type hint.")
fs.StringVar(&cfg.MeshSiteID, "mesh-site-id", getenv("RAP_MESH_SITE_ID", ""), "Physical/logical site identifier advertised with QUIC endpoints.")
fs.StringVar(&cfg.MeshLocalityGroupID, "mesh-locality-group-id", getenv("RAP_MESH_LOCALITY_GROUP_ID", ""), "Private locality group identifier used for LAN/private endpoint selection.")
fs.StringVar(&cfg.MeshNATGroupID, "mesh-nat-group-id", getenv("RAP_MESH_NAT_GROUP_ID", ""), "Shared NAT/ingress group identifier advertised with QUIC endpoints.")
fs.StringVar(&cfg.MeshRegion, "mesh-region", getenv("RAP_MESH_REGION", ""), "Region/site hint.")
fs.IntVar(&cfg.HeartbeatIntervalSeconds, "heartbeat-interval-seconds", getenvInt("RAP_HEARTBEAT_INTERVAL_SECONDS", 15), "Heartbeat interval seconds.")
fs.IntVar(&cfg.EnrollmentPollIntervalSeconds, "enrollment-poll-interval-seconds", getenvInt("RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5), "Enrollment poll interval seconds.")
@@ -752,25 +748,20 @@ func parseInstall(args []string) (installCommandConfig, error) {
fs.IntVar(&cfg.ProductionObservationSinkCap, "production-observation-sink-capacity", getenvInt("RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Production observation sink capacity.")
extraEnv := repeatedFlag{}
extraRunArg := repeatedFlag{}
imageArtifactURL := repeatedFlag{}
imageArtifactPath := repeatedFlag{}
fs.Var(&extraEnv, "env", "Extra KEY=VALUE env passed to node-agent container; may be repeated.")
fs.Var(&extraRunArg, "docker-run-arg", "Extra raw docker run argument; may be repeated.")
fs.Var(&imageArtifactURL, "image-artifact-url", "Docker image tar artifact URL to docker load before running; may be repeated.")
fs.Var(&imageArtifactPath, "image-artifact-path", "Local Docker image tar artifact path to docker load before running; may be repeated.")
fs.Var(&monitorContainers, "monitor-container", "Extra Docker container watched by monitor; may be repeated.")
if err := fs.Parse(args); err != nil {
return installCommandConfig{}, err
}
cfg.ExtraEnv = extraEnv
cfg.AdditionalDockerRunArgs = extraRunArg
cfg.ImageArtifactURLs = append(cfg.ImageArtifactURLs, imageArtifactURL...)
cfg.ImageArtifactURLs = append(cfg.ImageArtifactURLs, imageArtifactPath...)
autoUpdate.MonitorContainers = monitorContainers
if strings.TrimSpace(profileURL) != "" || strings.TrimSpace(installToken) != "" {
profile, err := hostagent.FetchDockerInstallProfile(context.Background(), hostagent.ProfileRequest{
URL: profileURL,
ClusterID: cfg.ClusterID,
InstallToken: installToken,
NodeName: cfg.NodeName,
})
if strings.TrimSpace(joinBundle) != "" {
profile, err := hostagent.LoadDockerJoinBundle(joinBundle)
if err != nil {
return installCommandConfig{}, err
}
@@ -778,8 +769,8 @@ func parseInstall(args []string) (installCommandConfig, error) {
profileCfg.ExtraEnv = cfg.ExtraEnv
profileCfg.AdditionalDockerRunArgs = cfg.AdditionalDockerRunArgs
profileCfg.DockerVPNGatewayEnabled = profileCfg.DockerVPNGatewayEnabled || cfg.DockerVPNGatewayEnabled
if len(imageArtifactURL) > 0 {
profileCfg.ImageArtifactURLs = append([]string(nil), imageArtifactURL...)
if len(imageArtifactPath) > 0 {
profileCfg.ImageArtifactURLs = append([]string(nil), imageArtifactPath...)
}
if cfg.ImageArtifactSHA256 != "" {
profileCfg.ImageArtifactSHA256 = cfg.ImageArtifactSHA256
@@ -867,16 +858,15 @@ func shellJoin(args []string) string {
func usage() {
fmt.Fprintln(os.Stderr, `usage:
rap-host-agent install -profile-url URL -install-token TOKEN [-node-name NAME] [docker options]
rap-host-agent install -backend-url URL -cluster-id ID -join-token TOKEN -node-name NAME [docker options]
rap-host-agent install-windows -profile-url URL -install-token TOKEN [-node-name NAME] [windows options]
rap-host-agent install-linux -profile-url URL -install-token TOKEN [-node-name NAME] [linux/systemd options]
rap-host-agent install-updater (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -state-dir DIR -container-name NAME
rap-host-agent update-host-agent (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -state-dir DIR
rap-host-agent update-host-agent-loop (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -state-dir DIR
rap-host-agent monitor-loop (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -state-dir DIR --watch-container NAME
rap-host-agent monitor-once (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -state-dir DIR --watch-container NAME
rap-host-agent update (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -node-id ID [-container-name NAME]
rap-host-agent update-loop (-backend-url URL | -fabric-registry-records-json JSON) -cluster-id ID -node-id ID [-container-name NAME]
rap-host-agent install -join-bundle FILE [docker options]
rap-host-agent install-windows -join-bundle FILE [windows options]
rap-host-agent install-linux -join-bundle FILE [linux/systemd options]
rap-host-agent install-updater -fabric-registry-records-json JSON -cluster-id ID -state-dir DIR -container-name NAME
rap-host-agent update-host-agent -fabric-registry-records-json JSON -cluster-id ID -state-dir DIR
rap-host-agent update-host-agent-loop -fabric-registry-records-json JSON -cluster-id ID -state-dir DIR
rap-host-agent monitor-loop -fabric-registry-records-json JSON -cluster-id ID -state-dir DIR --watch-container NAME
rap-host-agent monitor-once -fabric-registry-records-json JSON -cluster-id ID -state-dir DIR --watch-container NAME
rap-host-agent update -fabric-registry-records-json JSON -cluster-id ID -node-id ID [-container-name NAME]
rap-host-agent update-loop -fabric-registry-records-json JSON -cluster-id ID -node-id ID [-container-name NAME]
rap-host-agent status [-container-name NAME]`)
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -7,7 +7,7 @@ import (
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
const Version = "0.2.321-directreadytarget"
const Version = "0.2.372-vpn-opaque-channel"
func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) client.EnrollRequest {
return client.EnrollRequest{
+29 -367
View File
@@ -1,22 +1,11 @@
package client
import (
"bytes"
"context"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"
)
type Client struct {
baseURL string
httpClient *http.Client
}
type Client struct{}
type RawControlRequest struct {
Method string `json:"method"`
@@ -45,19 +34,19 @@ type EnrollResponse struct {
JoinRequest json.RawMessage `json:"join_request"`
}
type EnrollmentBootstrapRequest struct {
type EnrollmentJoinRequest struct {
ClusterID string `json:"cluster_id"`
NodeFingerprint string `json:"node_fingerprint"`
PublicKey string `json:"public_key"`
}
type EnrollmentBootstrapResponse struct {
Status string `json:"status"`
JoinRequest json.RawMessage `json:"join_request"`
Bootstrap *NodeBootstrap `json:"node_bootstrap,omitempty"`
type EnrollmentJoinResponse struct {
Status string `json:"status"`
JoinRequest json.RawMessage `json:"join_request"`
JoinContract *NodeJoinContract `json:"node_join,omitempty"`
}
type NodeBootstrap struct {
type NodeJoinContract struct {
NodeID string `json:"node_id"`
ClusterID string `json:"cluster_id"`
IdentityStatus string `json:"identity_status"`
@@ -84,15 +73,19 @@ type HeartbeatResponse struct {
}
type NodeUpdateHint struct {
SchemaVersion string `json:"schema_version"`
Generation string `json:"generation,omitempty"`
CheckNow bool `json:"check_now"`
Products []string `json:"products,omitempty"`
Reason string `json:"reason,omitempty"`
DeliveryMode string `json:"delivery_mode,omitempty"`
SubscriptionStatus string `json:"subscription_status,omitempty"`
UpdateService *NodeUpdateServiceAssignment `json:"update_service,omitempty"`
FallbackPollSeconds int `json:"fallback_poll_seconds,omitempty"`
SchemaVersion string `json:"schema_version"`
Generation string `json:"generation,omitempty"`
CheckNow bool `json:"check_now"`
Products []string `json:"products,omitempty"`
TargetVersions map[string]string `json:"target_versions,omitempty"`
Reason string `json:"reason,omitempty"`
DeliveryMode string `json:"delivery_mode,omitempty"`
SubscriptionStatus string `json:"subscription_status,omitempty"`
UpdateService *NodeUpdateServiceAssignment `json:"update_service,omitempty"`
UpdateServiceCandidates []NodeUpdateServiceAssignment `json:"update_service_candidates,omitempty"`
RescuePollSeconds int `json:"rescue_poll_seconds,omitempty"`
AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"`
AuthoritySignature *ClusterSignature `json:"authority_signature,omitempty"`
}
type NodeUpdateServiceAssignment struct {
@@ -207,6 +200,13 @@ type NodeVPNAssignmentLease struct {
}
type NodeVPNAssignment struct {
TunnelID string `json:"tunnel_id,omitempty"`
PoolID string `json:"pool_id,omitempty"`
ServiceID string `json:"service_id,omitempty"`
LocalServiceID string `json:"local_service_id,omitempty"`
RemoteServiceID string `json:"remote_service_id,omitempty"`
ServiceKind string `json:"service_kind,omitempty"`
ServiceClass string `json:"service_class,omitempty"`
VPNConnectionID string `json:"vpn_connection_id"`
ClusterID string `json:"cluster_id"`
OrganizationID string `json:"organization_id"`
@@ -624,6 +624,7 @@ type EndpointCandidateHealthObservation struct {
EndpointID string `json:"endpoint_id"`
Source string `json:"source,omitempty"`
ReporterNodeID string `json:"reporter_node_id,omitempty"`
ReporterRegion string `json:"reporter_region,omitempty"`
LastLatencyMs int64 `json:"last_latency_ms,omitempty"`
SuccessCount uint64 `json:"success_count,omitempty"`
FailureCount uint64 `json:"failure_count,omitempty"`
@@ -632,343 +633,4 @@ type EndpointCandidateHealthObservation struct {
ObservedAt time.Time `json:"observed_at,omitempty"`
}
// New builds a Client that issues requests against baseURL through an
// http.Client configured with a 15-second request timeout.
func New(baseURL string) *Client {
	transport := &http.Client{Timeout: 15 * time.Second}
	return &Client{baseURL: baseURL, httpClient: transport}
}
// Enroll POSTs request as JSON to /node-agents/enroll and returns the decoded
// reply. On any error the zero EnrollResponse is returned with that error.
func (c *Client) Enroll(ctx context.Context, request EnrollRequest) (EnrollResponse, error) {
	enrolled := EnrollResponse{}
	err := c.postJSON(ctx, "/node-agents/enroll", request, &enrolled)
	if err != nil {
		return EnrollResponse{}, err
	}
	return enrolled, nil
}
// BootstrapEnrollment POSTs request as JSON to
// /node-agents/enrollments/{joinRequestID}/bootstrap and returns the decoded
// reply. On any error the zero response is returned with that error.
func (c *Client) BootstrapEnrollment(ctx context.Context, joinRequestID string, request EnrollmentBootstrapRequest) (EnrollmentBootstrapResponse, error) {
	endpoint := "/node-agents/enrollments/" + joinRequestID + "/bootstrap"
	result := EnrollmentBootstrapResponse{}
	if err := c.postJSON(ctx, endpoint, request, &result); err != nil {
		return EnrollmentBootstrapResponse{}, err
	}
	return result, nil
}
// Heartbeat POSTs request as JSON to
// /clusters/{clusterID}/nodes/{nodeID}/heartbeats and returns the decoded
// reply. On any error the zero HeartbeatResponse is returned with that error.
func (c *Client) Heartbeat(ctx context.Context, clusterID, nodeID string, request HeartbeatRequest) (HeartbeatResponse, error) {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/heartbeats"
	reply := HeartbeatResponse{}
	if err := c.postJSON(ctx, endpoint, request, &reply); err != nil {
		return HeartbeatResponse{}, err
	}
	return reply, nil
}
// NodeUpdatePlan GETs /clusters/{clusterID}/nodes/{nodeID}/updates/plan with
// the request fields encoded as query parameters and returns the Plan field of
// the decoded reply. The "channel" parameter is sent only when
// request.Channel is non-empty.
func (c *Client) NodeUpdatePlan(ctx context.Context, clusterID, nodeID string, request NodeUpdatePlanRequest) (NodeUpdatePlan, error) {
	query := url.Values{
		"product":         {request.Product},
		"current_version": {request.CurrentVersion},
		"os":              {request.OS},
		"arch":            {request.Arch},
		"install_type":    {request.InstallType},
	}
	if request.Channel != "" {
		query.Set("channel", request.Channel)
	}
	endpoint := fmt.Sprintf("/clusters/%s/nodes/%s/updates/plan?%s", clusterID, nodeID, query.Encode())

	var envelope NodeUpdatePlanResponse
	if err := c.getJSON(ctx, endpoint, &envelope); err != nil {
		return NodeUpdatePlan{}, err
	}
	return envelope.Plan, nil
}
// ReportNodeUpdateStatus POSTs request as JSON to
// /clusters/{clusterID}/nodes/{nodeID}/updates/status. The response body is
// not decoded.
func (c *Client) ReportNodeUpdateStatus(ctx context.Context, clusterID, nodeID string, request NodeUpdateStatusRequest) error {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/updates/status"
	return c.postJSON(ctx, endpoint, request, nil)
}
// DesiredWorkloads GETs /clusters/{clusterID}/nodes/{nodeID}/workloads/desired
// and returns the "desired_workloads" array of the decoded reply.
func (c *Client) DesiredWorkloads(ctx context.Context, clusterID, nodeID string) ([]DesiredWorkload, error) {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/workloads/desired"
	envelope := struct {
		DesiredWorkloads []DesiredWorkload `json:"desired_workloads"`
	}{}
	if err := c.getJSON(ctx, endpoint, &envelope); err != nil {
		return nil, err
	}
	return envelope.DesiredWorkloads, nil
}
// ReportWorkloadStatus POSTs request as JSON to
// /clusters/{clusterID}/nodes/{nodeID}/workloads/{serviceType}/status. The
// response body is not decoded.
func (c *Client) ReportWorkloadStatus(ctx context.Context, clusterID, nodeID, serviceType string, request WorkloadStatusRequest) error {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/workloads/" + serviceType + "/status"
	return c.postJSON(ctx, endpoint, request, nil)
}
// NodeVPNAssignments GETs /clusters/{clusterID}/nodes/{nodeID}/vpn/assignments
// and returns the "vpn_assignments" array of the decoded reply.
func (c *Client) NodeVPNAssignments(ctx context.Context, clusterID, nodeID string) ([]NodeVPNAssignment, error) {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/vpn/assignments"
	envelope := struct {
		Assignments []NodeVPNAssignment `json:"vpn_assignments"`
	}{}
	if err := c.getJSON(ctx, endpoint, &envelope); err != nil {
		return nil, err
	}
	return envelope.Assignments, nil
}
// ReportNodeVPNAssignmentStatus POSTs request as JSON to
// /clusters/{clusterID}/nodes/{nodeID}/vpn/assignments/{vpnConnectionID}/status.
// The response body is not decoded.
func (c *Client) ReportNodeVPNAssignmentStatus(ctx context.Context, clusterID, nodeID, vpnConnectionID string, request NodeVPNAssignmentStatusRequest) error {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/vpn/assignments/" + vpnConnectionID + "/status"
	return c.postJSON(ctx, endpoint, request, nil)
}
// AcquireNodeVPNAssignmentLease POSTs request as JSON to
// /clusters/{clusterID}/nodes/{nodeID}/vpn/assignments/{vpnConnectionID}/lease/acquire
// and returns a pointer to the "lease" object of the decoded reply, or nil
// together with the error when the call fails.
func (c *Client) AcquireNodeVPNAssignmentLease(ctx context.Context, clusterID, nodeID, vpnConnectionID string, request NodeVPNAssignmentLeaseAcquireRequest) (*NodeVPNAssignmentLease, error) {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/vpn/assignments/" + vpnConnectionID + "/lease/acquire"
	envelope := struct {
		Lease NodeVPNAssignmentLease `json:"lease"`
	}{}
	if err := c.postJSON(ctx, endpoint, request, &envelope); err != nil {
		return nil, err
	}
	lease := envelope.Lease
	return &lease, nil
}
// RenewNodeVPNAssignmentLease POSTs request as JSON to
// /clusters/{clusterID}/nodes/{nodeID}/vpn/assignments/{vpnConnectionID}/lease/{leaseID}/renew.
// The response body is not decoded.
func (c *Client) RenewNodeVPNAssignmentLease(ctx context.Context, clusterID, nodeID, vpnConnectionID, leaseID string, request NodeVPNAssignmentLeaseRenewRequest) error {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/vpn/assignments/" + vpnConnectionID + "/lease/" + leaseID + "/renew"
	return c.postJSON(ctx, endpoint, request, nil)
}
// SendVPNGatewayPacket POSTs a single raw packet to
// /clusters/{clusterID}/vpn-connections/{vpnConnectionID}/tunnel/gateway/packets.
// An empty packet is a no-op: nothing is sent and nil is returned.
func (c *Client) SendVPNGatewayPacket(ctx context.Context, clusterID, vpnConnectionID string, packet []byte) error {
	if len(packet) > 0 {
		endpoint := "/clusters/" + clusterID + "/vpn-connections/" + vpnConnectionID + "/tunnel/gateway/packets"
		return c.postBytes(ctx, endpoint, packet)
	}
	return nil
}
// SendVPNGatewayPacketBatch drops empty packets, frames the remainder with
// encodeVPNPacketBatch and POSTs the result to the gateway packets endpoint
// with batch=true. When no non-empty packet remains nothing is sent and nil
// is returned.
func (c *Client) SendVPNGatewayPacketBatch(ctx context.Context, clusterID, vpnConnectionID string, packets [][]byte) error {
	nonEmpty := cleanVPNPacketBatch(packets)
	if len(nonEmpty) == 0 {
		return nil
	}
	endpoint := "/clusters/" + clusterID + "/vpn-connections/" + vpnConnectionID + "/tunnel/gateway/packets?batch=true"
	framed := encodeVPNPacketBatch(nonEmpty)
	return c.postBytes(ctx, endpoint, framed)
}
// ReceiveVPNGatewayPacket GETs the gateway packets endpoint for the given VPN
// connection, passing timeout as the timeout_ms query parameter in
// milliseconds. It returns whatever getBytes reports for that request.
func (c *Client) ReceiveVPNGatewayPacket(ctx context.Context, clusterID, vpnConnectionID string, timeout time.Duration) ([]byte, bool, error) {
	waitMillis := timeout.Milliseconds()
	endpoint := fmt.Sprintf("/clusters/%s/vpn-connections/%s/tunnel/gateway/packets?timeout_ms=%d", clusterID, vpnConnectionID, waitMillis)
	return c.getBytes(ctx, endpoint)
}
// ReceiveVPNGatewayPacketBatch GETs the gateway packets endpoint with
// batch=true and timeout_ms set from timeout, then decodes the body with
// decodeVPNPacketBatch. When getBytes reports no payload it returns (nil, nil).
func (c *Client) ReceiveVPNGatewayPacketBatch(ctx context.Context, clusterID, vpnConnectionID string, timeout time.Duration) ([][]byte, error) {
	waitMillis := timeout.Milliseconds()
	endpoint := fmt.Sprintf("/clusters/%s/vpn-connections/%s/tunnel/gateway/packets?batch=true&timeout_ms=%d", clusterID, vpnConnectionID, waitMillis)
	framed, present, err := c.getBytes(ctx, endpoint)
	if err != nil {
		return nil, err
	}
	if !present {
		return nil, nil
	}
	return decodeVPNPacketBatch(framed)
}
// ReportMeshLink POSTs request as JSON to /clusters/{clusterID}/mesh/links.
// The response body is not decoded.
func (c *Client) ReportMeshLink(ctx context.Context, clusterID string, request MeshLinkObservationRequest) error {
	endpoint := "/clusters/" + clusterID + "/mesh/links"
	return c.postJSON(ctx, endpoint, request, nil)
}
// ReportTelemetry POSTs request as JSON to
// /clusters/{clusterID}/nodes/{nodeID}/telemetry. The response body is not
// decoded.
func (c *Client) ReportTelemetry(ctx context.Context, clusterID, nodeID string, request TelemetryRequest) error {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/telemetry"
	return c.postJSON(ctx, endpoint, request, nil)
}
// SyntheticMeshConfig GETs
// /clusters/{clusterID}/nodes/{nodeID}/mesh/synthetic-config and returns the
// "synthetic_mesh_config" object of the decoded reply. On any error the zero
// SyntheticMeshConfig is returned with that error.
func (c *Client) SyntheticMeshConfig(ctx context.Context, clusterID, nodeID string) (SyntheticMeshConfig, error) {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/mesh/synthetic-config"
	envelope := struct {
		Config SyntheticMeshConfig `json:"synthetic_mesh_config"`
	}{}
	if err := c.getJSON(ctx, endpoint, &envelope); err != nil {
		return SyntheticMeshConfig{}, err
	}
	return envelope.Config, nil
}
// AdminRuntimeProjection POSTs request as JSON to
// /clusters/{clusterID}/nodes/{nodeID}/admin-runtime/projection and returns
// the decoded reply. On any error the zero response is returned with that
// error.
func (c *Client) AdminRuntimeProjection(ctx context.Context, clusterID, nodeID string, request AdminRuntimeProjectionRequest) (AdminRuntimeProjectionResponse, error) {
	endpoint := "/clusters/" + clusterID + "/nodes/" + nodeID + "/admin-runtime/projection"
	projection := AdminRuntimeProjectionResponse{}
	if err := c.postJSON(ctx, endpoint, request, &projection); err != nil {
		return AdminRuntimeProjectionResponse{}, err
	}
	return projection, nil
}
// RawControl forwards an arbitrary control request to the backend and returns
// the HTTP status code together with the raw response body.
//
// The method is trimmed and upper-cased, defaulting to GET when blank. The
// path is trimmed and must start with "/"; it is appended to the client's base
// URL. A request body is attached (with a JSON content type) only when
// request.Body is non-empty and is not the literal "null". At most 2 MiB of
// the response body is read. Non-2xx statuses are NOT turned into errors; the
// caller inspects StatusCode.
func (c *Client) RawControl(ctx context.Context, request RawControlRequest) (RawControlResponse, error) {
	var none RawControlResponse

	verb := strings.ToUpper(strings.TrimSpace(request.Method))
	if verb == "" {
		verb = http.MethodGet
	}

	target := strings.TrimSpace(request.Path)
	if !strings.HasPrefix(target, "/") {
		return none, fmt.Errorf("control path must be relative")
	}

	// Keep the reader a nil interface when there is nothing to send so that
	// net/http treats the request as body-less.
	var outgoing io.Reader
	hasBody := len(request.Body) > 0 && string(request.Body) != "null"
	if hasBody {
		outgoing = bytes.NewReader(request.Body)
	}

	req, err := http.NewRequestWithContext(ctx, verb, c.baseURL+target, outgoing)
	if err != nil {
		return none, err
	}
	if hasBody {
		req.Header.Set("Content-Type", "application/json")
	}

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return none, err
	}
	defer resp.Body.Close()

	raw, err := io.ReadAll(io.LimitReader(resp.Body, 2*1024*1024))
	if err != nil {
		return none, err
	}
	return RawControlResponse{StatusCode: resp.StatusCode, Body: json.RawMessage(raw)}, nil
}
// getJSON issues a GET for baseURL+path. Any status outside 200-299 yields a
// "backend returned status N" error. On success the body is JSON-decoded into
// response, unless response is nil, in which case the body is ignored.
func (c *Client) getJSON(ctx context.Context, path string, response any) error {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+path, nil)
	if err != nil {
		return err
	}
	resp, err := c.httpClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	switch code := resp.StatusCode; {
	case code < 200 || code >= 300:
		return fmt.Errorf("backend returned status %d", code)
	case response == nil:
		return nil
	default:
		return json.NewDecoder(resp.Body).Decode(response)
	}
}
// getBytes issues a GET for baseURL+path and returns the raw response body.
//
// The boolean result reports whether a payload was received: it is false (with
// a nil error) for a 204 No Content reply or an empty body. Any other status
// outside 200-299 yields a "backend returned status N" error. At most
// vpnPacketBatchMaxBytes bytes of the body are read.
func (c *Client) getBytes(ctx context.Context, path string) ([]byte, bool, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+path, nil)
	if err != nil {
		return nil, false, err
	}
	resp, err := c.httpClient.Do(req)
	if err != nil {
		return nil, false, err
	}
	defer resp.Body.Close()

	code := resp.StatusCode
	if code == http.StatusNoContent {
		return nil, false, nil
	}
	if succeeded := code >= 200 && code < 300; !succeeded {
		return nil, false, fmt.Errorf("backend returned status %d", code)
	}

	data, err := io.ReadAll(io.LimitReader(resp.Body, vpnPacketBatchMaxBytes))
	switch {
	case err != nil:
		return nil, false, err
	case len(data) == 0:
		return nil, false, nil
	}
	return data, true, nil
}
// postBytes POSTs payload to baseURL+path with Content-Type
// application/octet-stream. Any status outside 200-299 yields a
// "backend returned status N" error; the response body is not read.
func (c *Client) postBytes(ctx context.Context, path string, payload []byte) error {
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+path, bytes.NewReader(payload))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/octet-stream")

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code >= 200 && code < 300 {
		return nil
	}
	return fmt.Errorf("backend returned status %d", resp.StatusCode)
}
// postJSON marshals request to JSON and POSTs it to baseURL+path with
// Content-Type application/json. Any status outside 200-299 yields a
// "backend returned status N" error. On success the body is JSON-decoded into
// response, unless response is nil, in which case the body is ignored.
func (c *Client) postJSON(ctx context.Context, path string, request any, response any) error {
	encoded, err := json.Marshal(request)
	if err != nil {
		return err
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+path, bytes.NewReader(encoded))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if succeeded := resp.StatusCode >= 200 && resp.StatusCode < 300; !succeeded {
		return fmt.Errorf("backend returned status %d", resp.StatusCode)
	}
	if response != nil {
		return json.NewDecoder(resp.Body).Decode(response)
	}
	return nil
}
// Size limits for the VPN gateway packet transport.
const (
	// vpnPacketMaxBytes is the largest item size decodeVPNPacketBatch accepts
	// for a single packet inside a batch.
	vpnPacketMaxBytes = 65535
	// vpnPacketBatchMaxBytes caps how many response-body bytes getBytes reads
	// (4 MiB).
	vpnPacketBatchMaxBytes = 4 * 1024 * 1024
)
// encodeVPNPacketBatch serialises packets into a single buffer in which every
// non-empty packet is written as a 4-byte big-endian length followed by the
// packet bytes. Empty and nil packets are skipped. The result is always
// non-nil; it has length zero when there is nothing to encode.
//
// Empty packets are filtered inline rather than through cleanVPNPacketBatch,
// which would deep-copy every packet only for it to be copied again into the
// output buffer.
func encodeVPNPacketBatch(packets [][]byte) []byte {
	total := 0
	for _, packet := range packets {
		if len(packet) > 0 {
			total += 4 + len(packet)
		}
	}

	out := make([]byte, total)
	offset := 0
	for _, packet := range packets {
		if len(packet) == 0 {
			continue
		}
		binary.BigEndian.PutUint32(out[offset:offset+4], uint32(len(packet)))
		offset += 4
		offset += copy(out[offset:], packet)
	}
	return out
}
// decodeVPNPacketBatch parses a buffer of length-prefixed packets: each item
// is a 4-byte big-endian size followed by that many bytes. Every returned
// packet is a private copy that does not alias payload. An empty payload
// yields (nil, nil).
//
// It returns an error when a header or item is truncated, or when an item
// size is zero or exceeds vpnPacketMaxBytes.
//
// The loop already rejects zero-length items and already copies each packet,
// so the result is returned directly instead of being passed through
// cleanVPNPacketBatch, which would copy every packet a second time.
func decodeVPNPacketBatch(payload []byte) ([][]byte, error) {
	var packets [][]byte
	for offset := 0; offset < len(payload); {
		if len(payload)-offset < 4 {
			return nil, fmt.Errorf("truncated vpn packet batch header")
		}
		size := int(binary.BigEndian.Uint32(payload[offset : offset+4]))
		offset += 4
		// size <= 0 also covers a uint32 that wrapped negative where int is 32 bits.
		if size <= 0 || size > vpnPacketMaxBytes {
			return nil, fmt.Errorf("invalid vpn packet batch item size")
		}
		if size > len(payload)-offset {
			return nil, fmt.Errorf("truncated vpn packet batch item")
		}
		packets = append(packets, append([]byte(nil), payload[offset:offset+size]...))
		offset += size
	}
	return packets, nil
}
// cleanVPNPacketBatch returns a new batch holding copies of the non-empty
// packets, in their original order. A nil or zero-length input yields nil; an
// input made up solely of empty packets yields an empty, non-nil slice.
func cleanVPNPacketBatch(packets [][]byte) [][]byte {
	if len(packets) == 0 {
		return nil
	}
	kept := make([][]byte, 0, len(packets))
	for i := range packets {
		if n := len(packets[i]); n > 0 {
			dup := make([]byte, n)
			copy(dup, packets[i])
			kept = append(kept, dup)
		}
	}
	return kept
}
// New returns an empty Client. The string argument is accepted but ignored.
func New(_ string) *Client {
	return &Client{}
}
+42 -68
View File
@@ -14,7 +14,6 @@ import (
const MaxMeshProductionObservationSinkCapacity = 10000
type Config struct {
BackendURL string
ClusterID string
ClusterAuthorityPublicKey string
ClusterAuthorityFingerprint string
@@ -30,7 +29,7 @@ type Config struct {
HeartbeatInterval time.Duration
EnrollmentPollInterval time.Duration
EnrollmentPollTimeout time.Duration
MeshSyntheticRuntimeEnabled bool
FabricRuntimeEnabled bool
MeshProductionForwardingEnabled bool
VPNFabricSessionTransportEnabled bool
MeshQUICFabricEnabled bool
@@ -39,17 +38,18 @@ type Config struct {
VPNFabricQUICMaxStreamsPerConn int
VPNFabricQUICIdleTTL time.Duration
MeshProductionObservationSinkCapacity int
MeshListenAddr string
MeshListenPortMode string
MeshListenAutoPortStart int
MeshListenAutoPortEnd int
FabricListenAddr string
FabricListenPortMode string
FabricListenAutoPortStart int
FabricListenAutoPortEnd int
MeshAdvertiseEndpoint string
MeshAdvertiseEndpointsJSON string
FabricRegistryRecordsJSON string
MeshAdvertiseTransport string
MeshConnectivityMode string
MeshNATType string
MeshLocalSegmentID string
MeshSiteID string
MeshLocalityGroupID string
MeshNATGroupID string
MeshSTUNReflexiveEndpoint string
MeshSTUNServer string
@@ -72,7 +72,6 @@ func Load(args []string, env map[string]string) (Config, error) {
defaultStateDir := filepath.Join(".", ".rap-node-agent")
fs := flag.NewFlagSet("rap-node-agent", flag.ContinueOnError)
cfg := Config{}
fs.StringVar(&cfg.BackendURL, "backend-url", getEnv(env, "RAP_BACKEND_URL", "http://127.0.0.1:8080/api/v1"), "Backend API base URL.")
fs.StringVar(&cfg.ClusterID, "cluster-id", getEnv(env, "RAP_CLUSTER_ID", ""), "Cluster ID.")
fs.StringVar(&cfg.ClusterAuthorityPublicKey, "cluster-authority-public-key", getEnv(env, "RAP_CLUSTER_AUTHORITY_PUBLIC_KEY", ""), "Pinned cluster authority Ed25519 public key.")
fs.StringVar(&cfg.ClusterAuthorityFingerprint, "cluster-authority-fingerprint", getEnv(env, "RAP_CLUSTER_AUTHORITY_FINGERPRINT", ""), "Pinned cluster authority key fingerprint.")
@@ -85,26 +84,27 @@ func Load(args []string, env map[string]string) (Config, error) {
fs.StringVar(&cfg.WebIngressSigningKeyID, "web-ingress-signing-key-id", getEnv(env, "RAP_WEB_INGRESS_SIGNING_KEY_ID", ""), "Optional key id for web ingress envelope signatures.")
fs.StringVar(&cfg.WebIngressTrustedKeysJSON, "web-ingress-trusted-keys-json", getEnv(env, "RAP_WEB_INGRESS_TRUSTED_KEYS_JSON", ""), "JSON map or array of trusted Ed25519 public keys for web ingress runtime receiver.")
fs.StringVar(&cfg.WebIngressRuntimeServiceClasses, "web-ingress-runtime-service-classes", getEnv(env, "RAP_WEB_INGRESS_RUNTIME_SERVICE_CLASSES", ""), "Optional comma-separated allow-list of web ingress runtime service classes accepted by this node.")
fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getEnvBool(env, "RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable C17A synthetic fabric probe runtime. Disabled by default.")
fs.BoolVar(&cfg.FabricRuntimeEnabled, "fabric-runtime-enabled", getEnvBool(env, "RAP_FABRIC_RUNTIME_ENABLED", false), "Enable C17A synthetic fabric probe runtime. Disabled by default.")
fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getEnvBool(env, "RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production fabric-control direct next-hop forwarding gate. Disabled by default.")
fs.BoolVar(&cfg.VPNFabricSessionTransportEnabled, "vpn-fabric-session-transport-enabled", getEnvBool(env, "RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED", false), "Route VPN packet transport over persistent fabric session when explicitly enabled. Disabled by default.")
fs.BoolVar(&cfg.MeshQUICFabricEnabled, "mesh-quic-fabric-enabled", getEnvBool(env, "RAP_MESH_QUIC_FABRIC_ENABLED", false), "Enable QUIC/UDP fabric listener. Disabled by default.")
fs.StringVar(&cfg.MeshQUICFabricListenAddr, "mesh-quic-fabric-listen-addr", getEnv(env, "RAP_MESH_QUIC_FABRIC_LISTEN_ADDR", ""), "Listen address for QUIC/UDP fabric endpoint, for example :19443.")
fs.IntVar(&cfg.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getEnvInt(env, "RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 4), "VPN fabric-session stream shards per traffic class.")
fs.IntVar(&cfg.VPNFabricSessionStreamShards, "vpn-fabric-session-stream-shards", getEnvInt(env, "RAP_VPN_FABRIC_SESSION_STREAM_SHARDS", 8), "VPN fabric-session stream shards per traffic class.")
fs.IntVar(&cfg.VPNFabricQUICMaxStreamsPerConn, "vpn-fabric-quic-max-streams-per-conn", getEnvInt(env, "RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN", 64), "Maximum logical fabric-session streams per cached VPN QUIC carrier connection.")
fs.DurationVar(&cfg.VPNFabricQUICIdleTTL, "vpn-fabric-quic-idle-ttl", time.Duration(getEnvInt(env, "RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS", 300))*time.Second, "Idle TTL for cached VPN QUIC carrier connections.")
fs.IntVar(&cfg.MeshProductionObservationSinkCapacity, "mesh-production-observation-sink-capacity", getEnvSignedInt(env, "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Bounded local metadata-only production envelope observation sink capacity. Disabled when 0.")
fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getEnv(env, "RAP_MESH_LISTEN_ADDR", ""), "Listen address for disabled-by-default historical synthetic mesh HTTP endpoint.")
fs.StringVar(&cfg.MeshListenPortMode, "mesh-listen-port-mode", getEnv(env, "RAP_MESH_LISTEN_PORT_MODE", "manual"), "Mesh listen port behavior: manual, auto, or disabled.")
fs.IntVar(&cfg.MeshListenAutoPortStart, "mesh-listen-auto-port-start", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_START", 19131), "First port used when mesh listen port mode is auto.")
fs.IntVar(&cfg.MeshListenAutoPortEnd, "mesh-listen-auto-port-end", getEnvInt(env, "RAP_MESH_LISTEN_AUTO_PORT_END", 19231), "Last port used when mesh listen port mode is auto.")
fs.StringVar(&cfg.FabricListenAddr, "fabric-listen-addr", getEnv(env, "RAP_FABRIC_LISTEN_ADDR", ""), "Optional node listener address used by the QUIC fabric runtime contract.")
fs.StringVar(&cfg.FabricListenPortMode, "fabric-listen-port-mode", getEnv(env, "RAP_FABRIC_LISTEN_PORT_MODE", "manual"), "Fabric listen port behavior: manual, auto, or disabled.")
fs.IntVar(&cfg.FabricListenAutoPortStart, "fabric-listen-auto-port-start", getEnvInt(env, "RAP_FABRIC_LISTEN_AUTO_PORT_START", 19131), "First port used when fabric listen port mode is auto.")
fs.IntVar(&cfg.FabricListenAutoPortEnd, "fabric-listen-auto-port-end", getEnvInt(env, "RAP_FABRIC_LISTEN_AUTO_PORT_END", 19231), "Last port used when fabric listen port mode is auto.")
fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint reported to the Control Plane. Empty disables endpoint reporting.")
fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "JSON array of advertised mesh endpoint candidates, including private/corporate endpoints.")
fs.StringVar(&cfg.FabricRegistryRecordsJSON, "fabric-registry-records-json", getEnv(env, "RAP_FABRIC_REGISTRY_RECORDS_JSON", ""), "JSON array of signed QUIC-only fabric registry gossip records used as bootstrap discovery seeds.")
fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getEnv(env, "RAP_MESH_ADVERTISE_TRANSPORT", "quic"), "Transport label for the advertised mesh endpoint.")
fs.StringVar(&cfg.MeshConnectivityMode, "mesh-connectivity-mode", getEnv(env, "RAP_MESH_CONNECTIVITY_MODE", "direct"), "Connectivity mode reported with the advertised mesh endpoint.")
fs.StringVar(&cfg.MeshNATType, "mesh-nat-type", getEnv(env, "RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint reported with the advertised mesh endpoint.")
fs.StringVar(&cfg.MeshLocalSegmentID, "mesh-local-segment-id", getEnv(env, "RAP_MESH_LOCAL_SEGMENT_ID", ""), "Optional local LAN/site segment ID advertised with QUIC endpoint candidates.")
fs.StringVar(&cfg.MeshSiteID, "mesh-site-id", getEnv(env, "RAP_MESH_SITE_ID", ""), "Optional physical or logical site identifier advertised with QUIC endpoint candidates.")
fs.StringVar(&cfg.MeshLocalityGroupID, "mesh-locality-group-id", getEnv(env, "RAP_MESH_LOCALITY_GROUP_ID", ""), "Optional locality group identifier used to decide whether private QUIC endpoints are actually local.")
fs.StringVar(&cfg.MeshNATGroupID, "mesh-nat-group-id", getEnv(env, "RAP_MESH_NAT_GROUP_ID", ""), "Optional NAT group ID advertised with QUIC endpoint candidates.")
fs.StringVar(&cfg.MeshSTUNReflexiveEndpoint, "mesh-stun-reflexive-endpoint", getEnv(env, "RAP_MESH_STUN_REFLEXIVE_ENDPOINT", ""), "Optional STUN-discovered reflexive QUIC endpoint, for example quic://203.0.113.10:19443.")
fs.StringVar(&cfg.MeshSTUNServer, "mesh-stun-server", getEnv(env, "RAP_MESH_STUN_SERVER", ""), "Optional STUN server name used to discover the reflexive endpoint.")
@@ -127,21 +127,20 @@ func Load(args []string, env map[string]string) (Config, error) {
if err := fs.Parse(args); err != nil {
return Config{}, err
}
cfg.BackendURL = strings.TrimRight(strings.TrimSpace(cfg.BackendURL), "/")
cfg.ClusterID = strings.TrimSpace(cfg.ClusterID)
cfg.ClusterAuthorityPublicKey = strings.TrimSpace(cfg.ClusterAuthorityPublicKey)
cfg.ClusterAuthorityFingerprint = strings.TrimSpace(cfg.ClusterAuthorityFingerprint)
cfg.JoinToken = strings.TrimSpace(cfg.JoinToken)
cfg.NodeName = strings.TrimSpace(cfg.NodeName)
cfg.StateDir = strings.TrimSpace(cfg.StateDir)
cfg.MeshListenAddr = strings.TrimSpace(cfg.MeshListenAddr)
cfg.FabricListenAddr = strings.TrimSpace(cfg.FabricListenAddr)
cfg.MeshQUICFabricListenAddr = strings.TrimSpace(cfg.MeshQUICFabricListenAddr)
cfg.MeshListenPortMode = strings.ToLower(strings.TrimSpace(cfg.MeshListenPortMode))
cfg.FabricListenPortMode = strings.ToLower(strings.TrimSpace(cfg.FabricListenPortMode))
if cfg.VPNFabricSessionStreamShards <= 0 {
cfg.VPNFabricSessionStreamShards = 4
cfg.VPNFabricSessionStreamShards = 8
}
if cfg.VPNFabricSessionStreamShards > 64 {
cfg.VPNFabricSessionStreamShards = 64
if cfg.VPNFabricSessionStreamShards > 128 {
cfg.VPNFabricSessionStreamShards = 128
}
if cfg.VPNFabricQUICMaxStreamsPerConn <= 0 {
cfg.VPNFabricQUICMaxStreamsPerConn = 64
@@ -156,16 +155,15 @@ func Load(args []string, env map[string]string) (Config, error) {
if cfg.MeshAdvertiseTransport == "" {
cfg.MeshAdvertiseTransport = "quic"
}
cfg.MeshAdvertiseTransport = normalizeLegacyAdvertiseTransport(cfg.MeshAdvertiseTransport)
cfg.MeshAdvertiseEndpoint = normalizeLegacyEndpointSchemeToQUIC(cfg.MeshAdvertiseEndpoint)
cfg.MeshConnectivityMode = strings.TrimSpace(cfg.MeshConnectivityMode)
cfg.MeshNATType = strings.TrimSpace(cfg.MeshNATType)
cfg.MeshLocalSegmentID = strings.TrimSpace(cfg.MeshLocalSegmentID)
cfg.MeshSiteID = strings.TrimSpace(cfg.MeshSiteID)
cfg.MeshLocalityGroupID = strings.TrimSpace(cfg.MeshLocalityGroupID)
cfg.MeshNATGroupID = strings.TrimSpace(cfg.MeshNATGroupID)
cfg.MeshSTUNReflexiveEndpoint = normalizeLegacyEndpointSchemeToQUIC(strings.TrimRight(strings.TrimSpace(cfg.MeshSTUNReflexiveEndpoint), "/"))
cfg.MeshSTUNReflexiveEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshSTUNReflexiveEndpoint), "/")
cfg.MeshSTUNServer = strings.TrimSpace(cfg.MeshSTUNServer)
cfg.MeshRelayNodeID = strings.TrimSpace(cfg.MeshRelayNodeID)
cfg.MeshRelayEndpoint = normalizeLegacyEndpointSchemeToQUIC(strings.TrimRight(strings.TrimSpace(cfg.MeshRelayEndpoint), "/"))
cfg.MeshRelayEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshRelayEndpoint), "/")
cfg.MeshRegion = strings.TrimSpace(cfg.MeshRegion)
cfg.MeshSyntheticConfigPath = strings.TrimSpace(cfg.MeshSyntheticConfigPath)
cfg.MeshPeerEndpointsJSON = strings.TrimSpace(cfg.MeshPeerEndpointsJSON)
@@ -177,8 +175,8 @@ func Load(args []string, env map[string]string) (Config, error) {
cfg.RemoteWorkspaceRealAdapterCommand = strings.TrimSpace(cfg.RemoteWorkspaceRealAdapterCommand)
cfg.RemoteWorkspaceRealAdapterArgsJSON = strings.TrimSpace(cfg.RemoteWorkspaceRealAdapterArgsJSON)
cfg.RemoteWorkspaceRealAdapterWorkDir = strings.TrimSpace(cfg.RemoteWorkspaceRealAdapterWorkDir)
if cfg.BackendURL == "" {
return Config{}, errors.New("backend URL is required")
if cfg.FabricRegistryRecordsJSON == "" {
return Config{}, errors.New("fabric registry records are required")
}
if cfg.NodeName == "" {
return Config{}, errors.New("node name is required")
@@ -204,30 +202,30 @@ func Load(args []string, env map[string]string) (Config, error) {
if cfg.FabricRegistryRecordsJSON != "" && !isJSONArray(cfg.FabricRegistryRecordsJSON) {
return Config{}, errors.New("fabric registry records must be a JSON array")
}
switch cfg.MeshListenPortMode {
switch cfg.FabricListenPortMode {
case "", "manual", "auto", "disabled":
if cfg.MeshListenPortMode == "" {
cfg.MeshListenPortMode = "manual"
if cfg.FabricListenPortMode == "" {
cfg.FabricListenPortMode = "manual"
}
default:
return Config{}, errors.New("mesh listen port mode must be manual, auto, or disabled")
return Config{}, errors.New("fabric listen port mode must be manual, auto, or disabled")
}
if cfg.MeshListenAutoPortStart <= 0 || cfg.MeshListenAutoPortEnd <= 0 {
return Config{}, errors.New("mesh listen auto port range must be positive")
if cfg.FabricListenAutoPortStart <= 0 || cfg.FabricListenAutoPortEnd <= 0 {
return Config{}, errors.New("fabric listen auto port range must be positive")
}
if cfg.MeshListenAutoPortStart > cfg.MeshListenAutoPortEnd {
return Config{}, errors.New("mesh listen auto port start must be less than or equal to end")
if cfg.FabricListenAutoPortStart > cfg.FabricListenAutoPortEnd {
return Config{}, errors.New("fabric listen auto port start must be less than or equal to end")
}
if !isQUICAdvertiseTransport(cfg.MeshAdvertiseTransport) {
return Config{}, errors.New("mesh advertise transport must be a QUIC transport label")
}
if hasLegacyEndpointScheme(cfg.MeshAdvertiseEndpoint) {
if hasUnsupportedEndpointScheme(cfg.MeshAdvertiseEndpoint) {
return Config{}, errors.New("mesh advertise endpoint must be a QUIC endpoint")
}
if cfg.MeshSTUNReflexiveEndpoint != "" && hasLegacyEndpointScheme(cfg.MeshSTUNReflexiveEndpoint) {
if cfg.MeshSTUNReflexiveEndpoint != "" && hasUnsupportedEndpointScheme(cfg.MeshSTUNReflexiveEndpoint) {
return Config{}, errors.New("mesh STUN reflexive endpoint must be a QUIC endpoint")
}
if cfg.MeshRelayEndpoint != "" && hasLegacyEndpointScheme(cfg.MeshRelayEndpoint) {
if cfg.MeshRelayEndpoint != "" && hasUnsupportedEndpointScheme(cfg.MeshRelayEndpoint) {
return Config{}, errors.New("mesh relay endpoint must be a QUIC endpoint")
}
return cfg, nil
@@ -242,36 +240,12 @@ func isQUICAdvertiseTransport(label string) bool {
}
}
// normalizeLegacyAdvertiseTransport maps pre-QUIC transport labels onto their
// QUIC equivalents. Matching ignores surrounding whitespace and letter case.
// A label that is not a known legacy alias is returned with surrounding
// whitespace removed and its original casing kept.
func normalizeLegacyAdvertiseTransport(label string) string {
	trimmed := strings.TrimSpace(label)
	switch strings.ToLower(trimmed) {
	case "direct_http", "direct_https", "direct_tcp_tls", "http", "https", "ws", "wss", "websocket":
		// Every direct HTTP/TLS/WebSocket flavour collapses to direct QUIC.
		return "direct_quic"
	case "outbound_reverse", "reverse", "reverse_outbound":
		return "reverse_quic"
	case "relay", "relay_control":
		return "relay_quic"
	}
	// Not a legacy alias: hand back the trimmed label unchanged.
	return trimmed
}
// normalizeLegacyEndpointSchemeToQUIC rewrites an endpoint that starts with an
// http, https, ws, or wss scheme so that it starts with quic:// instead.
// Surrounding whitespace and trailing slashes are stripped first, and the
// scheme comparison is case-insensitive. An endpoint with any other scheme, or
// with none, is returned in its trimmed form.
func normalizeLegacyEndpointSchemeToQUIC(endpoint string) string {
	trimmed := strings.TrimRight(strings.TrimSpace(endpoint), "/")
	folded := strings.ToLower(trimmed)
	legacySchemes := [...]string{"http://", "https://", "ws://", "wss://"}
	for i := 0; i < len(legacySchemes); i++ {
		scheme := legacySchemes[i]
		if !strings.HasPrefix(folded, scheme) {
			continue
		}
		// Slice the original (not lower-cased) text so the host and path keep
		// their casing; only the scheme is swapped.
		return "quic://" + trimmed[len(scheme):]
	}
	return trimmed
}
func hasLegacyEndpointScheme(endpoint string) bool {
func hasUnsupportedEndpointScheme(endpoint string) bool {
endpoint = strings.ToLower(strings.TrimSpace(endpoint))
return strings.HasPrefix(endpoint, "http://") ||
strings.HasPrefix(endpoint, "https://") ||
strings.HasPrefix(endpoint, "ws://") ||
strings.HasPrefix(endpoint, "wss://")
if endpoint == "" || !strings.Contains(endpoint, "://") {
return false
}
return !strings.HasPrefix(endpoint, "quic://")
}
func isJSONArray(value string) bool {
@@ -1,13 +1,13 @@
package config
import (
"strings"
"testing"
"time"
)
func TestLoadConfigFromEnvAndArgs(t *testing.T) {
cfg, err := Load([]string{"-node-name", "node-b"}, map[string]string{
"RAP_BACKEND_URL": "http://backend/api/v1/",
"RAP_CLUSTER_ID": "cluster-1",
"RAP_CLUSTER_AUTHORITY_PUBLIC_KEY": "public-key-b64",
"RAP_CLUSTER_AUTHORITY_FINGERPRINT": "rap-ca-ed25519-test",
@@ -23,7 +23,7 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
"RAP_HEARTBEAT_INTERVAL_SECONDS": "7",
"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS": "3",
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS": "30",
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED": "true",
"RAP_FABRIC_RUNTIME_ENABLED": "true",
"RAP_MESH_PRODUCTION_FORWARDING_ENABLED": "true",
"RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED": "true",
"RAP_MESH_QUIC_FABRIC_ENABLED": "true",
@@ -32,17 +32,18 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
"RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN": "24",
"RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS": "120",
"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "5",
"RAP_MESH_LISTEN_ADDR": "127.0.0.1:19001",
"RAP_MESH_LISTEN_PORT_MODE": "auto",
"RAP_MESH_LISTEN_AUTO_PORT_START": "19010",
"RAP_MESH_LISTEN_AUTO_PORT_END": "19020",
"RAP_FABRIC_LISTEN_ADDR": "127.0.0.1:19001",
"RAP_FABRIC_LISTEN_PORT_MODE": "auto",
"RAP_FABRIC_LISTEN_AUTO_PORT_START": "19010",
"RAP_FABRIC_LISTEN_AUTO_PORT_END": "19020",
"RAP_MESH_ADVERTISE_ENDPOINT": "quic://node-a.example.test:19443/",
"RAP_MESH_ADVERTISE_ENDPOINTS_JSON": `[{"endpoint_id":"node-a-lan","address":"10.10.0.20:19001"}]`,
"RAP_FABRIC_REGISTRY_RECORDS_JSON": ` [{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}] `,
"RAP_MESH_ADVERTISE_TRANSPORT": "direct_quic",
"RAP_MESH_CONNECTIVITY_MODE": "outbound_only",
"RAP_MESH_NAT_TYPE": "symmetric",
"RAP_MESH_LOCAL_SEGMENT_ID": "site-a",
"RAP_MESH_SITE_ID": "home",
"RAP_MESH_LOCALITY_GROUP_ID": "home-lan",
"RAP_MESH_NAT_GROUP_ID": "nat-a",
"RAP_MESH_STUN_REFLEXIVE_ENDPOINT": "quic://203.0.113.20:19443/",
"RAP_MESH_STUN_SERVER": "stun.example.test:3478",
@@ -50,7 +51,7 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
"RAP_MESH_RELAY_ENDPOINT": "quic://node-r.example.test:19443/",
"RAP_MESH_REGION": "eu",
"RAP_MESH_SYNTHETIC_CONFIG": "/tmp/rap-node/mesh-synthetic.json",
"RAP_MESH_PEER_ENDPOINTS_JSON": `{"node-b":"http://127.0.0.1:19002"}`,
"RAP_MESH_PEER_ENDPOINTS_JSON": `{"node-b":"quic://127.0.0.1:19002"}`,
"RAP_MESH_SYNTHETIC_ROUTES_JSON": `[{"route_id":"route-1"}]`,
"RAP_REMOTE_WORKSPACE_REAL_ADAPTER_ENABLED": "true",
"RAP_REMOTE_WORKSPACE_REAL_ADAPTER_COMMAND": " /opt/rap/bin/rdp-worker ",
@@ -60,9 +61,6 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
if err != nil {
t.Fatalf("load config: %v", err)
}
if cfg.BackendURL != "http://backend/api/v1" {
t.Fatalf("BackendURL = %q", cfg.BackendURL)
}
if cfg.NodeName != "node-b" {
t.Fatalf("NodeName = %q", cfg.NodeName)
}
@@ -87,8 +85,8 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
cfg.WebIngressRuntimeServiceClasses != "platform_admin, cluster_admin" {
t.Fatalf("unexpected web ingress key config: %+v", cfg)
}
if !cfg.MeshSyntheticRuntimeEnabled {
t.Fatal("MeshSyntheticRuntimeEnabled = false, want true")
if !cfg.FabricRuntimeEnabled {
t.Fatal("FabricRuntimeEnabled = false, want true")
}
if !cfg.MeshProductionForwardingEnabled {
t.Fatal("MeshProductionForwardingEnabled = false, want true")
@@ -111,11 +109,11 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
if cfg.MeshProductionObservationSinkCapacity != 5 {
t.Fatalf("MeshProductionObservationSinkCapacity = %d, want 5", cfg.MeshProductionObservationSinkCapacity)
}
if cfg.MeshListenAddr != "127.0.0.1:19001" {
t.Fatalf("MeshListenAddr = %q", cfg.MeshListenAddr)
if cfg.FabricListenAddr != "127.0.0.1:19001" {
t.Fatalf("FabricListenAddr = %q", cfg.FabricListenAddr)
}
if cfg.MeshListenPortMode != "auto" || cfg.MeshListenAutoPortStart != 19010 || cfg.MeshListenAutoPortEnd != 19020 {
t.Fatalf("unexpected mesh listen port config: %+v", cfg)
if cfg.FabricListenPortMode != "auto" || cfg.FabricListenAutoPortStart != 19010 || cfg.FabricListenAutoPortEnd != 19020 {
t.Fatalf("unexpected fabric listen port config: %+v", cfg)
}
if cfg.MeshAdvertiseEndpoint != "quic://node-a.example.test:19443" ||
cfg.MeshAdvertiseEndpointsJSON == "" ||
@@ -123,7 +121,8 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
cfg.MeshAdvertiseTransport != "direct_quic" ||
cfg.MeshConnectivityMode != "outbound_only" ||
cfg.MeshNATType != "symmetric" ||
cfg.MeshLocalSegmentID != "site-a" ||
cfg.MeshSiteID != "home" ||
cfg.MeshLocalityGroupID != "home-lan" ||
cfg.MeshNATGroupID != "nat-a" ||
cfg.MeshSTUNReflexiveEndpoint != "quic://203.0.113.20:19443" ||
cfg.MeshSTUNServer != "stun.example.test:3478" ||
@@ -146,10 +145,24 @@ func TestLoadConfigFromEnvAndArgs(t *testing.T) {
}
}
// TestLoadConfigLoadsLocalityGroup checks that RAP_MESH_LOCALITY_GROUP_ID from
// the environment ends up in Config.MeshLocalityGroupID.
func TestLoadConfigLoadsLocalityGroup(t *testing.T) {
	env := map[string]string{
		"RAP_NODE_NAME":                    "node-a",
		"RAP_FABRIC_REGISTRY_RECORDS_JSON": `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
		"RAP_MESH_LOCALITY_GROUP_ID":       "home-lan",
	}

	cfg, err := Load(nil, env)
	if err != nil {
		t.Fatalf("load config: %v", err)
	}
	if got := cfg.MeshLocalityGroupID; got != "home-lan" {
		t.Fatalf("unexpected locality group: %+v", cfg)
	}
}
func TestLoadConfigDefaultsEnrollmentPollingToNoTimeout(t *testing.T) {
cfg, err := Load(nil, map[string]string{
"RAP_BACKEND_URL": "http://backend/api/v1",
"RAP_NODE_NAME": "node-a",
"RAP_NODE_NAME": "node-a",
"RAP_FABRIC_REGISTRY_RECORDS_JSON": `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
})
if err != nil {
t.Fatalf("load config: %v", err)
@@ -168,10 +181,31 @@ func TestLoadConfigDefaultsEnrollmentPollingToNoTimeout(t *testing.T) {
}
}
// TestLoadConfigRequiresFabricBootstrap is the positive case: with a node
// name, a state dir, and fabric registry records passed as flags (and an empty
// environment), Load must succeed.
func TestLoadConfigRequiresFabricBootstrap(t *testing.T) {
	args := []string{
		"--node-name", "node-a",
		"--state-dir", t.TempDir(),
		"--fabric-registry-records-json", `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
	}
	if _, err := Load(args, map[string]string{}); err != nil {
		t.Fatalf("load config: %v", err)
	}
}
// TestLoadConfigRejectsMissingFabricBootstrap is the negative case: without
// fabric registry records Load must fail with the dedicated validation
// message.
func TestLoadConfigRejectsMissingFabricBootstrap(t *testing.T) {
	args := []string{
		"--node-name", "node-a",
		"--state-dir", t.TempDir(),
	}
	_, err := Load(args, map[string]string{})
	if err != nil && strings.Contains(err.Error(), "fabric registry records are required") {
		return
	}
	t.Fatalf("expected fabric validation error, got %v", err)
}
func TestLoadConfigRejectsNegativeProductionObservationSinkCapacity(t *testing.T) {
_, err := Load(nil, map[string]string{
"RAP_BACKEND_URL": "http://backend/api/v1",
"RAP_NODE_NAME": "node-a",
"RAP_NODE_NAME": "node-a",
"RAP_FABRIC_REGISTRY_RECORDS_JSON": `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "-1",
})
if err == nil {
@@ -181,8 +215,8 @@ func TestLoadConfigRejectsNegativeProductionObservationSinkCapacity(t *testing.T
func TestLoadConfigRejectsTooLargeProductionObservationSinkCapacity(t *testing.T) {
_, err := Load(nil, map[string]string{
"RAP_BACKEND_URL": "http://backend/api/v1",
"RAP_NODE_NAME": "node-a",
"RAP_NODE_NAME": "node-a",
"RAP_FABRIC_REGISTRY_RECORDS_JSON": `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "10001",
})
if err == nil {
@@ -190,32 +224,26 @@ func TestLoadConfigRejectsTooLargeProductionObservationSinkCapacity(t *testing.T
}
}
func TestLoadConfigNormalizesLegacyMeshAdvertiseTransport(t *testing.T) {
cfg, err := Load(nil, map[string]string{
"RAP_BACKEND_URL": "http://backend/api/v1",
"RAP_NODE_NAME": "node-a",
"RAP_MESH_ADVERTISE_ENDPOINT": "quic://node-a.example.test:19443",
"RAP_MESH_ADVERTISE_TRANSPORT": "wss",
func TestLoadConfigRejectsDisallowedMeshAdvertiseTransport(t *testing.T) {
_, err := Load(nil, map[string]string{
"RAP_NODE_NAME": "node-a",
"RAP_FABRIC_REGISTRY_RECORDS_JSON": `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
"RAP_MESH_ADVERTISE_ENDPOINT": "quic://node-a.example.test:19443",
"RAP_MESH_ADVERTISE_TRANSPORT": "wss",
})
if err != nil {
t.Fatalf("Load returned error for legacy mesh advertise transport migration: %v", err)
}
if cfg.MeshAdvertiseTransport != "direct_quic" {
t.Fatalf("transport = %q, want direct_quic", cfg.MeshAdvertiseTransport)
if err == nil || !strings.Contains(err.Error(), "QUIC transport label") {
t.Fatalf("expected QUIC transport rejection, got %v", err)
}
}
func TestLoadConfigNormalizesLegacyMeshAdvertiseEndpointScheme(t *testing.T) {
cfg, err := Load(nil, map[string]string{
"RAP_BACKEND_URL": "http://backend/api/v1",
"RAP_NODE_NAME": "node-a",
"RAP_MESH_ADVERTISE_ENDPOINT": "https://node-a.example.test:443",
"RAP_MESH_ADVERTISE_TRANSPORT": "direct_quic",
func TestLoadConfigRejectsDisallowedMeshAdvertiseEndpointScheme(t *testing.T) {
_, err := Load(nil, map[string]string{
"RAP_NODE_NAME": "node-a",
"RAP_FABRIC_REGISTRY_RECORDS_JSON": `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
"RAP_MESH_ADVERTISE_ENDPOINT": "https://node-a.example.test:443",
"RAP_MESH_ADVERTISE_TRANSPORT": "direct_quic",
})
if err != nil {
t.Fatalf("Load returned error for legacy mesh advertise endpoint migration: %v", err)
}
if cfg.MeshAdvertiseEndpoint != "quic://node-a.example.test:443" {
t.Fatalf("endpoint = %q, want quic scheme", cfg.MeshAdvertiseEndpoint)
if err == nil || !strings.Contains(err.Error(), "QUIC endpoint") {
t.Fatalf("expected QUIC endpoint rejection, got %v", err)
}
}
@@ -11,8 +11,13 @@ const (
Magic uint32 = 0x52415046 // RAPF
Version uint8 = 1
HeaderSize = 32
DefaultMaxPayload = 1024 * 1024
HeaderSize = 32
// DefaultMaxPayload is a per-frame guardrail, not a throughput limit.
// Fabric services must scale by many QUIC streams and many frames; keeping
// this above common VPN/RDP/VNC burst batches avoids a hidden 1 MiB choke
// while still bounding memory for a single decoded frame.
DefaultMaxPayload = 8 * 1024 * 1024
)
type FrameType uint8
@@ -102,6 +102,26 @@ func TestRejectsOversizedPayload(t *testing.T) {
}
}
func TestDefaultPayloadAllowsMultiMegabyteServiceBatches(t *testing.T) {
payload := bytes.Repeat([]byte("x"), 2*1024*1024)
frame := Frame{
Type: FrameData,
StreamID: 1,
Payload: payload,
}
encoded, err := MarshalFrame(frame)
if err != nil {
t.Fatalf("marshal multi-megabyte frame: %v", err)
}
decoded, err := UnmarshalFrame(encoded, DefaultMaxPayload)
if err != nil {
t.Fatalf("unmarshal multi-megabyte frame: %v", err)
}
if len(decoded.Payload) != len(payload) {
t.Fatalf("payload length = %d, want %d", len(decoded.Payload), len(payload))
}
}
func TestRejectsUnknownTrafficClass(t *testing.T) {
frame := Frame{
Type: FrameOpenStream,
@@ -8,6 +8,7 @@ import (
const (
DefaultInitialStreamCredit = 32
DefaultMaxStreamCredit = 4096
)
var (
@@ -29,6 +30,7 @@ const (
type SessionConfig struct {
InitialStreamCredit int
MaxStreamCredit int
ClassQueueCapacity map[TrafficClass]int
}
@@ -188,6 +190,7 @@ func (s *Session) Ack(streamID uint64, sequence uint64) error {
delta := sequence - st.metrics.Acked
st.metrics.Acked = sequence
s.metrics.FramesAcked += delta
st.credit = minInt(st.credit+int(delta), s.cfg.MaxStreamCredit)
}
return nil
}
@@ -205,7 +208,7 @@ func (s *Session) AddCredit(streamID uint64, frames int) error {
if st.state != StreamStateOpen {
return ErrStreamClosed
}
st.credit += frames
st.credit = minInt(st.credit+frames, s.cfg.MaxStreamCredit)
return nil
}
@@ -311,6 +314,12 @@ func normalizeSessionConfig(cfg SessionConfig) SessionConfig {
if cfg.InitialStreamCredit <= 0 {
cfg.InitialStreamCredit = DefaultInitialStreamCredit
}
if cfg.MaxStreamCredit <= 0 {
cfg.MaxStreamCredit = maxInt(DefaultMaxStreamCredit, cfg.InitialStreamCredit)
}
if cfg.InitialStreamCredit > cfg.MaxStreamCredit {
cfg.InitialStreamCredit = cfg.MaxStreamCredit
}
if cfg.ClassQueueCapacity == nil {
cfg.ClassQueueCapacity = map[TrafficClass]int{}
}
@@ -331,14 +340,28 @@ func priorityOrder() []TrafficClass {
func defaultClassQueueCapacity(trafficClass TrafficClass) int {
switch trafficClass {
case TrafficClassControl, TrafficClassDNS, TrafficClassInteractive:
return 128
return 1024
case TrafficClassReliable:
return 64
return 512
case TrafficClassBulk:
return 16
return 256
case TrafficClassDroppable:
return 8
return 128
default:
return 32
return 256
}
}
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
@@ -129,20 +129,36 @@ func TestSessionResetDropsOnlySelectedStream(t *testing.T) {
}
func TestSessionAckUpdatesMetrics(t *testing.T) {
session := NewSession(SessionConfig{})
session := NewSession(SessionConfig{InitialStreamCredit: 2})
mustOpenStream(t, session, 1, TrafficClassReliable)
mustEnqueue(t, session, 1, "one")
mustEnqueue(t, session, 1, "two")
if _, err := session.EnqueueData(1, []byte("blocked")); !errors.Is(err, ErrStreamCreditExhausted) {
t.Fatalf("credit error = %v, want %v", err, ErrStreamCreditExhausted)
}
if err := session.Ack(1, 2); err != nil {
t.Fatalf("ack: %v", err)
}
mustEnqueue(t, session, 1, "three")
snapshot := session.Snapshot()
if snapshot.FramesAcked != 2 || snapshot.Streams[1].Acked != 2 {
if snapshot.FramesAcked != 2 || snapshot.Streams[1].Acked != 2 || snapshot.Streams[1].Credit != 1 {
t.Fatalf("ack metrics = %+v stream=%+v", snapshot, snapshot.Streams[1])
}
}
func TestSessionCreditIsCapped(t *testing.T) {
session := NewSession(SessionConfig{InitialStreamCredit: 1, MaxStreamCredit: 2})
mustOpenStream(t, session, 1, TrafficClassReliable)
if err := session.AddCredit(1, 100); err != nil {
t.Fatalf("add credit: %v", err)
}
snapshot := session.Snapshot()
if snapshot.Streams[1].Credit != 2 {
t.Fatalf("credit = %d, want cap 2", snapshot.Streams[1].Credit)
}
}
func TestSessionCloseRejectsNewData(t *testing.T) {
session := NewSession(SessionConfig{})
mustOpenStream(t, session, 1, TrafficClassReliable)
@@ -15,8 +15,8 @@ const (
)
type RuntimeConfig struct {
BackendURL string
ClusterID string
ClusterAuthorityPublicKey string
JoinToken string
NodeName string
Image string
@@ -28,7 +28,7 @@ type RuntimeConfig struct {
Replace bool
DockerVPNGatewayEnabled bool
WorkloadSupervisionEnabled bool
MeshSyntheticRuntimeEnabled bool
FabricRuntimeEnabled bool
MeshProductionForwardingEnabled bool
VPNFabricSessionTransportEnabled bool
MeshQUICFabricEnabled bool
@@ -36,16 +36,19 @@ type RuntimeConfig struct {
VPNFabricSessionStreamShards int
VPNFabricQUICMaxStreamsPerConn int
VPNFabricQUICIdleTTLSeconds int
MeshListenAddr string
MeshListenPortMode string
MeshListenAutoPortStart int
MeshListenAutoPortEnd int
FabricListenAddr string
FabricListenPortMode string
FabricListenAutoPortStart int
FabricListenAutoPortEnd int
MeshAdvertiseEndpoint string
MeshAdvertiseEndpointsJSON string
FabricRegistryRecordsJSON string
MeshAdvertiseTransport string
MeshConnectivityMode string
MeshNATType string
MeshSiteID string
MeshLocalityGroupID string
MeshNATGroupID string
MeshRegion string
HeartbeatIntervalSeconds int
EnrollmentPollIntervalSeconds int
@@ -59,8 +62,8 @@ type RuntimeConfig struct {
}
func (cfg RuntimeConfig) Normalize() RuntimeConfig {
cfg.BackendURL = strings.TrimRight(strings.TrimSpace(cfg.BackendURL), "/")
cfg.ClusterID = strings.TrimSpace(cfg.ClusterID)
cfg.ClusterAuthorityPublicKey = strings.TrimSpace(cfg.ClusterAuthorityPublicKey)
cfg.JoinToken = strings.TrimSpace(cfg.JoinToken)
cfg.NodeName = strings.TrimSpace(cfg.NodeName)
cfg.Image = firstNonEmpty(cfg.Image, DefaultImage)
@@ -68,13 +71,13 @@ func (cfg RuntimeConfig) Normalize() RuntimeConfig {
cfg.StateDir = firstNonEmpty(cfg.StateDir, DefaultStateDir)
cfg.Network = firstNonEmpty(cfg.Network, DefaultNetwork)
cfg.RestartPolicy = firstNonEmpty(cfg.RestartPolicy, "unless-stopped")
cfg.MeshListenAddr = strings.TrimSpace(cfg.MeshListenAddr)
cfg.FabricListenAddr = strings.TrimSpace(cfg.FabricListenAddr)
cfg.MeshQUICFabricListenAddr = strings.TrimSpace(cfg.MeshQUICFabricListenAddr)
if cfg.VPNFabricSessionStreamShards <= 0 {
cfg.VPNFabricSessionStreamShards = 4
cfg.VPNFabricSessionStreamShards = 8
}
if cfg.VPNFabricSessionStreamShards > 64 {
cfg.VPNFabricSessionStreamShards = 64
if cfg.VPNFabricSessionStreamShards > 128 {
cfg.VPNFabricSessionStreamShards = 128
}
if cfg.VPNFabricQUICMaxStreamsPerConn <= 0 {
cfg.VPNFabricQUICMaxStreamsPerConn = 64
@@ -82,13 +85,16 @@ func (cfg RuntimeConfig) Normalize() RuntimeConfig {
if cfg.VPNFabricQUICIdleTTLSeconds <= 0 {
cfg.VPNFabricQUICIdleTTLSeconds = 300
}
cfg.MeshListenPortMode = strings.ToLower(strings.TrimSpace(cfg.MeshListenPortMode))
cfg.FabricListenPortMode = strings.ToLower(strings.TrimSpace(cfg.FabricListenPortMode))
cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/")
cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON)
cfg.FabricRegistryRecordsJSON = strings.TrimSpace(cfg.FabricRegistryRecordsJSON)
cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport)
cfg.MeshConnectivityMode = strings.TrimSpace(cfg.MeshConnectivityMode)
cfg.MeshNATType = strings.TrimSpace(cfg.MeshNATType)
cfg.MeshSiteID = strings.TrimSpace(cfg.MeshSiteID)
cfg.MeshLocalityGroupID = strings.TrimSpace(cfg.MeshLocalityGroupID)
cfg.MeshNATGroupID = strings.TrimSpace(cfg.MeshNATGroupID)
cfg.MeshRegion = strings.TrimSpace(cfg.MeshRegion)
cfg.ImageArtifactSHA256 = strings.TrimSpace(cfg.ImageArtifactSHA256)
if cfg.HeartbeatIntervalSeconds == 0 {
@@ -103,12 +109,15 @@ func (cfg RuntimeConfig) Normalize() RuntimeConfig {
func (cfg RuntimeConfig) ValidateInstall() error {
cfg = cfg.Normalize()
var missing []string
if cfg.BackendURL == "" {
missing = append(missing, "backend-url")
if cfg.FabricRegistryRecordsJSON == "" {
missing = append(missing, "fabric-registry-records-json")
}
if cfg.ClusterID == "" {
missing = append(missing, "cluster-id")
}
if cfg.ClusterAuthorityPublicKey == "" && !cfg.Replace {
missing = append(missing, "cluster-authority-public-key")
}
if cfg.NodeName == "" {
missing = append(missing, "node-name")
}
@@ -127,21 +136,21 @@ func (cfg RuntimeConfig) ValidateInstall() error {
if cfg.EnrollmentPollTimeoutSeconds < 0 {
return errors.New("enrollment poll timeout must not be negative")
}
switch cfg.MeshListenPortMode {
switch cfg.FabricListenPortMode {
case "", "manual", "auto", "disabled":
default:
return errors.New("mesh listen port mode must be manual, auto, or disabled")
return errors.New("fabric listen port mode must be manual, auto, or disabled")
}
if cfg.MeshListenAutoPortStart < 0 || cfg.MeshListenAutoPortEnd < 0 {
return errors.New("mesh listen auto port range must not be negative")
if cfg.FabricListenAutoPortStart < 0 || cfg.FabricListenAutoPortEnd < 0 {
return errors.New("fabric listen auto port range must not be negative")
}
if cfg.MeshListenAutoPortStart > 0 && cfg.MeshListenAutoPortEnd > 0 && cfg.MeshListenAutoPortStart > cfg.MeshListenAutoPortEnd {
return errors.New("mesh listen auto port start must be less than or equal to end")
if cfg.FabricListenAutoPortStart > 0 && cfg.FabricListenAutoPortEnd > 0 && cfg.FabricListenAutoPortStart > cfg.FabricListenAutoPortEnd {
return errors.New("fabric listen auto port start must be less than or equal to end")
}
if cfg.MeshAdvertiseTransport != "" && !isQUICAdvertiseTransport(cfg.MeshAdvertiseTransport) {
return errors.New("mesh advertise transport must be a QUIC transport label")
}
if hasLegacyEndpointScheme(cfg.MeshAdvertiseEndpoint) {
if hasUnsupportedEndpointScheme(cfg.MeshAdvertiseEndpoint) {
return errors.New("mesh advertise endpoint must be a QUIC endpoint")
}
if cfg.ProductionObservationSinkCap < 0 {
@@ -174,12 +183,12 @@ func isQUICAdvertiseTransport(label string) bool {
}
}
func hasLegacyEndpointScheme(endpoint string) bool {
func hasUnsupportedEndpointScheme(endpoint string) bool {
endpoint = strings.ToLower(strings.TrimSpace(endpoint))
return strings.HasPrefix(endpoint, "http://") ||
strings.HasPrefix(endpoint, "https://") ||
strings.HasPrefix(endpoint, "ws://") ||
strings.HasPrefix(endpoint, "wss://")
if endpoint == "" || !strings.Contains(endpoint, "://") {
return false
}
return !strings.HasPrefix(endpoint, "quic://")
}
func isJSONArray(value string) bool {
@@ -6,7 +6,6 @@ import (
"encoding/hex"
"fmt"
"io"
"net/http"
"os"
"os/exec"
"path/filepath"
@@ -126,15 +125,15 @@ func (m DockerManager) ensureImageFromArtifact(ctx context.Context, runner Comma
return true, nil
}
func downloadFirstArtifact(ctx context.Context, urls []string, expectedSHA256 string, expectedSizeBytes int64) (string, error) {
func downloadFirstArtifact(ctx context.Context, paths []string, expectedSHA256 string, expectedSizeBytes int64) (string, error) {
var lastErr error
for _, rawURL := range urls {
rawURL = strings.TrimSpace(rawURL)
if rawURL == "" {
for _, rawPath := range paths {
rawPath = strings.TrimSpace(rawPath)
if rawPath == "" {
continue
}
for attempt := 1; attempt <= 3; attempt++ {
path, err := downloadArtifact(ctx, rawURL, expectedSHA256, expectedSizeBytes)
path, err := downloadArtifact(ctx, rawPath, expectedSHA256, expectedSizeBytes)
if err == nil {
return path, nil
}
@@ -144,29 +143,34 @@ func downloadFirstArtifact(ctx context.Context, urls []string, expectedSHA256 st
if lastErr != nil {
return "", lastErr
}
return "", fmt.Errorf("no artifact URLs configured")
return "", fmt.Errorf("no artifact paths configured")
}
func downloadArtifact(ctx context.Context, rawURL, expectedSHA256 string, expectedSizeBytes int64) (string, error) {
req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
func downloadArtifact(ctx context.Context, rawPath, expectedSHA256 string, expectedSizeBytes int64) (string, error) {
select {
case <-ctx.Done():
return "", ctx.Err()
default:
}
source := strings.TrimSpace(rawPath)
if source == "" {
return "", fmt.Errorf("artifact path is empty")
}
if strings.Contains(source, "://") {
return "", fmt.Errorf("network artifact reference %q is disabled; update artifacts must arrive via quic fabric", source)
}
input, err := os.Open(source)
if err != nil {
return "", err
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
return "", fmt.Errorf("download artifact %s: %w", rawURL, err)
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return "", fmt.Errorf("download artifact %s: %s", rawURL, resp.Status)
return "", fmt.Errorf("open artifact %s: %w", source, err)
}
defer input.Close()
file, err := os.CreateTemp("", "rap-docker-image-*.tar")
if err != nil {
return "", err
}
path := file.Name()
hasher := sha256.New()
written, copyErr := io.Copy(io.MultiWriter(file, hasher), resp.Body)
written, copyErr := io.Copy(io.MultiWriter(file, hasher), input)
closeErr := file.Close()
if copyErr != nil {
os.Remove(path)
@@ -176,21 +180,17 @@ func downloadArtifact(ctx context.Context, rawURL, expectedSHA256 string, expect
os.Remove(path)
return "", closeErr
}
if resp.ContentLength >= 0 && written != resp.ContentLength {
os.Remove(path)
return "", fmt.Errorf("artifact download truncated for %s: got %d bytes want content-length %d", rawURL, written, resp.ContentLength)
}
if expectedSizeBytes > 0 && written != expectedSizeBytes {
if strings.TrimSpace(expectedSHA256) != "" {
os.Remove(path)
return "", fmt.Errorf("artifact size mismatch for %s: got %d bytes want %d", rawURL, written, expectedSizeBytes)
return "", fmt.Errorf("artifact size mismatch for %s: got %d bytes want %d", source, written, expectedSizeBytes)
}
fmt.Printf("artifact size mismatch for %s: got %d bytes want %d; proceeding without checksum for backward-compatible installs\n", rawURL, written, expectedSizeBytes)
fmt.Printf("artifact size mismatch for %s: got %d bytes want %d; proceeding because checksum is absent\n", source, written, expectedSizeBytes)
}
actual := hex.EncodeToString(hasher.Sum(nil))
if expected := strings.TrimSpace(expectedSHA256); expected != "" && !strings.EqualFold(actual, expected) {
os.Remove(path)
return "", fmt.Errorf("artifact checksum mismatch for %s: got %s want %s", rawURL, actual, expected)
return "", fmt.Errorf("artifact checksum mismatch for %s: got %s want %s", source, actual, expected)
}
return path, nil
}
@@ -254,7 +254,6 @@ func NodeAgentEnvWithStateDir(cfg RuntimeConfig, stateDir string) []string {
cfg = cfg.Normalize()
stateDir = firstNonEmpty(stateDir, cfg.StateDir)
env := []string{
"RAP_BACKEND_URL=" + cfg.BackendURL,
"RAP_CLUSTER_ID=" + cfg.ClusterID,
"RAP_NODE_NAME=" + cfg.NodeName,
"RAP_NODE_STATE_DIR=" + stateDir,
@@ -262,7 +261,7 @@ func NodeAgentEnvWithStateDir(cfg RuntimeConfig, stateDir string) []string {
"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS=" + strconv.Itoa(cfg.EnrollmentPollIntervalSeconds),
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=" + strconv.Itoa(cfg.EnrollmentPollTimeoutSeconds),
"RAP_WORKLOAD_SUPERVISION_ENABLED=" + boolString(cfg.WorkloadSupervisionEnabled),
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=" + boolString(cfg.MeshSyntheticRuntimeEnabled),
"RAP_FABRIC_RUNTIME_ENABLED=" + boolString(cfg.FabricRuntimeEnabled),
"RAP_MESH_PRODUCTION_FORWARDING_ENABLED=" + boolString(cfg.MeshProductionForwardingEnabled),
"RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED=" + boolString(cfg.VPNFabricSessionTransportEnabled),
"RAP_MESH_QUIC_FABRIC_ENABLED=" + boolString(cfg.MeshQUICFabricEnabled),
@@ -270,23 +269,26 @@ func NodeAgentEnvWithStateDir(cfg RuntimeConfig, stateDir string) []string {
"RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN=" + strconv.Itoa(cfg.VPNFabricQUICMaxStreamsPerConn),
"RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS=" + strconv.Itoa(cfg.VPNFabricQUICIdleTTLSeconds),
}
if cfg.ClusterAuthorityPublicKey != "" {
env = append(env, "RAP_CLUSTER_AUTHORITY_PUBLIC_KEY="+cfg.ClusterAuthorityPublicKey)
}
if cfg.JoinToken != "" {
env = append(env, "RAP_JOIN_TOKEN="+cfg.JoinToken)
}
if cfg.MeshListenAddr != "" {
env = append(env, "RAP_MESH_LISTEN_ADDR="+cfg.MeshListenAddr)
if cfg.FabricListenAddr != "" {
env = append(env, "RAP_FABRIC_LISTEN_ADDR="+cfg.FabricListenAddr)
}
if cfg.MeshQUICFabricListenAddr != "" {
env = append(env, "RAP_MESH_QUIC_FABRIC_LISTEN_ADDR="+cfg.MeshQUICFabricListenAddr)
}
if cfg.MeshListenPortMode != "" {
env = append(env, "RAP_MESH_LISTEN_PORT_MODE="+cfg.MeshListenPortMode)
if cfg.FabricListenPortMode != "" {
env = append(env, "RAP_FABRIC_LISTEN_PORT_MODE="+cfg.FabricListenPortMode)
}
if cfg.MeshListenAutoPortStart > 0 {
env = append(env, "RAP_MESH_LISTEN_AUTO_PORT_START="+strconv.Itoa(cfg.MeshListenAutoPortStart))
if cfg.FabricListenAutoPortStart > 0 {
env = append(env, "RAP_FABRIC_LISTEN_AUTO_PORT_START="+strconv.Itoa(cfg.FabricListenAutoPortStart))
}
if cfg.MeshListenAutoPortEnd > 0 {
env = append(env, "RAP_MESH_LISTEN_AUTO_PORT_END="+strconv.Itoa(cfg.MeshListenAutoPortEnd))
if cfg.FabricListenAutoPortEnd > 0 {
env = append(env, "RAP_FABRIC_LISTEN_AUTO_PORT_END="+strconv.Itoa(cfg.FabricListenAutoPortEnd))
}
if cfg.MeshAdvertiseEndpoint != "" {
env = append(env, "RAP_MESH_ADVERTISE_ENDPOINT="+cfg.MeshAdvertiseEndpoint)
@@ -306,6 +308,15 @@ func NodeAgentEnvWithStateDir(cfg RuntimeConfig, stateDir string) []string {
if cfg.MeshNATType != "" {
env = append(env, "RAP_MESH_NAT_TYPE="+cfg.MeshNATType)
}
if cfg.MeshSiteID != "" {
env = append(env, "RAP_MESH_SITE_ID="+cfg.MeshSiteID)
}
if cfg.MeshLocalityGroupID != "" {
env = append(env, "RAP_MESH_LOCALITY_GROUP_ID="+cfg.MeshLocalityGroupID)
}
if cfg.MeshNATGroupID != "" {
env = append(env, "RAP_MESH_NAT_GROUP_ID="+cfg.MeshNATGroupID)
}
if cfg.MeshRegion != "" {
env = append(env, "RAP_MESH_REGION="+cfg.MeshRegion)
}
@@ -2,14 +2,19 @@ package hostagent
import (
"context"
"crypto/ed25519"
cryptorand "crypto/rand"
"encoding/base64"
"encoding/json"
"fmt"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
"time"
clusterauth "github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh"
)
type recordingRunner struct {
@@ -48,6 +53,27 @@ type imagePresentRunner struct {
calls [][]string
}
type inspectRuntimeRunner struct {
output string
}
func (r *inspectRuntimeRunner) Run(_ context.Context, name string, args ...string) (string, error) {
if name == "docker" && len(args) >= 2 && args[0] == "inspect" {
return r.output, nil
}
return "", nil
}
// testFabricRuntimeConfig returns a RuntimeConfig fixture with the cluster
// ID, cluster authority public key, fabric registry records, join token and
// node name populated. Every other field is left at its zero value.
func testFabricRuntimeConfig() RuntimeConfig {
	var cfg RuntimeConfig
	cfg.ClusterID = "cluster-1"
	cfg.ClusterAuthorityPublicKey = "authority-key-b64"
	cfg.FabricRegistryRecordsJSON = `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`
	cfg.JoinToken = "join-secret"
	cfg.NodeName = "node-a"
	return cfg
}
func (r *imagePresentRunner) Run(_ context.Context, name string, args ...string) (string, error) {
r.calls = append(r.calls, append([]string{name}, args...))
if len(args) > 0 && args[0] == "run" {
@@ -58,21 +84,21 @@ func (r *imagePresentRunner) Run(_ context.Context, name string, args ...string)
func TestDockerRunArgsBuildNodeRuntimePlacement(t *testing.T) {
args := DockerRunArgs(RuntimeConfig{
BackendURL: "http://control/api/v1/",
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "/srv/rap/node-a",
MeshSyntheticRuntimeEnabled: true,
FabricRuntimeEnabled: true,
VPNFabricSessionTransportEnabled: true,
MeshQUICFabricEnabled: true,
MeshQUICFabricListenAddr: ":19443",
VPNFabricSessionStreamShards: 6,
VPNFabricQUICMaxStreamsPerConn: 24,
VPNFabricQUICIdleTTLSeconds: 120,
MeshListenAddr: ":19131",
FabricListenAddr: ":19131",
MeshAdvertiseEndpoint: "quic://10.0.0.11:19443/",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
MeshAdvertiseTransport: "direct_quic",
@@ -83,19 +109,19 @@ func TestDockerRunArgsBuildNodeRuntimePlacement(t *testing.T) {
for _, want := range []string{
"run", "-d", "--name\x00rap-node-agent-node-a", "--network\x00host",
"-v\x00/srv/rap/node-a:/var/lib/rap-node-agent",
"RAP_BACKEND_URL=http://control/api/v1",
"RAP_CLUSTER_ID=cluster-1",
"RAP_CLUSTER_AUTHORITY_PUBLIC_KEY=authority-key-b64",
"RAP_JOIN_TOKEN=join-secret",
"RAP_NODE_STATE_DIR=/var/lib/rap-node-agent",
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS=0",
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true",
"RAP_FABRIC_RUNTIME_ENABLED=true",
"RAP_VPN_FABRIC_SESSION_TRANSPORT_ENABLED=true",
"RAP_MESH_QUIC_FABRIC_ENABLED=true",
"RAP_MESH_QUIC_FABRIC_LISTEN_ADDR=:19443",
"RAP_VPN_FABRIC_SESSION_STREAM_SHARDS=6",
"RAP_VPN_FABRIC_QUIC_MAX_STREAMS_PER_CONN=24",
"RAP_VPN_FABRIC_QUIC_IDLE_TTL_SECONDS=120",
"RAP_MESH_LISTEN_ADDR=:19131",
"RAP_FABRIC_LISTEN_ADDR=:19131",
"RAP_MESH_ADVERTISE_ENDPOINT=quic://10.0.0.11:19443",
`RAP_FABRIC_REGISTRY_RECORDS_JSON=[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
"RAP_MESH_ADVERTISE_TRANSPORT=direct_quic",
@@ -110,7 +136,6 @@ func TestDockerRunArgsBuildNodeRuntimePlacement(t *testing.T) {
func TestDockerRunArgsEnableVPNGatewayDevice(t *testing.T) {
args := DockerRunArgs(RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
@@ -130,6 +155,40 @@ func TestDockerRunArgsEnableVPNGatewayDevice(t *testing.T) {
}
}
// TestRuntimeConfigFromContainerReadsFabricListenEnv feeds a canned
// `docker inspect` document to runtimeConfigFromContainer and checks that
// the RAP_FABRIC_LISTEN_* environment entries end up in the returned
// RuntimeConfig (listen address, port mode and auto-port range).
func TestRuntimeConfigFromContainerReadsFabricListenEnv(t *testing.T) {
	// The fixture is a plain raw string: it is deliberately NOT passed
	// through fmt.Sprintf, so a literal '%' added to the JSON later cannot
	// be misread as a formatting verb.
	const inspectOutput = `[{
"Config":{
"Image":"rap-node-agent:test",
"Env":[
"RAP_CLUSTER_ID=cluster-1",
"RAP_NODE_NAME=node-a",
"RAP_FABRIC_LISTEN_ADDR=:19131",
"RAP_FABRIC_LISTEN_PORT_MODE=auto",
"RAP_FABRIC_LISTEN_AUTO_PORT_START=19131",
"RAP_FABRIC_LISTEN_AUTO_PORT_END=19231"
]
},
"HostConfig":{
"NetworkMode":"host",
"RestartPolicy":{"Name":"unless-stopped"},
"CapAdd":[],
"Devices":[],
"Privileged":false
},
"Mounts":[{"Source":"/srv/rap/node-a","Destination":"/var/lib/rap-node-agent"}]
}]`
	runner := &inspectRuntimeRunner{output: inspectOutput}
	_, cfg, err := (DockerManager{}).runtimeConfigFromContainer(context.Background(), runner, "docker", "rap-node-agent-node-a")
	if err != nil {
		t.Fatalf("runtime config from container: %v", err)
	}
	if cfg.FabricListenAddr != ":19131" || cfg.FabricListenPortMode != "auto" {
		t.Fatalf("fabric listen env was not read: %+v", cfg)
	}
	if cfg.FabricListenAutoPortStart != 19131 || cfg.FabricListenAutoPortEnd != 19231 {
		t.Fatalf("fabric listen auto range was not read: %+v", cfg)
	}
}
func TestPrepareStateDirCreatesWritableHostPath(t *testing.T) {
dir := filepath.Join(t.TempDir(), "node-state")
if err := PrepareStateDir(dir); err != nil {
@@ -153,92 +212,23 @@ func TestPrepareStateDirSkipsNamedVolume(t *testing.T) {
}
}
func TestFetchDockerInstallProfileBuildsRuntimeConfig(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path != "/api/v1/node-agents/docker-install-profile" {
t.Fatalf("path = %s", r.URL.Path)
}
_ = json.NewEncoder(w).Encode(map[string]any{
"docker_install_profile": map[string]any{
"cluster_id": "cluster-1",
"backend_url": "https://control.example.test/api/v1",
"join_token": "rap_join_profile",
"node_name": "node-a",
"image": "rap-node-agent:test",
"artifact_endpoints": []string{"https://cache.example.test/artifacts"},
"fabric_registry_records": []map[string]any{{
"schema": "rap.fabric.registry.gossip_record.v1",
"service_class": "control-api",
"service_id": "control-a",
}},
"docker_image_artifact": map[string]any{
"kind": "docker_image_tar",
"image": "rap-node-agent:test",
"file_name": "rap-node-agent-test.tar",
"size_bytes": 21,
},
"container_name": "rap-node-agent-node-a",
"state_dir": "/var/lib/rap/nodes/node-a",
"network": "host",
"restart_policy": "unless-stopped",
"replace": true,
"mesh_synthetic_runtime_enabled": true,
"vpn_fabric_session_transport_enabled": true,
"mesh_quic_fabric_enabled": true,
"mesh_quic_fabric_listen_addr": ":19443",
"vpn_fabric_session_stream_shards": 6,
"mesh_connectivity_mode": "outbound_only",
},
})
}))
defer server.Close()
profile, err := FetchDockerInstallProfile(context.Background(), ProfileRequest{
URL: server.URL + "/api/v1",
ClusterID: "cluster-1",
InstallToken: "rap_join_profile",
NodeName: "node-a",
})
if err != nil {
t.Fatalf("fetch profile: %v", err)
}
cfg := RuntimeConfigFromProfile(profile).Normalize()
if cfg.BackendURL != "https://control.example.test/api/v1" ||
cfg.ClusterID != "cluster-1" ||
cfg.JoinToken != "rap_join_profile" ||
cfg.ContainerName != "rap-node-agent-node-a" ||
len(cfg.ImageArtifactURLs) != 1 ||
cfg.ImageArtifactSizeBytes != 21 ||
!cfg.MeshSyntheticRuntimeEnabled ||
!cfg.VPNFabricSessionTransportEnabled ||
!cfg.MeshQUICFabricEnabled ||
cfg.MeshQUICFabricListenAddr != ":19443" ||
cfg.VPNFabricSessionStreamShards != 6 ||
cfg.FabricRegistryRecordsJSON != `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api","service_id":"control-a"}]` ||
cfg.MeshConnectivityMode != "outbound_only" {
t.Fatalf("unexpected cfg: %+v", cfg)
}
}
func TestInstallLoadsImageArtifactWhenImageMissing(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte("fake docker image tar"))
}))
defer server.Close()
artifactPath := writeDockerImageArtifact(t, "fake docker image tar")
runner := &imageMissingRunner{}
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
ImageArtifactSizeBytes: 21,
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{artifactPath},
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
ImageArtifactSizeBytes: 21,
})
if err != nil {
t.Fatalf("install: %v", err)
@@ -255,24 +245,22 @@ func TestInstallLoadsImageArtifactWhenImageMissing(t *testing.T) {
func TestInstallAcceptsSizeMismatchWhenChecksumMissing(t *testing.T) {
const payload = "fake docker image tar"
const wrongSize = 999
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte(payload))
}))
defer server.Close()
artifactPath := writeDockerImageArtifact(t, payload)
runner := &imageMissingRunner{}
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
ImageArtifactSHA256: "", // intentionally absent -> size mismatch should not block install
ImageArtifactSizeBytes: wrongSize,
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{artifactPath},
ImageArtifactSHA256: "", // intentionally absent -> size mismatch should not block install
ImageArtifactSizeBytes: wrongSize,
})
if err != nil {
t.Fatalf("install: %v", err)
@@ -283,24 +271,22 @@ func TestInstallAcceptsSizeMismatchWhenChecksumMissing(t *testing.T) {
}
func TestInstallReloadsImageArtifactWhenReplacingMutableTag(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte("fake docker image tar"))
}))
defer server.Close()
artifactPath := writeDockerImageArtifact(t, "fake docker image tar")
runner := &imagePresentRunner{}
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
ImageArtifactSizeBytes: 21,
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{artifactPath},
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
ImageArtifactSizeBytes: 21,
})
if err != nil {
t.Fatalf("install: %v", err)
@@ -315,27 +301,22 @@ func TestInstallReloadsImageArtifactWhenReplacingMutableTag(t *testing.T) {
}
func TestDockerInstallLoadsExplicitArtifactBeforeReplace(t *testing.T) {
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path != "/rap-node-agent-test.tar" {
t.Fatalf("unexpected path %s", r.URL.Path)
}
_, _ = w.Write([]byte("fake docker image tar"))
}))
defer server.Close()
artifactPath := writeDockerImageArtifact(t, "fake docker image tar")
runner := &imageMissingRunner{}
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{server.URL + "/rap-node-agent-test.tar"},
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
ImageArtifactSizeBytes: 21,
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
JoinToken: "join-secret",
NodeName: "node-a",
Image: "rap-node-agent:test",
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
Replace: true,
ImageArtifactURLs: []string{artifactPath},
ImageArtifactSHA256: "5c2fbd41c87e83dc372690e8e1244b98baf8aded64870b369c28c4b313e15cc2",
ImageArtifactSizeBytes: 21,
})
if err != nil {
t.Fatalf("install: %v", err)
@@ -349,6 +330,15 @@ func TestDockerInstallLoadsExplicitArtifactBeforeReplace(t *testing.T) {
}
}
// writeDockerImageArtifact writes payload to a file named
// "rap-node-agent-test.tar" inside a fresh per-test temporary directory
// (mode 0600) and returns the file's path. A write failure aborts the
// calling test via t.Fatalf.
func writeDockerImageArtifact(t *testing.T, payload string) string {
	t.Helper()
	artifact := filepath.Join(t.TempDir(), "rap-node-agent-test.tar")
	err := os.WriteFile(artifact, []byte(payload), 0o600)
	if err != nil {
		t.Fatalf("write artifact: %v", err)
	}
	return artifact
}
func flattenCalls(calls [][]string) []string {
out := []string{}
for _, call := range calls {
@@ -360,14 +350,15 @@ func flattenCalls(calls [][]string) []string {
func TestInstallCanPullReplaceAndRedactsJoinToken(t *testing.T) {
runner := &recordingRunner{}
result, err := (DockerManager{Runner: runner}).Install(context.Background(), RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
PullImage: true,
Replace: true,
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
JoinToken: "join-secret",
NodeName: "node-a",
PullImage: true,
Replace: true,
ContainerName: "rap-node-agent-node-a",
StateDir: "rap-node-state",
})
if err != nil {
t.Fatalf("install: %v", err)
@@ -385,44 +376,350 @@ func TestInstallCanPullReplaceAndRedactsJoinToken(t *testing.T) {
}
func TestValidateRequiresJoinTokenUnlessReplacingExistingState(t *testing.T) {
err := RuntimeConfig{BackendURL: "http://control/api/v1", ClusterID: "cluster-1", NodeName: "node-a"}.ValidateInstall()
err := RuntimeConfig{ClusterID: "cluster-1", ClusterAuthorityPublicKey: "authority-key-b64", FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`, NodeName: "node-a"}.ValidateInstall()
if err == nil || !strings.Contains(err.Error(), "join-token") {
t.Fatalf("expected join token validation error, got %v", err)
}
err = RuntimeConfig{BackendURL: "http://control/api/v1", ClusterID: "cluster-1", NodeName: "node-a", Replace: true}.ValidateInstall()
err = RuntimeConfig{ClusterID: "cluster-1", ClusterAuthorityPublicKey: "authority-key-b64", FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`, NodeName: "node-a", Replace: true}.ValidateInstall()
if err != nil {
t.Fatalf("replace update should allow missing join token: %v", err)
}
}
func TestValidateRejectsLegacyMeshAdvertiseTransport(t *testing.T) {
func TestValidateAllowsFabricBootstrapWithoutBackendURL(t *testing.T) {
err := RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
MeshAdvertiseEndpoint: "quic://10.0.0.11:19443",
MeshAdvertiseTransport: "wss",
MeshQUICFabricEnabled: true,
MeshQUICFabricListenAddr: ":19443",
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
JoinToken: "join-secret",
NodeName: "node-a",
}.ValidateInstall()
if err != nil {
t.Fatalf("fabric-native install should validate: %v", err)
}
}
// TestValidateRequiresAuthorityKeyForFabricBootstrap checks that
// ValidateInstall reports "cluster-authority-public-key" when the config
// carries a cluster ID, registry records, join token and node name but no
// cluster authority public key.
func TestValidateRequiresAuthorityKeyForFabricBootstrap(t *testing.T) {
	cfg := RuntimeConfig{
		ClusterID:                 "cluster-1",
		FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
		JoinToken:                 "join-secret",
		NodeName:                  "node-a",
	}
	err := cfg.ValidateInstall()
	if err != nil && strings.Contains(err.Error(), "cluster-authority-public-key") {
		return
	}
	t.Fatalf("expected authority key validation error, got %v", err)
}
// TestLoadDockerJoinBundleRejectsUnsignedEnvelope writes a bundle file that
// contains only a docker_install_profile (no authority envelope or
// signature) and expects LoadDockerJoinBundle to fail with the
// "join bundle authority envelope is missing" error.
func TestLoadDockerJoinBundleRejectsUnsignedEnvelope(t *testing.T) {
	const unsignedBundle = `{
"docker_install_profile": {
"cluster_id": "cluster-1",
"cluster_authority_public_key": "authority-key-b64",
"join_token": "join-secret",
"node_name": "node-a",
"fabric_registry_records": [{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]
}
}`
	bundlePath := filepath.Join(t.TempDir(), "bundle.json")
	writeErr := os.WriteFile(bundlePath, []byte(unsignedBundle), 0o600)
	if writeErr != nil {
		t.Fatalf("write bundle: %v", writeErr)
	}
	_, err := LoadDockerJoinBundle(bundlePath)
	if err != nil && strings.Contains(err.Error(), "join bundle authority envelope is missing") {
		return
	}
	t.Fatalf("expected unsigned bundle error, got %v", err)
}
// TestLoadDockerJoinBundleVerifiesAuthoritySignature builds a join bundle
// whose authority payload is signed with a freshly generated Ed25519 key,
// writes it to disk and expects LoadDockerJoinBundle to accept it and return
// the profile for node "node-a".
func TestLoadDockerJoinBundleVerifiesAuthoritySignature(t *testing.T) {
	// Bundle layout; the 13 %s verbs are filled positionally below.
	const bundleTemplate = `{
"schema_version": "rap.install_join_bundle.v1",
"bundle_kind": "docker",
"cluster_id": "cluster-1",
"cluster_authority": {
"schema_version": "%s",
"cluster_id": "cluster-1",
"authority_state": "active",
"key_algorithm": "%s",
"public_key": "%s",
"public_key_fingerprint": "%s",
"created_at": "%s",
"updated_at": "%s"
},
"authority_payload": %s,
"authority_signature": {
"schema_version": "%s",
"algorithm": "%s",
"key_fingerprint": "%s",
"signature": "%s",
"signed_at": "%s"
},
"docker_install_profile": %s
}`
	pub, priv, err := ed25519.GenerateKey(cryptorand.Reader)
	if err != nil {
		t.Fatalf("GenerateKey: %v", err)
	}
	encodedKey := base64.StdEncoding.EncodeToString(pub)
	profile := map[string]any{
		"cluster_id":                   "cluster-1",
		"cluster_authority_public_key": encodedKey,
		"join_token":                   "join-secret",
		"node_name":                    "node-a",
	}
	payload, err := json.Marshal(map[string]any{
		"docker_install_profile": profile,
	})
	if err != nil {
		t.Fatalf("marshal authority payload: %v", err)
	}
	canonicalPayload, err := clusterauth.CanonicalJSON(payload)
	if err != nil {
		t.Fatalf("CanonicalJSON: %v", err)
	}
	// The signature covers the canonical form of the authority payload.
	signature := ed25519.Sign(priv, canonicalPayload)
	bundlePath := filepath.Join(t.TempDir(), "bundle.json")
	// NOTE(review): assumes clusterauth.Fingerprint is a pure function — confirm.
	fingerprint := clusterauth.Fingerprint(pub)
	stamp := func() string { return time.Now().UTC().Format(time.RFC3339) }
	bundle := fmt.Sprintf(bundleTemplate,
		clusterauth.AuthoritySchemaVersion,
		clusterauth.AlgorithmEd25519,
		encodedKey,
		fingerprint,
		stamp(),
		stamp(),
		string(payload),
		clusterauth.SignatureSchemaVersion,
		clusterauth.AlgorithmEd25519,
		fingerprint,
		base64.StdEncoding.EncodeToString(signature),
		stamp(),
		mustBundleJSON(t, profile),
	)
	if writeErr := os.WriteFile(bundlePath, []byte(bundle), 0o600); writeErr != nil {
		t.Fatalf("write bundle: %v", writeErr)
	}
	loaded, err := LoadDockerJoinBundle(bundlePath)
	if err != nil {
		t.Fatalf("LoadDockerJoinBundle: %v", err)
	}
	if loaded.NodeName != "node-a" {
		t.Fatalf("unexpected loaded profile: %+v", loaded)
	}
}
// TestLoadDockerJoinBundleRejectsTamperedSignedProfile signs an authority
// payload for node "node-a" but ships a top-level docker_install_profile for
// "node-b"; LoadDockerJoinBundle must reject the bundle with the
// "does not match signed authority payload" error.
func TestLoadDockerJoinBundleRejectsTamperedSignedProfile(t *testing.T) {
	// Bundle layout; the 13 %s verbs are filled positionally below.
	const bundleTemplate = `{
"schema_version": "rap.install_join_bundle.v1",
"bundle_kind": "docker",
"cluster_id": "cluster-1",
"cluster_authority": {
"schema_version": "%s",
"cluster_id": "cluster-1",
"authority_state": "active",
"key_algorithm": "%s",
"public_key": "%s",
"public_key_fingerprint": "%s",
"created_at": "%s",
"updated_at": "%s"
},
"authority_payload": %s,
"authority_signature": {
"schema_version": "%s",
"algorithm": "%s",
"key_fingerprint": "%s",
"signature": "%s",
"signed_at": "%s"
},
"docker_install_profile": %s
}`
	pub, priv, err := ed25519.GenerateKey(cryptorand.Reader)
	if err != nil {
		t.Fatalf("GenerateKey: %v", err)
	}
	encodedKey := base64.StdEncoding.EncodeToString(pub)
	// Profile that is actually covered by the signature.
	signedProfile := map[string]any{
		"cluster_id":                   "cluster-1",
		"cluster_authority_public_key": encodedKey,
		"join_token":                   "join-secret",
		"node_name":                    "node-a",
	}
	// Profile placed in the bundle body; it differs only in node_name.
	tamperedProfile := map[string]any{
		"cluster_id":                   "cluster-1",
		"cluster_authority_public_key": encodedKey,
		"join_token":                   "join-secret",
		"node_name":                    "node-b",
	}
	payload, err := json.Marshal(map[string]any{
		"docker_install_profile": signedProfile,
	})
	if err != nil {
		t.Fatalf("marshal authority payload: %v", err)
	}
	canonicalPayload, err := clusterauth.CanonicalJSON(payload)
	if err != nil {
		t.Fatalf("CanonicalJSON: %v", err)
	}
	signature := ed25519.Sign(priv, canonicalPayload)
	bundlePath := filepath.Join(t.TempDir(), "bundle.json")
	// NOTE(review): assumes clusterauth.Fingerprint is a pure function — confirm.
	fingerprint := clusterauth.Fingerprint(pub)
	stamp := func() string { return time.Now().UTC().Format(time.RFC3339) }
	bundle := fmt.Sprintf(bundleTemplate,
		clusterauth.AuthoritySchemaVersion,
		clusterauth.AlgorithmEd25519,
		encodedKey,
		fingerprint,
		stamp(),
		stamp(),
		string(payload),
		clusterauth.SignatureSchemaVersion,
		clusterauth.AlgorithmEd25519,
		fingerprint,
		base64.StdEncoding.EncodeToString(signature),
		stamp(),
		mustBundleJSON(t, tamperedProfile),
	)
	if writeErr := os.WriteFile(bundlePath, []byte(bundle), 0o600); writeErr != nil {
		t.Fatalf("write bundle: %v", writeErr)
	}
	_, err = LoadDockerJoinBundle(bundlePath)
	if err != nil && strings.Contains(err.Error(), "does not match signed authority payload") {
		return
	}
	t.Fatalf("expected signed bundle mismatch error, got %v", err)
}
// TestLoadDockerJoinBundleRejectsSignedProfileAuthorityKeyMismatch signs a
// bundle with one Ed25519 key while the profile inside it advertises a
// different cluster_authority_public_key; LoadDockerJoinBundle must fail with
// the "profile authority key does not match signed bundle authority key"
// error.
func TestLoadDockerJoinBundleRejectsSignedProfileAuthorityKeyMismatch(t *testing.T) {
	// Bundle layout; the 13 %s verbs are filled positionally below.
	const bundleTemplate = `{
"schema_version": "rap.install_join_bundle.v1",
"bundle_kind": "docker",
"cluster_id": "cluster-1",
"cluster_authority": {
"schema_version": "%s",
"cluster_id": "cluster-1",
"authority_state": "active",
"key_algorithm": "%s",
"public_key": "%s",
"public_key_fingerprint": "%s",
"created_at": "%s",
"updated_at": "%s"
},
"authority_payload": %s,
"authority_signature": {
"schema_version": "%s",
"algorithm": "%s",
"key_fingerprint": "%s",
"signature": "%s",
"signed_at": "%s"
},
"docker_install_profile": %s
}`
	pub, priv, err := ed25519.GenerateKey(cryptorand.Reader)
	if err != nil {
		t.Fatalf("GenerateKey: %v", err)
	}
	// Second, unrelated key: only its public half is embedded in the profile.
	foreignPub, _, err := ed25519.GenerateKey(cryptorand.Reader)
	if err != nil {
		t.Fatalf("GenerateKey(other): %v", err)
	}
	profile := map[string]any{
		"cluster_id":                   "cluster-1",
		"cluster_authority_public_key": base64.StdEncoding.EncodeToString(foreignPub),
		"join_token":                   "join-secret",
		"node_name":                    "node-a",
	}
	payload, err := json.Marshal(map[string]any{
		"docker_install_profile": profile,
	})
	if err != nil {
		t.Fatalf("marshal authority payload: %v", err)
	}
	canonicalPayload, err := clusterauth.CanonicalJSON(payload)
	if err != nil {
		t.Fatalf("CanonicalJSON: %v", err)
	}
	signature := ed25519.Sign(priv, canonicalPayload)
	bundlePath := filepath.Join(t.TempDir(), "bundle.json")
	// The envelope and signature metadata use the signing key, not foreignPub.
	// NOTE(review): assumes clusterauth.Fingerprint is a pure function — confirm.
	fingerprint := clusterauth.Fingerprint(pub)
	stamp := func() string { return time.Now().UTC().Format(time.RFC3339) }
	bundle := fmt.Sprintf(bundleTemplate,
		clusterauth.AuthoritySchemaVersion,
		clusterauth.AlgorithmEd25519,
		base64.StdEncoding.EncodeToString(pub),
		fingerprint,
		stamp(),
		stamp(),
		string(payload),
		clusterauth.SignatureSchemaVersion,
		clusterauth.AlgorithmEd25519,
		fingerprint,
		base64.StdEncoding.EncodeToString(signature),
		stamp(),
		mustBundleJSON(t, profile),
	)
	if writeErr := os.WriteFile(bundlePath, []byte(bundle), 0o600); writeErr != nil {
		t.Fatalf("write bundle: %v", writeErr)
	}
	_, err = LoadDockerJoinBundle(bundlePath)
	if err != nil && strings.Contains(err.Error(), "profile authority key does not match signed bundle authority key") {
		return
	}
	t.Fatalf("expected authority key mismatch error, got %v", err)
}
// TestLoadDockerJoinBundleRejectsSignedProfileClusterIDMismatch checks that
// LoadDockerJoinBundle refuses a validly signed bundle whose profile names a
// cluster_id ("cluster-2") different from the bundle's own ("cluster-1").
func TestLoadDockerJoinBundleRejectsSignedProfileClusterIDMismatch(t *testing.T) {
	publicKey, privateKey, err := ed25519.GenerateKey(cryptorand.Reader)
	if err != nil {
		t.Fatalf("GenerateKey: %v", err)
	}

	// Same authority key as the bundle, but a different cluster id.
	signedProfile := map[string]any{
		"cluster_id":                   "cluster-2",
		"cluster_authority_public_key": base64.StdEncoding.EncodeToString(publicKey),
		"join_token":                   "join-secret",
		"node_name":                    "node-a",
	}
	authorityPayload, err := json.Marshal(map[string]any{
		"docker_install_profile": signedProfile,
	})
	if err != nil {
		t.Fatalf("marshal authority payload: %v", err)
	}
	canonical, err := clusterauth.CanonicalJSON(authorityPayload)
	if err != nil {
		t.Fatalf("CanonicalJSON: %v", err)
	}
	signature := ed25519.Sign(privateKey, canonical)

	// Each timestamp field gets its own clock reading, as in a real bundle.
	stamp := func() string { return time.Now().UTC().Format(time.RFC3339) }
	fingerprint := clusterauth.Fingerprint(publicKey)

	bundle := fmt.Sprintf(`{
"schema_version": "rap.install_join_bundle.v1",
"bundle_kind": "docker",
"cluster_id": "cluster-1",
"cluster_authority": {
"schema_version": "%s",
"cluster_id": "cluster-1",
"authority_state": "active",
"key_algorithm": "%s",
"public_key": "%s",
"public_key_fingerprint": "%s",
"created_at": "%s",
"updated_at": "%s"
},
"authority_payload": %s,
"authority_signature": {
"schema_version": "%s",
"algorithm": "%s",
"key_fingerprint": "%s",
"signature": "%s",
"signed_at": "%s"
},
"docker_install_profile": %s
}`,
		clusterauth.AuthoritySchemaVersion,
		clusterauth.AlgorithmEd25519,
		base64.StdEncoding.EncodeToString(publicKey),
		fingerprint,
		stamp(),
		stamp(),
		string(authorityPayload),
		clusterauth.SignatureSchemaVersion,
		clusterauth.AlgorithmEd25519,
		fingerprint,
		base64.StdEncoding.EncodeToString(signature),
		stamp(),
		mustBundleJSON(t, signedProfile),
	)

	path := filepath.Join(t.TempDir(), "bundle.json")
	if err := os.WriteFile(path, []byte(bundle), 0o600); err != nil {
		t.Fatalf("write bundle: %v", err)
	}

	_, err = LoadDockerJoinBundle(path)
	if err != nil && strings.Contains(err.Error(), "profile cluster_id does not match signed bundle cluster_id") {
		return
	}
	t.Fatalf("expected cluster mismatch error, got %v", err)
}
// mustBundleJSON marshals value to JSON and returns it as a string, failing
// the calling test immediately if marshalling is not possible.
func mustBundleJSON(t *testing.T, value any) string {
	// Report failures at the caller's line rather than inside this helper.
	t.Helper()

	encoded, marshalErr := json.Marshal(value)
	if marshalErr != nil {
		t.Fatalf("marshal bundle json: %v", marshalErr)
	}
	return string(encoded)
}
// TestValidateRejectsDisallowedMeshAdvertiseTransport checks that
// ValidateInstall rejects a config that advertises a quic:// endpoint but
// declares the "wss" transport.
func TestValidateRejectsDisallowedMeshAdvertiseTransport(t *testing.T) {
	cfg := RuntimeConfig{
		ClusterID:                 "cluster-1",
		ClusterAuthorityPublicKey: "authority-key-b64",
		FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
		JoinToken:                 "join-secret",
		NodeName:                  "node-a",
		MeshAdvertiseEndpoint:     "quic://10.0.0.11:19443",
		MeshAdvertiseTransport:    "wss",
		MeshQUICFabricEnabled:     true,
		MeshQUICFabricListenAddr:  ":19443",
	}

	err := cfg.ValidateInstall()
	if err != nil && strings.Contains(err.Error(), "QUIC transport") {
		return
	}
	t.Fatalf("expected QUIC transport validation error, got %v", err)
}
func TestValidateRejectsLegacyMeshAdvertiseEndpointScheme(t *testing.T) {
func TestValidateRejectsDisallowedMeshAdvertiseEndpointScheme(t *testing.T) {
err := RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
JoinToken: "join-secret",
NodeName: "node-a",
MeshAdvertiseEndpoint: "http://10.0.0.11:19131",
MeshAdvertiseTransport: "direct_quic",
MeshQUICFabricEnabled: true,
MeshQUICFabricListenAddr: ":19443",
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1"}]`,
JoinToken: "join-secret",
NodeName: "node-a",
MeshAdvertiseEndpoint: "http://10.0.0.11:19131",
MeshAdvertiseTransport: "direct_quic",
MeshQUICFabricEnabled: true,
MeshQUICFabricListenAddr: ":19443",
}.ValidateInstall()
if err == nil || !strings.Contains(err.Error(), "QUIC endpoint") {
t.Fatalf("expected QUIC endpoint validation error, got %v", err)
}
}
// TestPreferredUpdateServiceEndpointsPrioritizesHintOrder checks that
// preferredUpdateServiceEndpoints keeps all three endpoints and places the
// hinted addresses first, in the order the hints were given.
func TestPreferredUpdateServiceEndpointsPrioritizesHintOrder(t *testing.T) {
	input := []mesh.FabricRegistryEndpoint{
		{EndpointID: "ep-1", Address: "quic://10.0.0.5:19443"},
		{EndpointID: "ep-2", Address: "quic://10.0.0.6:19443"},
		{EndpointID: "ep-3", Address: "quic://10.0.0.7:19443"},
	}
	hints := []string{
		"quic://10.0.0.7:19443",
		"quic://10.0.0.5:19443",
	}

	ordered := preferredUpdateServiceEndpoints(input, hints)
	if got := len(ordered); got != 3 {
		t.Fatalf("ordered len = %d", got)
	}
	// The leading entries must follow the hint order exactly.
	for position, want := range hints {
		if ordered[position].Address != want {
			t.Fatalf("unexpected preferred ordering: %+v", ordered)
		}
	}
}
+221 -28
View File
@@ -2,6 +2,7 @@ package hostagent
import (
"context"
"encoding/json"
"errors"
"fmt"
"os"
@@ -47,6 +48,7 @@ type LinuxInstallResult struct {
NodeAgentPath string
HostAgentPath string
EnvPath string
UpdaterEnvPath string
UnitName string
UnitPath string
UpdaterUnitName string
@@ -64,13 +66,14 @@ func LinuxInstallConfigFromProfile(profile LinuxInstallProfile) LinuxInstallConf
installDir := firstNonEmpty(profile.InstallDir, filepath.Join(DefaultLinuxInstallRoot, safeUnitSlug(profile.NodeName)))
return LinuxInstallConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: profile.BackendURL,
ClusterAuthorityPublicKey: strings.TrimSpace(profile.ClusterAuthorityPublicKey),
FabricRegistryRecordsJSON: strings.TrimSpace(string(profile.FabricRegistryRecords)),
ClusterID: profile.ClusterID,
JoinToken: profile.JoinToken,
NodeName: profile.NodeName,
StateDir: stateDir,
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
FabricRuntimeEnabled: profile.FabricRuntimeEnabled,
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
VPNFabricSessionTransportEnabled: profile.VPNFabricSessionTransportEnabled,
MeshQUICFabricEnabled: profile.MeshQUICFabricEnabled,
@@ -78,15 +81,18 @@ func LinuxInstallConfigFromProfile(profile LinuxInstallProfile) LinuxInstallConf
VPNFabricSessionStreamShards: profile.VPNFabricSessionStreamShards,
VPNFabricQUICMaxStreamsPerConn: profile.VPNFabricQUICMaxStreamsPerConn,
VPNFabricQUICIdleTTLSeconds: profile.VPNFabricQUICIdleTTLSeconds,
MeshListenAddr: profile.MeshListenAddr,
MeshListenPortMode: profile.MeshListenPortMode,
MeshListenAutoPortStart: profile.MeshListenAutoPortStart,
MeshListenAutoPortEnd: profile.MeshListenAutoPortEnd,
FabricListenAddr: profile.FabricListenAddr,
FabricListenPortMode: profile.FabricListenPortMode,
FabricListenAutoPortStart: profile.FabricListenAutoPortStart,
FabricListenAutoPortEnd: profile.FabricListenAutoPortEnd,
MeshAdvertiseEndpoint: profile.MeshAdvertiseEndpoint,
MeshAdvertiseEndpointsJSON: string(profile.MeshAdvertiseEndpointsJSON),
MeshAdvertiseTransport: profile.MeshAdvertiseTransport,
MeshConnectivityMode: profile.MeshConnectivityMode,
MeshNATType: profile.MeshNATType,
MeshSiteID: profile.MeshSiteID,
MeshLocalityGroupID: firstNonEmpty(profile.MeshLocalityGroupID, profile.MeshSiteID),
MeshNATGroupID: profile.MeshNATGroupID,
MeshRegion: profile.MeshRegion,
HeartbeatIntervalSeconds: profile.HeartbeatIntervalSeconds,
EnrollmentPollIntervalSeconds: profile.EnrollmentPollIntervalSeconds,
@@ -152,15 +158,16 @@ func (m LinuxManager) Install(ctx context.Context, cfg LinuxInstallConfig) (Linu
cfg.StartupMode = strings.ToLower(firstNonEmpty(cfg.StartupMode, "systemd"))
unitName := "rap-node-agent-" + slug + ".service"
result := LinuxInstallResult{
NodeName: cfg.RuntimeConfig.NodeName,
InstallDir: cfg.InstallDir,
StateDir: cfg.StateDir,
ConfigDir: cfg.ConfigDir,
NodeAgentPath: filepath.Join(cfg.InstallDir, "rap-node-agent"),
HostAgentPath: filepath.Join(cfg.InstallDir, "rap-host-agent"),
EnvPath: filepath.Join(cfg.ConfigDir, "rap-node-agent.env"),
UnitName: unitName,
UnitPath: filepath.Join(cfg.UnitDir, unitName),
NodeName: cfg.RuntimeConfig.NodeName,
InstallDir: cfg.InstallDir,
StateDir: cfg.StateDir,
ConfigDir: cfg.ConfigDir,
NodeAgentPath: filepath.Join(cfg.InstallDir, "rap-node-agent"),
HostAgentPath: filepath.Join(cfg.InstallDir, "rap-host-agent"),
EnvPath: filepath.Join(cfg.ConfigDir, "rap-node-agent.env"),
UpdaterEnvPath: filepath.Join(cfg.ConfigDir, "rap-host-agent-updater.env"),
UnitName: unitName,
UnitPath: filepath.Join(cfg.UnitDir, unitName),
}
if cfg.DryRun {
return result, nil
@@ -273,7 +280,7 @@ func installLinuxHostAgentUpdater(ctx context.Context, m LinuxManager, result Li
}
interval := cfg.AutoUpdateIntervalSeconds
if interval == 0 {
interval = 21600
interval = DefaultUpdateIntervalSec
}
initialDelay := cfg.AutoUpdateInitialDelaySeconds
if initialDelay == 0 {
@@ -301,16 +308,16 @@ func installLinuxHostAgentUpdater(ctx context.Context, m LinuxManager, result Li
"--host-agent-current-version", firstNonEmpty(cfg.AutoUpdateCurrentVersion, "0.0.0"),
"--host-agent-binary-path", result.HostAgentPath,
}
if strings.TrimSpace(cfg.RuntimeConfig.BackendURL) != "" {
args = append(args, "--backend-url", strings.TrimSpace(cfg.RuntimeConfig.BackendURL))
}
args = appendFabricUpdateArgs(args, cfg.RuntimeConfig)
args = appendFabricUpdateArgs(args, cfg.RuntimeConfig, false)
if strings.TrimSpace(cfg.NodeID) != "" {
args = append(args, "--node-id", strings.TrimSpace(cfg.NodeID))
}
if strings.TrimSpace(cfg.AutoUpdateChannel) != "" {
args = append(args, "--channel", strings.TrimSpace(cfg.AutoUpdateChannel))
}
if err := os.WriteFile(result.UpdaterEnvPath, []byte(linuxHostAgentUpdaterEnvFile(cfg.RuntimeConfig)), 0o600); err != nil {
return result, err
}
unitName := "rap-host-agent-updater-" + safeUnitSlug(result.NodeName) + ".service"
unitPath := filepath.Join(firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir), unitName)
unit := fmt.Sprintf(`[Unit]
@@ -320,13 +327,14 @@ Wants=network-online.target
[Service]
Type=simple
EnvironmentFile=%s
ExecStart=%s
Restart=always
RestartSec=30
[Install]
WantedBy=multi-user.target
`, result.NodeName, result.UnitName, systemdJoin(args))
`, result.NodeName, result.UnitName, systemdQuote(result.UpdaterEnvPath), systemdJoin(args))
if err := os.WriteFile(unitPath, []byte(unit), 0o644); err != nil {
return result, err
}
@@ -359,12 +367,22 @@ func (m LinuxManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Updat
result := UpdateResult{Action: plan.Action, Reason: plan.Reason, TargetVersion: plan.TargetVersion, ContainerName: req.SystemdUnitName, NewImage: req.BinaryPath}
if plan.Action != "update" {
if !req.DryRun {
restarted, err := rewriteLinuxControlPlaneRuntime(ctx, m.runner(), req, plan)
if err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "rewrite_runtime", "failed", err))
return result, err
}
result.RestartNeeded = restarted
}
if !req.DryRun {
_ = saveUpdatePlanState(req, plan, req.CurrentVersion, req.SystemdUnitName, req.BinaryPath)
status := statusFromNoopPlan(req, plan)
if status.Payload == nil {
status.Payload = map[string]any{}
}
status.Payload["systemd_unit"] = req.SystemdUnitName
status.Payload["binary_path"] = req.BinaryPath
status.Payload["restart_needed"] = result.RestartNeeded
_ = ReportNodeUpdateStatusForRequest(ctx, req, status)
}
return result, nil
@@ -387,14 +405,14 @@ func (m LinuxManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Updat
if req.DryRun {
return result, nil
}
urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL)
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "download", Status: "started", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": req.BinaryPath}})
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "download", Status: "started", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"artifact_id": plan.Artifact.ID, "binary_path": req.BinaryPath, "transport": updateArtifactTransport(req, plan)}})
path, distributors, err := downloadUpdateArtifact(ctx, req, plan)
if err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "download", "failed", err))
return result, err
}
defer os.Remove(path)
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "download", Status: "succeeded", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"artifact_id": plan.Artifact.ID, "binary_path": req.BinaryPath, "fabric_distributors": distributors, "transport": updateArtifactTransport(req, plan)}})
runner := m.runner()
_, _ = runner.Run(ctx, "systemctl", "stop", req.SystemdUnitName)
if err := copyFile(path, req.BinaryPath, 0o755); err != nil {
@@ -402,15 +420,183 @@ func (m LinuxManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Updat
return result, err
}
result.Replaced = true
if _, err := runner.Run(ctx, "systemctl", "restart", req.SystemdUnitName); err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "restart", "failed", err))
restartedByRewrite, err := rewriteLinuxControlPlaneRuntime(ctx, runner, req, plan)
if err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "rewrite_runtime", "failed", err))
return result, err
}
result.RestartNeeded = restartedByRewrite
if !restartedByRewrite {
if _, err := runner.Run(ctx, "systemctl", "restart", req.SystemdUnitName); err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "restart", "failed", err))
return result, err
}
}
if err := ensureLinuxUnitActive(ctx, runner, req.SystemdUnitName); err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "health_check", "failed", err))
return result, err
}
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{Product: req.Product, CurrentVersion: req.CurrentVersion, TargetVersion: plan.TargetVersion, Phase: "health_check", Status: "succeeded", AttemptID: updateAttemptID(plan), ObservedAt: time.Now().UTC(), Payload: map[string]any{"systemd_unit": req.SystemdUnitName, "binary_path": req.BinaryPath}})
_ = saveUpdateState(req.StateDir, UpdateState{Product: req.Product, CurrentVersion: plan.TargetVersion, TargetVersion: plan.TargetVersion, Image: req.BinaryPath, UpdatedAt: time.Now().UTC()})
_ = saveUpdatePlanState(req, plan, plan.TargetVersion, req.SystemdUnitName, req.BinaryPath)
return result, nil
}
// linuxHostAgentUpdaterEnvFile renders the contents of the host-agent updater
// environment file. It emits a single RAP_FABRIC_REGISTRY_RECORDS_JSON line
// (quoted with systemdQuote, newline-terminated) when the trimmed registry
// JSON is non-empty, and the empty string otherwise.
func linuxHostAgentUpdaterEnvFile(cfg RuntimeConfig) string {
	var content strings.Builder
	if registry := strings.TrimSpace(cfg.FabricRegistryRecordsJSON); registry != "" {
		content.WriteString("RAP_FABRIC_REGISTRY_RECORDS_JSON=")
		content.WriteString(systemdQuote(registry))
		content.WriteString("\n")
	}
	return content.String()
}
// ensureLinuxUnitActive runs `systemctl is-active <unit>` through runner and
// returns an error unless the trimmed output is exactly "active". A blank unit
// name is treated as nothing to check. An error from the command itself is
// returned unchanged.
func ensureLinuxUnitActive(ctx context.Context, runner CommandRunner, unitName string) error {
	unit := strings.TrimSpace(unitName)
	if unit == "" {
		return nil
	}

	out, err := runner.Run(ctx, "systemctl", "is-active", unit)
	if err != nil {
		return err
	}

	if state := strings.TrimSpace(out); state != "active" {
		return fmt.Errorf("systemd unit %s is not active: %s", unit, state)
	}
	return nil
}
// rewriteLinuxControlPlaneRuntime persists the control-plane data carried by
// plan and propagates the plan's fabric registry records into the on-disk
// configuration of the node agent and of the host-agent updater.
//
// Steps, in order:
//  1. Save a ControlPlaneRuntimeState snapshot under req.StateDir (best
//     effort: the returned error is discarded).
//  2. Derive the slug from req.SystemdUnitName by stripping the ".service"
//     suffix and the "rap-node-agent-" prefix; an empty slug returns
//     (false, nil).
//  3. When the plan carries registry records, upsert
//     RAP_FABRIC_REGISTRY_RECORDS_JSON into the node-agent env file (only if
//     that file already exists) and into the updater env file (created,
//     together with its directory, when missing).
//  4. When the updater unit file exists, pass it through replaceCLIArg for
//     "--fabric-registry-records-json" and make sure it has an
//     EnvironmentFile= line; if the unit text changed, write it, run
//     `systemctl daemon-reload`, and restart the updater unit.
//  5. Restart req.SystemdUnitName whenever an env file changed. Restart
//     errors are ignored.
//
// Returns envChanged on the early exits and true after the updater unit has
// been rewritten. File read/write failures and a failing daemon-reload are
// returned as (false, err).
//
// NOTE(review): on the final path the function returns true even when
// envChanged is false, in which case req.SystemdUnitName is NOT restarted
// here. ApplyUpdate skips its own restart when this function returns true —
// confirm that combination cannot leave the node-agent unit stopped.
func rewriteLinuxControlPlaneRuntime(ctx context.Context, runner CommandRunner, req UpdateRequest, plan NodeUpdatePlan) (bool, error) {
// Best-effort snapshot; the raw JSON fields are copied so the saved state
// does not alias the plan's backing arrays.
_ = saveControlPlaneRuntimeState(req.StateDir, ControlPlaneRuntimeState{
SchemaVersion: "rap.control_plane_runtime_state.v1",
ClusterID: strings.TrimSpace(plan.ClusterID),
NodeID: strings.TrimSpace(plan.NodeID),
Product: strings.TrimSpace(plan.Product),
FabricRegistryRecords: append(json.RawMessage(nil), plan.FabricRegistryRecords...),
AuthorityPayload: append(json.RawMessage(nil), plan.AuthorityPayload...),
AuthoritySignature: append(json.RawMessage(nil), plan.AuthoritySignature...),
AuthorityQuorum: plan.AuthorityQuorum,
UpdatedAt: time.Now().UTC(),
})
// "rap-node-agent-<slug>.service" -> "<slug>".
slug := strings.TrimSuffix(strings.TrimSpace(req.SystemdUnitName), ".service")
slug = strings.TrimPrefix(slug, "rap-node-agent-")
if slug == "" {
return false, nil
}
envChanged := false
envPath := filepath.Join(DefaultLinuxConfigRoot, slug, "rap-node-agent.env")
wantRegistry := strings.TrimSpace(string(plan.FabricRegistryRecords))
// Node-agent env file: updated in place, never created here.
if wantRegistry != "" && fileExists(envPath) {
current, err := os.ReadFile(envPath)
if err != nil {
return false, err
}
updatedEnv := string(current)
updatedEnv = upsertEnvFileValue(updatedEnv, "RAP_FABRIC_REGISTRY_RECORDS_JSON", wantRegistry)
if updatedEnv != string(current) {
if err := os.WriteFile(envPath, []byte(updatedEnv), 0o600); err != nil {
return false, err
}
envChanged = true
}
}
updaterUnitName := "rap-host-agent-updater-" + safeUnitSlug(slug) + ".service"
updaterUnitPath := filepath.Join(DefaultSystemdUnitDir, updaterUnitName)
updaterEnvPath := filepath.Join(DefaultLinuxConfigRoot, slug, "rap-host-agent-updater.env")
// Updater env file: created when missing, otherwise updated in place.
if wantRegistry != "" {
current := ""
if fileExists(updaterEnvPath) {
payload, err := os.ReadFile(updaterEnvPath)
if err != nil {
return false, err
}
current = string(payload)
}
updatedEnv := upsertEnvFileValue(current, "RAP_FABRIC_REGISTRY_RECORDS_JSON", wantRegistry)
if updatedEnv != current {
if err := os.MkdirAll(filepath.Dir(updaterEnvPath), 0o755); err != nil {
return false, err
}
if err := os.WriteFile(updaterEnvPath, []byte(updatedEnv), 0o600); err != nil {
return false, err
}
envChanged = true
}
}
// NOTE(review): both env rewrites above are guarded by wantRegistry != "",
// so envChanged is always false in this branch and the restart below is
// unreachable as written.
if wantRegistry == "" {
if envChanged && strings.TrimSpace(req.SystemdUnitName) != "" {
_, _ = runner.Run(ctx, "systemctl", "restart", req.SystemdUnitName)
}
return envChanged, nil
}
// No updater unit installed: only the node agent may need a restart.
if !fileExists(updaterUnitPath) {
if envChanged && strings.TrimSpace(req.SystemdUnitName) != "" {
_, _ = runner.Run(ctx, "systemctl", "restart", req.SystemdUnitName)
}
return envChanged, nil
}
current, err := os.ReadFile(updaterUnitPath)
if err != nil {
return false, err
}
// Presumably replaceCLIArg with an empty value strips the inline registry
// argument so the value comes from the env file instead — confirm against
// replaceCLIArg.
updated := ensureSystemdEnvironmentFile(replaceCLIArg(string(current), "--fabric-registry-records-json", "", false), updaterEnvPath)
// Unit text already up to date: no daemon-reload or updater restart.
if updated == string(current) {
if envChanged && strings.TrimSpace(req.SystemdUnitName) != "" {
_, _ = runner.Run(ctx, "systemctl", "restart", req.SystemdUnitName)
}
return envChanged, nil
}
if err := os.WriteFile(updaterUnitPath, []byte(updated), 0o644); err != nil {
return false, err
}
// systemd must reload unit files before the rewritten unit takes effect.
if _, err := runner.Run(ctx, "systemctl", "daemon-reload"); err != nil {
return false, err
}
_, _ = runner.Run(ctx, "systemctl", "restart", updaterUnitName)
if envChanged && strings.TrimSpace(req.SystemdUnitName) != "" {
_, _ = runner.Run(ctx, "systemctl", "restart", req.SystemdUnitName)
}
return true, nil
}
// ensureSystemdEnvironmentFile returns unit with an EnvironmentFile= line for
// envPath inserted, unless envPath is blank or the unit already contains any
// "EnvironmentFile=" text. The line is placed directly after the first
// "Type=simple" line, or failing that after the first "[Service]" header; when
// neither anchor is present the unit is returned unchanged.
func ensureSystemdEnvironmentFile(unit string, envPath string) string {
	path := strings.TrimSpace(envPath)
	if path == "" {
		return unit
	}
	if strings.Contains(unit, "EnvironmentFile=") {
		return unit
	}

	directive := "EnvironmentFile=" + systemdQuote(path) + "\n"
	// Anchors in order of preference; only the first occurrence is extended.
	for _, anchor := range []string{"Type=simple\n", "[Service]\n"} {
		if strings.Contains(unit, anchor) {
			return strings.Replace(unit, anchor, anchor+directive, 1)
		}
	}
	return unit
}
// upsertEnvFileValue sets, replaces or removes the KEY=value entry for key in
// an env-file payload and returns the resulting text.
//
//   - If a line whose trimmed text starts with "key=" exists, the first such
//     line is replaced by key=<systemdQuote(value)>, or removed when value is
//     empty. All other lines are left untouched.
//   - If no such line exists and value is empty, payload is returned as is.
//   - Otherwise a new newline-terminated entry is appended. A separating
//     newline is added only when the payload does not already end with one, so
//     the result never gains a blank line and always ends with "\n".
func upsertEnvFileValue(payload string, key string, value string) string {
	prefix := key + "="
	lines := strings.Split(payload, "\n")
	for i, line := range lines {
		// TrimSpace also strips a trailing "\r" left over from CRLF files.
		if !strings.HasPrefix(strings.TrimSpace(line), prefix) {
			continue
		}
		if value == "" {
			lines = append(lines[:i], lines[i+1:]...)
		} else {
			lines[i] = prefix + systemdQuote(value)
		}
		return strings.Join(lines, "\n")
	}
	if value == "" {
		return payload
	}
	entry := prefix + systemdQuote(value) + "\n"
	if payload == "" || strings.HasSuffix(payload, "\n") {
		return payload + entry
	}
	return payload + "\n" + entry
}
func (m LinuxManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig) error {
req := cfg.Request
req.InstallType = firstNonEmpty(req.InstallType, BinaryUpdateInstallType)
@@ -421,6 +607,9 @@ func (m LinuxManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig) e
}
func runLinuxUpdateLoop(ctx context.Context, m LinuxManager, cfg UpdateLoopConfig) error {
if err := ReconcileSignedUpdateState(cfg.Request.StateDir); err != nil {
return err
}
if cfg.Interval == 0 {
cfg.Interval = time.Hour
}
@@ -450,6 +639,7 @@ func runLinuxUpdateLoop(ctx context.Context, m LinuxManager, cfg UpdateLoopConfi
continue
} else {
logf("linux_update_loop run=%d status=failed error=%v", runs, err)
saveUpdateLoopRescueState(cfg.Request, "linux_node_agent_update_failed", err)
if cfg.StopOnError {
return err
}
@@ -462,10 +652,12 @@ func runLinuxUpdateLoop(ctx context.Context, m LinuxManager, cfg UpdateLoopConfi
}
if cfg.HostAgentUpdateEnabled {
hostReq := cfg.HostAgentUpdateRequest
hostReq.BackendURL = firstNonEmpty(hostReq.BackendURL, cfg.Request.BackendURL)
hostReq.ClusterID = firstNonEmpty(hostReq.ClusterID, cfg.Request.ClusterID)
hostReq.NodeID = firstNonEmpty(hostReq.NodeID, cfg.Request.NodeID)
hostReq.StateDir = firstNonEmpty(hostReq.StateDir, cfg.Request.StateDir)
hostReq.ClusterAuthorityPublicKey = firstNonEmpty(hostReq.ClusterAuthorityPublicKey, cfg.Request.ClusterAuthorityPublicKey)
hostReq.FabricRegistryRecordsJSON = firstNonEmpty(hostReq.FabricRegistryRecordsJSON, cfg.Request.FabricRegistryRecordsJSON)
hostReq.MeshRegion = firstNonEmpty(hostReq.MeshRegion, cfg.Request.MeshRegion)
hostReq.Channel = firstNonEmpty(hostReq.Channel, cfg.Request.Channel)
hostReq.OS = firstNonEmpty(hostReq.OS, "linux")
hostReq.Arch = firstNonEmpty(hostReq.Arch, runtime.GOARCH)
@@ -473,6 +665,7 @@ func runLinuxUpdateLoop(ctx context.Context, m LinuxManager, cfg UpdateLoopConfi
hostResult, hostErr := (DockerManager{}).ApplyHostAgentUpdate(ctx, hostReq)
if hostErr != nil {
logf("linux_host_agent_update_loop run=%d status=failed error=%v", runs, hostErr)
saveUpdateLoopRescueState(cfg.Request, "linux_host_agent_update_failed", hostErr)
} else {
logf("linux_host_agent_update_loop run=%d action=%s reason=%s target=%s binary=%s replaced=%t restart_needed=%t", runs, hostResult.Action, hostResult.Reason, hostResult.TargetVersion, hostResult.NewImage, hostResult.Replaced, hostResult.RestartNeeded)
if hostResult.Action == "update" && hostResult.TargetVersion != "" && !hostResult.RolledBack {
@@ -31,7 +31,6 @@ const (
)
type MonitorConfig struct {
BackendURL string
ClusterID string
NodeID string
StateDir string
@@ -198,7 +197,6 @@ func RunMonitorOnce(ctx context.Context, cfg MonitorConfig) MonitorResult {
}
func normalizeMonitorConfig(cfg MonitorConfig) MonitorConfig {
cfg.BackendURL = strings.TrimRight(strings.TrimSpace(cfg.BackendURL), "/")
cfg.ClusterID = strings.TrimSpace(cfg.ClusterID)
cfg.NodeID = strings.TrimSpace(cfg.NodeID)
cfg.StateDir = strings.TrimSpace(cfg.StateDir)
@@ -398,7 +396,7 @@ func reportMonitorStatus(ctx context.Context, cfg MonitorConfig, result MonitorR
}
return err
}
if cfg.BackendURL == "" || clusterID == "" || nodeID == "" {
if strings.TrimSpace(cfg.FabricRegistryRecordsJSON) == "" || strings.TrimSpace(cfg.ClusterAuthorityPublicKey) == "" || clusterID == "" || nodeID == "" {
return nil
}
payload := map[string]any{
@@ -425,7 +423,6 @@ func reportMonitorStatus(ctx context.Context, cfg MonitorConfig, result MonitorR
req.ErrorMessage = &errText
}
return ReportNodeUpdateStatusForRequest(ctx, UpdateRequest{
BackendURL: cfg.BackendURL,
ClusterID: clusterID,
NodeID: nodeID,
StateDir: cfg.StateDir,
@@ -2,19 +2,39 @@ package hostagent
import (
"bytes"
"context"
"crypto/ed25519"
"encoding/base64"
"encoding/json"
"fmt"
"net/http"
"os"
"path/filepath"
"strings"
"time"
clusterauth "github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority"
)
// trimProfileEndpointSlice normalizes a list of endpoint strings: surrounding
// whitespace and trailing slashes are removed, empty results are dropped, and
// duplicates are removed while keeping the first occurrence's position. The
// returned slice is never nil.
func trimProfileEndpointSlice(items []string) []string {
	known := make(map[string]bool)
	cleaned := make([]string, 0, len(items))
	for _, raw := range items {
		endpoint := strings.TrimRight(strings.TrimSpace(raw), "/")
		if endpoint == "" || known[endpoint] {
			continue
		}
		known[endpoint] = true
		cleaned = append(cleaned, endpoint)
	}
	return cleaned
}
type DockerInstallProfile struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
ClusterAuthorityPublicKey string `json:"cluster_authority_public_key"`
ArtifactEndpoints []string `json:"artifact_endpoints"`
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records"`
DockerImageArtifact *DockerArtifact `json:"docker_image_artifact"`
@@ -29,7 +49,7 @@ type DockerInstallProfile struct {
Replace bool `json:"replace"`
DockerVPNGatewayEnabled bool `json:"docker_vpn_gateway_enabled"`
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
FabricRuntimeEnabled bool `json:"fabric_runtime_enabled"`
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
VPNFabricSessionTransportEnabled bool `json:"vpn_fabric_session_transport_enabled"`
MeshQUICFabricEnabled bool `json:"mesh_quic_fabric_enabled"`
@@ -37,15 +57,18 @@ type DockerInstallProfile struct {
VPNFabricSessionStreamShards int `json:"vpn_fabric_session_stream_shards"`
VPNFabricQUICMaxStreamsPerConn int `json:"vpn_fabric_quic_max_streams_per_conn"`
VPNFabricQUICIdleTTLSeconds int `json:"vpn_fabric_quic_idle_ttl_seconds"`
MeshListenAddr string `json:"mesh_listen_addr"`
MeshListenPortMode string `json:"mesh_listen_port_mode"`
MeshListenAutoPortStart int `json:"mesh_listen_auto_port_start"`
MeshListenAutoPortEnd int `json:"mesh_listen_auto_port_end"`
FabricListenAddr string `json:"fabric_listen_addr"`
FabricListenPortMode string `json:"fabric_listen_port_mode"`
FabricListenAutoPortStart int `json:"fabric_listen_auto_port_start"`
FabricListenAutoPortEnd int `json:"fabric_listen_auto_port_end"`
MeshAdvertiseEndpoint string `json:"mesh_advertise_endpoint"`
MeshAdvertiseEndpointsJSON json.RawMessage `json:"mesh_advertise_endpoints_json"`
MeshAdvertiseTransport string `json:"mesh_advertise_transport"`
MeshConnectivityMode string `json:"mesh_connectivity_mode"`
MeshNATType string `json:"mesh_nat_type"`
MeshSiteID string `json:"mesh_site_id"`
MeshLocalityGroupID string `json:"mesh_locality_group_id"`
MeshNATGroupID string `json:"mesh_nat_group_id"`
MeshRegion string `json:"mesh_region"`
HeartbeatIntervalSeconds int `json:"heartbeat_interval_seconds"`
EnrollmentPollIntervalSeconds int `json:"enrollment_poll_interval_seconds"`
@@ -67,8 +90,7 @@ type DockerArtifact struct {
type WindowsInstallProfile struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
ClusterAuthorityPublicKey string `json:"cluster_authority_public_key"`
ArtifactEndpoints []string `json:"artifact_endpoints"`
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records"`
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact"`
@@ -78,7 +100,7 @@ type WindowsInstallProfile struct {
InstallDir string `json:"install_dir"`
StartupMode string `json:"startup_mode"`
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
FabricRuntimeEnabled bool `json:"fabric_runtime_enabled"`
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
VPNFabricSessionTransportEnabled bool `json:"vpn_fabric_session_transport_enabled"`
MeshQUICFabricEnabled bool `json:"mesh_quic_fabric_enabled"`
@@ -86,15 +108,18 @@ type WindowsInstallProfile struct {
VPNFabricSessionStreamShards int `json:"vpn_fabric_session_stream_shards"`
VPNFabricQUICMaxStreamsPerConn int `json:"vpn_fabric_quic_max_streams_per_conn"`
VPNFabricQUICIdleTTLSeconds int `json:"vpn_fabric_quic_idle_ttl_seconds"`
MeshListenAddr string `json:"mesh_listen_addr"`
MeshListenPortMode string `json:"mesh_listen_port_mode"`
MeshListenAutoPortStart int `json:"mesh_listen_auto_port_start"`
MeshListenAutoPortEnd int `json:"mesh_listen_auto_port_end"`
FabricListenAddr string `json:"fabric_listen_addr"`
FabricListenPortMode string `json:"fabric_listen_port_mode"`
FabricListenAutoPortStart int `json:"fabric_listen_auto_port_start"`
FabricListenAutoPortEnd int `json:"fabric_listen_auto_port_end"`
MeshAdvertiseEndpoint string `json:"mesh_advertise_endpoint"`
MeshAdvertiseEndpointsJSON json.RawMessage `json:"mesh_advertise_endpoints_json"`
MeshAdvertiseTransport string `json:"mesh_advertise_transport"`
MeshConnectivityMode string `json:"mesh_connectivity_mode"`
MeshNATType string `json:"mesh_nat_type"`
MeshSiteID string `json:"mesh_site_id"`
MeshLocalityGroupID string `json:"mesh_locality_group_id"`
MeshNATGroupID string `json:"mesh_nat_group_id"`
MeshRegion string `json:"mesh_region"`
HeartbeatIntervalSeconds int `json:"heartbeat_interval_seconds"`
EnrollmentPollIntervalSeconds int `json:"enrollment_poll_interval_seconds"`
@@ -106,8 +131,7 @@ type WindowsInstallProfile struct {
type LinuxInstallProfile struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
BackendURL string `json:"backend_url"`
ControlPlaneEndpoints []string `json:"control_plane_endpoints"`
ClusterAuthorityPublicKey string `json:"cluster_authority_public_key"`
ArtifactEndpoints []string `json:"artifact_endpoints"`
FabricRegistryRecords json.RawMessage `json:"fabric_registry_records"`
NodeAgentArtifact *DockerArtifact `json:"node_agent_artifact"`
@@ -117,7 +141,7 @@ type LinuxInstallProfile struct {
InstallDir string `json:"install_dir"`
StartupMode string `json:"startup_mode"`
WorkloadSupervisionEnabled bool `json:"workload_supervision_enabled"`
MeshSyntheticRuntimeEnabled bool `json:"mesh_synthetic_runtime_enabled"`
FabricRuntimeEnabled bool `json:"fabric_runtime_enabled"`
MeshProductionForwardingEnabled bool `json:"mesh_production_forwarding_enabled"`
VPNFabricSessionTransportEnabled bool `json:"vpn_fabric_session_transport_enabled"`
MeshQUICFabricEnabled bool `json:"mesh_quic_fabric_enabled"`
@@ -125,15 +149,18 @@ type LinuxInstallProfile struct {
VPNFabricSessionStreamShards int `json:"vpn_fabric_session_stream_shards"`
VPNFabricQUICMaxStreamsPerConn int `json:"vpn_fabric_quic_max_streams_per_conn"`
VPNFabricQUICIdleTTLSeconds int `json:"vpn_fabric_quic_idle_ttl_seconds"`
MeshListenAddr string `json:"mesh_listen_addr"`
MeshListenPortMode string `json:"mesh_listen_port_mode"`
MeshListenAutoPortStart int `json:"mesh_listen_auto_port_start"`
MeshListenAutoPortEnd int `json:"mesh_listen_auto_port_end"`
FabricListenAddr string `json:"fabric_listen_addr"`
FabricListenPortMode string `json:"fabric_listen_port_mode"`
FabricListenAutoPortStart int `json:"fabric_listen_auto_port_start"`
FabricListenAutoPortEnd int `json:"fabric_listen_auto_port_end"`
MeshAdvertiseEndpoint string `json:"mesh_advertise_endpoint"`
MeshAdvertiseEndpointsJSON json.RawMessage `json:"mesh_advertise_endpoints_json"`
MeshAdvertiseTransport string `json:"mesh_advertise_transport"`
MeshConnectivityMode string `json:"mesh_connectivity_mode"`
MeshNATType string `json:"mesh_nat_type"`
MeshSiteID string `json:"mesh_site_id"`
MeshLocalityGroupID string `json:"mesh_locality_group_id"`
MeshNATGroupID string `json:"mesh_nat_group_id"`
MeshRegion string `json:"mesh_region"`
HeartbeatIntervalSeconds int `json:"heartbeat_interval_seconds"`
EnrollmentPollIntervalSeconds int `json:"enrollment_poll_interval_seconds"`
@@ -143,152 +170,188 @@ type LinuxInstallProfile struct {
}
type ProfileRequest struct {
URL string
ClusterID string
InstallToken string
NodeName string
HTTPClient *http.Client
ClusterID string
NodeName string
}
func FetchDockerInstallProfile(ctx context.Context, req ProfileRequest) (DockerInstallProfile, error) {
url := strings.TrimRight(strings.TrimSpace(req.URL), "/")
if url == "" || strings.TrimSpace(req.InstallToken) == "" {
return DockerInstallProfile{}, fmt.Errorf("profile-url and install-token are required")
}
if !strings.HasSuffix(url, "/node-agents/docker-install-profile") {
url += "/node-agents/docker-install-profile"
}
body, err := json.Marshal(map[string]string{
"cluster_id": strings.TrimSpace(req.ClusterID),
"install_token": strings.TrimSpace(req.InstallToken),
"node_name": strings.TrimSpace(req.NodeName),
})
if err != nil {
return DockerInstallProfile{}, err
}
httpClient := req.HTTPClient
if httpClient == nil {
httpClient = &http.Client{Timeout: 20 * time.Second}
}
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
if err != nil {
return DockerInstallProfile{}, err
}
httpReq.Header.Set("Content-Type", "application/json")
resp, err := httpClient.Do(httpReq)
if err != nil {
return DockerInstallProfile{}, err
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return DockerInstallProfile{}, fmt.Errorf("fetch docker install profile: %s", resp.Status)
}
var envelope struct {
Profile DockerInstallProfile `json:"docker_install_profile"`
}
if err := json.NewDecoder(resp.Body).Decode(&envelope); err != nil {
return DockerInstallProfile{}, err
}
if strings.TrimSpace(envelope.Profile.BackendURL) == "" && len(envelope.Profile.ControlPlaneEndpoints) > 0 {
envelope.Profile.BackendURL = envelope.Profile.ControlPlaneEndpoints[0]
}
return envelope.Profile, nil
// JoinBundle groups the per-platform install profiles that a join bundle can
// carry. Every field is a pointer tagged omitempty, so a nil profile is left
// out of the encoded JSON. The JSON keys are the same keys the Load*JoinBundle
// helpers in this file pass to loadJoinBundleProfile.
type JoinBundle struct {
// Profile stored under "docker_install_profile"; nil when absent.
DockerInstallProfile *DockerInstallProfile `json:"docker_install_profile,omitempty"`
// Profile stored under "windows_install_profile"; nil when absent.
WindowsInstallProfile *WindowsInstallProfile `json:"windows_install_profile,omitempty"`
// Profile stored under "linux_install_profile"; nil when absent.
LinuxInstallProfile *LinuxInstallProfile `json:"linux_install_profile,omitempty"`
}
func FetchWindowsInstallProfile(ctx context.Context, req ProfileRequest) (WindowsInstallProfile, error) {
url := strings.TrimRight(strings.TrimSpace(req.URL), "/")
if url == "" || strings.TrimSpace(req.InstallToken) == "" {
return WindowsInstallProfile{}, fmt.Errorf("profile-url and install-token are required")
}
if !strings.HasSuffix(url, "/node-agents/windows-install-profile") {
url += "/node-agents/windows-install-profile"
}
body, err := json.Marshal(map[string]string{
"cluster_id": strings.TrimSpace(req.ClusterID),
"install_token": strings.TrimSpace(req.InstallToken),
"node_name": strings.TrimSpace(req.NodeName),
})
if err != nil {
return WindowsInstallProfile{}, err
}
httpClient := req.HTTPClient
if httpClient == nil {
httpClient = &http.Client{Timeout: 20 * time.Second}
}
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
if err != nil {
return WindowsInstallProfile{}, err
}
httpReq.Header.Set("Content-Type", "application/json")
resp, err := httpClient.Do(httpReq)
if err != nil {
return WindowsInstallProfile{}, err
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return WindowsInstallProfile{}, fmt.Errorf("fetch windows install profile: %s", resp.Status)
}
var envelope struct {
Profile WindowsInstallProfile `json:"windows_install_profile"`
}
if err := json.NewDecoder(resp.Body).Decode(&envelope); err != nil {
return WindowsInstallProfile{}, err
}
if strings.TrimSpace(envelope.Profile.BackendURL) == "" && len(envelope.Profile.ControlPlaneEndpoints) > 0 {
envelope.Profile.BackendURL = envelope.Profile.ControlPlaneEndpoints[0]
}
return envelope.Profile, nil
// ClusterAuthorityDescriptor is the JSON shape of the "cluster_authority"
// object of a join bundle envelope (see joinBundleEnvelope.ClusterAuthority).
type ClusterAuthorityDescriptor struct {
SchemaVersion string `json:"schema_version"`
// ClusterID must be non-empty and equal to the envelope's cluster_id for
// verifyJoinBundleEnvelope to accept the bundle.
ClusterID string `json:"cluster_id"`
AuthorityState string `json:"authority_state"`
KeyAlgorithm string `json:"key_algorithm"`
// PublicKey is handed to clusterauth.VerifyRaw; when a fingerprint is set,
// verifyJoinBundleEnvelope also decodes it as standard base64 and requires
// ed25519.PublicKeySize bytes.
PublicKey string `json:"public_key"`
// PublicKeyFingerprint is optional; when non-blank it must equal
// clusterauth.Fingerprint of the decoded PublicKey.
PublicKeyFingerprint string `json:"public_key_fingerprint"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
func FetchLinuxInstallProfile(ctx context.Context, req ProfileRequest) (LinuxInstallProfile, error) {
url := strings.TrimRight(strings.TrimSpace(req.URL), "/")
if url == "" || strings.TrimSpace(req.InstallToken) == "" {
return LinuxInstallProfile{}, fmt.Errorf("profile-url and install-token are required")
// ClusterSignature is the JSON shape of the "authority_signature" object of a
// join bundle envelope. verifyJoinBundleEnvelope copies SchemaVersion,
// Algorithm, KeyFingerprint and Signature into a clusterauth.Signature before
// verification; SignedAt is decoded but not passed along there.
type ClusterSignature struct {
SchemaVersion string `json:"schema_version"`
Algorithm string `json:"algorithm"`
KeyFingerprint string `json:"key_fingerprint"`
Signature string `json:"signature"`
SignedAt time.Time `json:"signed_at"`
}
// joinBundleEnvelope captures the signing metadata found at the top level of
// a join bundle file. verifyJoinBundleEnvelope rejects a bundle unless
// ClusterAuthority, AuthorityPayload and AuthoritySignature are all present.
type joinBundleEnvelope struct {
SchemaVersion string `json:"schema_version,omitempty"`
BundleKind string `json:"bundle_kind,omitempty"`
// ClusterID must be non-empty and match ClusterAuthority.ClusterID.
ClusterID string `json:"cluster_id,omitempty"`
ClusterAuthority *ClusterAuthorityDescriptor `json:"cluster_authority,omitempty"`
// AuthorityPayload is kept as raw bytes: the signature is verified over
// these bytes and they are then decoded as a map from profile key to the
// signed profile JSON.
AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"`
AuthoritySignature *ClusterSignature `json:"authority_signature,omitempty"`
}
// joinBundleProfileIdentity decodes only the identity fields of an install
// profile. verifyJoinBundleEnvelope uses it to check that the profile's
// cluster_id and cluster_authority_public_key are non-blank and agree with
// the envelope's cluster ID and authority public key.
type joinBundleProfileIdentity struct {
ClusterID string `json:"cluster_id"`
ClusterAuthorityPublicKey string `json:"cluster_authority_public_key"`
}
func LoadDockerJoinBundle(path string) (DockerInstallProfile, error) {
var profile DockerInstallProfile
if err := loadJoinBundleProfile(path, "docker_install_profile", &profile); err != nil {
return DockerInstallProfile{}, err
}
if !strings.HasSuffix(url, "/node-agents/linux-install-profile") {
url += "/node-agents/linux-install-profile"
return profile, nil
}
func LoadWindowsJoinBundle(path string) (WindowsInstallProfile, error) {
var profile WindowsInstallProfile
if err := loadJoinBundleProfile(path, "windows_install_profile", &profile); err != nil {
return WindowsInstallProfile{}, err
}
body, err := json.Marshal(map[string]string{
"cluster_id": strings.TrimSpace(req.ClusterID),
"install_token": strings.TrimSpace(req.InstallToken),
"node_name": strings.TrimSpace(req.NodeName),
})
return profile, nil
}
// LoadLinuxJoinBundle loads the join bundle file at path and returns the
// profile stored under its "linux_install_profile" key. Reading, verification
// and decoding are delegated to loadJoinBundleProfile; on any failure the zero
// LinuxInstallProfile is returned together with that error.
func LoadLinuxJoinBundle(path string) (LinuxInstallProfile, error) {
	loaded := LinuxInstallProfile{}
	loadErr := loadJoinBundleProfile(path, "linux_install_profile", &loaded)
	if loadErr == nil {
		return loaded, nil
	}
	// Discard any partially decoded data so callers never see a half-filled profile.
	return LinuxInstallProfile{}, loadErr
}
func SaveJoinBundle(path string, raw []byte) error {
path = strings.TrimSpace(path)
if path == "" {
return fmt.Errorf("join-bundle path is required")
}
if err := os.MkdirAll(filepath.Dir(path), 0o700); err != nil {
return err
}
return os.WriteFile(path, raw, 0o600)
}
func loadJoinBundleProfile(path, key string, target any) error {
path = strings.TrimSpace(path)
if path == "" {
return fmt.Errorf("join-bundle is required")
}
payload, err := os.ReadFile(path)
if err != nil {
return LinuxInstallProfile{}, err
return err
}
httpClient := req.HTTPClient
if httpClient == nil {
httpClient = &http.Client{Timeout: 20 * time.Second}
_, err = parseJoinBundleProfileBytes(payload, key, target)
return err
}
func parseJoinBundleProfileBytes(payload []byte, key string, target any) ([]byte, error) {
var envelopeMap map[string]json.RawMessage
if err := json.Unmarshal(payload, &envelopeMap); err == nil {
profileRaw := envelopeMap[key]
if len(bytes.TrimSpace(profileRaw)) > 0 {
if err := verifyJoinBundleEnvelope(payload, key, profileRaw); err != nil {
return nil, err
}
if err := json.Unmarshal(profileRaw, target); err != nil {
return nil, err
}
return profileRaw, nil
}
}
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
return nil, fmt.Errorf("join bundle envelope is missing signed install profile payload")
}
func verifyJoinBundleEnvelope(payload []byte, profileKey string, profileRaw json.RawMessage) error {
var envelope joinBundleEnvelope
if err := json.Unmarshal(payload, &envelope); err != nil {
return fmt.Errorf("decode join bundle envelope: %w", err)
}
if envelope.ClusterAuthority == nil && len(bytes.TrimSpace(envelope.AuthorityPayload)) == 0 && envelope.AuthoritySignature == nil {
return fmt.Errorf("join bundle authority envelope is missing")
}
if envelope.ClusterAuthority == nil || len(bytes.TrimSpace(envelope.AuthorityPayload)) == 0 || envelope.AuthoritySignature == nil {
return fmt.Errorf("join bundle authority envelope is incomplete")
}
envelopeClusterID := strings.TrimSpace(envelope.ClusterID)
authorityClusterID := strings.TrimSpace(envelope.ClusterAuthority.ClusterID)
if envelopeClusterID == "" || authorityClusterID == "" || envelopeClusterID != authorityClusterID {
return fmt.Errorf("join bundle cluster identity is inconsistent")
}
signature := clusterauth.Signature{
SchemaVersion: envelope.AuthoritySignature.SchemaVersion,
Algorithm: envelope.AuthoritySignature.Algorithm,
KeyFingerprint: envelope.AuthoritySignature.KeyFingerprint,
Signature: envelope.AuthoritySignature.Signature,
}
if err := clusterauth.VerifyRaw(envelope.ClusterAuthority.PublicKey, envelope.AuthorityPayload, signature); err != nil {
return fmt.Errorf("verify join bundle authority signature: %w", err)
}
var signedProfiles map[string]json.RawMessage
if err := json.Unmarshal(envelope.AuthorityPayload, &signedProfiles); err != nil {
return fmt.Errorf("decode join bundle authority payload: %w", err)
}
signedProfileRaw := signedProfiles[profileKey]
if len(bytes.TrimSpace(signedProfileRaw)) == 0 {
return fmt.Errorf("join bundle authority payload missing %s", profileKey)
}
want, err := clusterauth.CanonicalJSON(signedProfileRaw)
if err != nil {
return LinuxInstallProfile{}, err
return err
}
httpReq.Header.Set("Content-Type", "application/json")
resp, err := httpClient.Do(httpReq)
got, err := clusterauth.CanonicalJSON(profileRaw)
if err != nil {
return LinuxInstallProfile{}, err
return err
}
defer resp.Body.Close()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return LinuxInstallProfile{}, fmt.Errorf("fetch linux install profile: %s", resp.Status)
if !bytes.Equal(want, got) {
return fmt.Errorf("join bundle profile does not match signed authority payload")
}
var envelope struct {
Profile LinuxInstallProfile `json:"linux_install_profile"`
authorityPublicKey := strings.TrimSpace(envelope.ClusterAuthority.PublicKey)
if authorityPublicKey == "" {
return fmt.Errorf("join bundle authority public key is empty")
}
if err := json.NewDecoder(resp.Body).Decode(&envelope); err != nil {
return LinuxInstallProfile{}, err
if fingerprint := strings.TrimSpace(envelope.ClusterAuthority.PublicKeyFingerprint); fingerprint != "" {
publicKey, err := base64.StdEncoding.DecodeString(authorityPublicKey)
if err != nil || len(publicKey) != ed25519.PublicKeySize {
return fmt.Errorf("join bundle authority public key is invalid")
}
if fingerprint != clusterauth.Fingerprint(ed25519.PublicKey(publicKey)) {
return fmt.Errorf("join bundle authority fingerprint does not match authority public key")
}
}
if strings.TrimSpace(envelope.Profile.BackendURL) == "" && len(envelope.Profile.ControlPlaneEndpoints) > 0 {
envelope.Profile.BackendURL = envelope.Profile.ControlPlaneEndpoints[0]
var identity joinBundleProfileIdentity
if err := json.Unmarshal(profileRaw, &identity); err != nil {
return fmt.Errorf("decode join bundle profile identity: %w", err)
}
return envelope.Profile, nil
if strings.TrimSpace(identity.ClusterID) == "" || strings.TrimSpace(identity.ClusterID) != envelopeClusterID {
return fmt.Errorf("join bundle profile cluster_id does not match signed bundle cluster_id")
}
if strings.TrimSpace(identity.ClusterAuthorityPublicKey) == "" || strings.TrimSpace(identity.ClusterAuthorityPublicKey) != authorityPublicKey {
return fmt.Errorf("join bundle profile authority key does not match signed bundle authority key")
}
return nil
}
func RuntimeConfigFromProfile(profile DockerInstallProfile) RuntimeConfig {
return RuntimeConfig{
BackendURL: profile.BackendURL,
ClusterID: profile.ClusterID,
ClusterAuthorityPublicKey: strings.TrimSpace(profile.ClusterAuthorityPublicKey),
JoinToken: profile.JoinToken,
NodeName: profile.NodeName,
Image: profile.Image,
@@ -300,7 +363,7 @@ func RuntimeConfigFromProfile(profile DockerInstallProfile) RuntimeConfig {
Replace: profile.Replace,
DockerVPNGatewayEnabled: profile.DockerVPNGatewayEnabled,
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
FabricRuntimeEnabled: profile.FabricRuntimeEnabled,
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
VPNFabricSessionTransportEnabled: profile.VPNFabricSessionTransportEnabled,
MeshQUICFabricEnabled: profile.MeshQUICFabricEnabled,
@@ -308,16 +371,19 @@ func RuntimeConfigFromProfile(profile DockerInstallProfile) RuntimeConfig {
VPNFabricSessionStreamShards: profile.VPNFabricSessionStreamShards,
VPNFabricQUICMaxStreamsPerConn: profile.VPNFabricQUICMaxStreamsPerConn,
VPNFabricQUICIdleTTLSeconds: profile.VPNFabricQUICIdleTTLSeconds,
MeshListenAddr: profile.MeshListenAddr,
MeshListenPortMode: profile.MeshListenPortMode,
MeshListenAutoPortStart: profile.MeshListenAutoPortStart,
MeshListenAutoPortEnd: profile.MeshListenAutoPortEnd,
FabricListenAddr: profile.FabricListenAddr,
FabricListenPortMode: profile.FabricListenPortMode,
FabricListenAutoPortStart: profile.FabricListenAutoPortStart,
FabricListenAutoPortEnd: profile.FabricListenAutoPortEnd,
MeshAdvertiseEndpoint: profile.MeshAdvertiseEndpoint,
MeshAdvertiseEndpointsJSON: string(profile.MeshAdvertiseEndpointsJSON),
FabricRegistryRecordsJSON: string(profile.FabricRegistryRecords),
MeshAdvertiseTransport: profile.MeshAdvertiseTransport,
MeshConnectivityMode: profile.MeshConnectivityMode,
MeshNATType: profile.MeshNATType,
MeshSiteID: profile.MeshSiteID,
MeshLocalityGroupID: firstNonEmpty(profile.MeshLocalityGroupID, profile.MeshSiteID),
MeshNATGroupID: profile.MeshNATGroupID,
MeshRegion: profile.MeshRegion,
HeartbeatIntervalSeconds: profile.HeartbeatIntervalSeconds,
EnrollmentPollIntervalSeconds: profile.EnrollmentPollIntervalSeconds,
@@ -2,6 +2,7 @@ package hostagent
import (
"context"
"encoding/json"
"errors"
"fmt"
"os"
@@ -10,7 +11,6 @@ import (
)
type HostAgentUpdateRequest struct {
BackendURL string
ClusterID string
NodeID string
StateDir string
@@ -40,7 +40,6 @@ type HostAgentUpdateLoopConfig struct {
func (req HostAgentUpdateRequest) updateRequest() UpdateRequest {
return UpdateRequest{
BackendURL: req.BackendURL,
ClusterID: req.ClusterID,
NodeID: req.NodeID,
StateDir: req.StateDir,
@@ -79,6 +78,7 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
}
if plan.Action != "update" {
if !req.DryRun {
_ = saveUpdatePlanState(resolved, plan, resolved.CurrentVersion, "host-agent-service", binaryPath)
status := statusFromNoopPlan(resolved, plan)
status.Product = HostAgentUpdateProduct
if status.Payload == nil {
@@ -102,7 +102,6 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
if req.DryRun {
return result, nil
}
urls := artifactURLsForBackend(*plan.Artifact, resolved.BackendURL)
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct,
CurrentVersion: resolved.CurrentVersion,
@@ -111,14 +110,24 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
Status: "started",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": binaryPath},
Payload: map[string]any{"artifact_id": plan.Artifact.ID, "binary_path": binaryPath, "transport": updateArtifactTransport(resolved, plan)},
})
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
path, distributors, err := downloadUpdateArtifact(ctx, resolved, plan)
if err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, statusFromError(resolved, plan, "download", "failed", err))
return result, err
}
defer os.Remove(path)
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct,
CurrentVersion: resolved.CurrentVersion,
TargetVersion: plan.TargetVersion,
Phase: "download",
Status: "succeeded",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"artifact_id": plan.Artifact.ID, "binary_path": binaryPath, "fabric_distributors": distributors, "transport": updateArtifactTransport(resolved, plan)},
})
if err := installHostAgentBinary(path, binaryPath); err != nil {
stageErr := stageHostAgentBinary(path, binaryPath)
if stageErr == nil {
@@ -129,7 +138,24 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
TargetVersion: plan.TargetVersion,
ContainerName: "host-agent-service",
Image: binaryPath,
UpdatedAt: time.Now().UTC(),
PlanAction: plan.Action,
PlanReason: plan.Reason,
UpdateIntent: plan.UpdateIntent,
RolloutLease: plan.RolloutLease,
AuthorityPayload: func() json.RawMessage {
if len(plan.AuthorityPayload) == 0 {
return nil
}
return append(json.RawMessage(nil), plan.AuthorityPayload...)
}(),
AuthoritySignature: func() json.RawMessage {
if len(plan.AuthoritySignature) == 0 {
return nil
}
return append(json.RawMessage(nil), plan.AuthoritySignature...)
}(),
AuthorityQuorum: plan.AuthorityQuorum,
UpdatedAt: time.Now().UTC(),
})
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct,
@@ -149,14 +175,7 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
result.Loaded = true
result.Replaced = true
result.RestartNeeded = true
_ = saveUpdateState(resolved.StateDir, UpdateState{
Product: HostAgentUpdateProduct,
CurrentVersion: plan.TargetVersion,
TargetVersion: plan.TargetVersion,
ContainerName: "host-agent-service",
Image: binaryPath,
UpdatedAt: time.Now().UTC(),
})
_ = saveUpdatePlanState(resolved, plan, plan.TargetVersion, "host-agent-service", binaryPath)
_ = ReportNodeUpdateStatusForRequest(ctx, resolved, NodeUpdateStatusRequest{
Product: HostAgentUpdateProduct,
CurrentVersion: resolved.CurrentVersion,
@@ -183,7 +202,7 @@ func (m DockerManager) ApplyHostAgentUpdate(ctx context.Context, req HostAgentUp
func (m DockerManager) RunHostAgentUpdateLoop(ctx context.Context, cfg HostAgentUpdateLoopConfig) error {
if cfg.Interval == 0 {
cfg.Interval = time.Hour
cfg.Interval = time.Duration(DefaultUpdateIntervalSec) * time.Second
}
if cfg.InitialDelay < 0 || cfg.Interval < 0 {
return errors.New("host-agent update loop durations must not be negative")
@@ -191,6 +210,9 @@ func (m DockerManager) RunHostAgentUpdateLoop(ctx context.Context, cfg HostAgent
if cfg.Jitter < 0 || cfg.Jitter > 1 {
return errors.New("host-agent update loop jitter must be between 0 and 1")
}
if err := ReconcileSignedUpdateState(cfg.Request.StateDir); err != nil {
return err
}
logf := cfg.Logf
if logf == nil {
logf = func(string, ...any) {}
@@ -202,6 +224,7 @@ func (m DockerManager) RunHostAgentUpdateLoop(ctx context.Context, cfg HostAgent
}
runs := 0
req := cfg.Request
lastTriggerGeneration := currentUpdateTriggerGeneration(req.StateDir)
for {
runs++
result, err := m.ApplyHostAgentUpdate(ctx, req)
@@ -210,6 +233,7 @@ func (m DockerManager) RunHostAgentUpdateLoop(ctx context.Context, cfg HostAgent
logf("host_agent_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, req.StateDir)
} else {
logf("host_agent_update_loop run=%d status=failed error=%v", runs, err)
saveUpdateLoopRescueState(req.updateRequest(), "host_agent_self_update_failed", err)
if cfg.StopOnError {
return err
}
@@ -231,7 +255,7 @@ func (m DockerManager) RunHostAgentUpdateLoop(ctx context.Context, cfg HostAgent
if cfg.MaxRuns > 0 && runs >= cfg.MaxRuns {
return nil
}
if err := sleepContext(ctx, jitteredDuration(cfg.Interval, cfg.Jitter)); err != nil {
if err := sleepUntilUpdateIntervalOrTrigger(ctx, req.StateDir, jitteredDuration(cfg.Interval, cfg.Jitter), &lastTriggerGeneration); err != nil {
return err
}
}
@@ -13,6 +13,7 @@ import (
const (
DefaultHostAgentInstallPath = "/usr/local/bin/rap-host-agent"
DefaultSystemdUnitDir = "/etc/systemd/system"
DefaultUpdateIntervalSec = 120
)
type UpdateServiceConfig struct {
@@ -62,7 +63,7 @@ func (m DockerManager) InstallUpdateService(ctx context.Context, cfg UpdateServi
cfg.Product = DefaultUpdateProduct
}
if cfg.IntervalSeconds == 0 {
cfg.IntervalSeconds = 21600
cfg.IntervalSeconds = DefaultUpdateIntervalSec
}
if cfg.Jitter == 0 {
cfg.Jitter = 0.15
@@ -173,8 +174,11 @@ func (m DockerManager) InstallUpdateService(ctx context.Context, cfg UpdateServi
func buildUpdateServiceUnit(cfg UpdateServiceConfig) (string, error) {
runtimeCfg := cfg.RuntimeConfig.Normalize()
var missing []string
if runtimeCfg.BackendURL == "" && runtimeCfg.FabricRegistryRecordsJSON == "" {
missing = append(missing, "backend-url-or-fabric-registry-records-json")
if runtimeCfg.FabricRegistryRecordsJSON == "" {
missing = append(missing, "fabric-registry-records-json")
}
if runtimeCfg.ClusterAuthorityPublicKey == "" {
missing = append(missing, "cluster-authority-public-key")
}
if runtimeCfg.ClusterID == "" {
missing = append(missing, "cluster-id")
@@ -201,13 +205,10 @@ func buildUpdateServiceUnit(cfg UpdateServiceConfig) (string, error) {
"--jitter", fmt.Sprintf("%.3f", cfg.Jitter),
"--health-timeout-seconds", fmt.Sprintf("%d", cfg.HealthTimeoutSec),
}
if runtimeCfg.BackendURL != "" {
args = append(args, "--backend-url", runtimeCfg.BackendURL)
}
if strings.TrimSpace(cfg.Channel) != "" {
args = append(args, "--channel", strings.TrimSpace(cfg.Channel))
}
args = appendFabricUpdateArgs(args, runtimeCfg)
args = appendFabricUpdateArgs(args, runtimeCfg, true)
execStart := systemdJoin(args)
return fmt.Sprintf(`[Unit]
Description=RAP host-agent updater for %s
@@ -228,8 +229,8 @@ WantedBy=multi-user.target
func buildHostAgentSelfUpdateUnit(cfg UpdateServiceConfig) (string, string, string, error) {
runtimeCfg := cfg.RuntimeConfig.Normalize()
if (runtimeCfg.BackendURL == "" && runtimeCfg.FabricRegistryRecordsJSON == "") || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
return "", "", "", fmt.Errorf("backend-url-or-fabric-registry-records-json, cluster-id, and state-dir are required for host-agent self updater")
if runtimeCfg.FabricRegistryRecordsJSON == "" || runtimeCfg.ClusterAuthorityPublicKey == "" || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
return "", "", "", fmt.Errorf("fabric-registry-records-json, cluster-authority-public-key, cluster-id, and state-dir are required for host-agent self updater")
}
unitName := "rap-host-agent-self-updater.service"
unitPath := filepath.Join(firstNonEmpty(cfg.UnitDir, DefaultSystemdUnitDir), unitName)
@@ -245,13 +246,10 @@ func buildHostAgentSelfUpdateUnit(cfg UpdateServiceConfig) (string, string, stri
"--initial-delay-seconds", fmt.Sprintf("%d", cfg.InitialDelaySeconds+30),
"--jitter", fmt.Sprintf("%.3f", cfg.Jitter),
}
if runtimeCfg.BackendURL != "" {
args = append(args, "--backend-url", runtimeCfg.BackendURL)
}
if strings.TrimSpace(cfg.Channel) != "" {
args = append(args, "--channel", strings.TrimSpace(cfg.Channel))
}
args = appendFabricUpdateArgs(args, runtimeCfg)
args = appendFabricUpdateArgs(args, runtimeCfg, true)
return fmt.Sprintf(`[Unit]
Description=RAP host-agent self updater
After=network-online.target docker.service
@@ -271,8 +269,8 @@ WantedBy=multi-user.target
func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string, error) {
runtimeCfg := cfg.RuntimeConfig.Normalize()
if (runtimeCfg.BackendURL == "" && runtimeCfg.FabricRegistryRecordsJSON == "") || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
return "", "", "", fmt.Errorf("backend-url-or-fabric-registry-records-json, cluster-id, and state-dir are required for host monitor")
if runtimeCfg.FabricRegistryRecordsJSON == "" || runtimeCfg.ClusterAuthorityPublicKey == "" || runtimeCfg.ClusterID == "" || runtimeCfg.StateDir == "" {
return "", "", "", fmt.Errorf("fabric-registry-records-json, cluster-authority-public-key, cluster-id, and state-dir are required for host monitor")
}
containers := uniqueTrimmed(append([]string{runtimeCfg.ContainerName}, cfg.MonitorContainers...))
if len(containers) == 0 {
@@ -291,9 +289,6 @@ func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string,
"--disk-cleanup-percent", fmt.Sprintf("%d", firstNonZero(cfg.MonitorDiskCleanup, DefaultMonitorDiskCleanupPercent)),
"--disk-critical-percent", fmt.Sprintf("%d", firstNonZero(cfg.MonitorDiskCritical, DefaultMonitorDiskCriticalPercent)),
}
if runtimeCfg.BackendURL != "" {
args = append(args, "--backend-url", runtimeCfg.BackendURL)
}
if cfg.MonitorCleanupDocker {
args = append(args, "--cleanup-docker")
}
@@ -303,7 +298,7 @@ func buildHostAgentMonitorUnit(cfg UpdateServiceConfig) (string, string, string,
for _, container := range containers {
args = append(args, "--watch-container", container)
}
args = appendFabricUpdateArgs(args, runtimeCfg)
args = appendFabricUpdateArgs(args, runtimeCfg, true)
return fmt.Sprintf(`[Unit]
Description=RAP host-agent monitor for %s
After=network-online.target docker.service
@@ -321,13 +316,25 @@ WantedBy=multi-user.target
`, runtimeCfg.ContainerName, systemdJoin(args)), unitName, unitPath, nil
}
func appendFabricUpdateArgs(args []string, runtimeCfg RuntimeConfig) []string {
if strings.TrimSpace(runtimeCfg.FabricRegistryRecordsJSON) != "" {
// appendFabricUpdateArgs appends "--flag value" pairs to args for every fabric
// related setting of runtimeCfg whose value is non-blank after trimming, and
// returns the extended slice. Flags are emitted in a fixed order; the
// registry-records flag is only considered when includeStructured is true.
func appendFabricUpdateArgs(args []string, runtimeCfg RuntimeConfig, includeStructured bool) []string {
	registryRecords := ""
	if includeStructured {
		registryRecords = runtimeCfg.FabricRegistryRecordsJSON
	}
	// Order matters: it determines the order of flags on the command line.
	candidates := []struct {
		flag  string
		value string
	}{
		{"--fabric-registry-records-json", registryRecords},
		{"--cluster-authority-public-key", runtimeCfg.ClusterAuthorityPublicKey},
		{"--mesh-region", runtimeCfg.MeshRegion},
		{"--mesh-site-id", runtimeCfg.MeshSiteID},
		{"--mesh-locality-group-id", runtimeCfg.MeshLocalityGroupID},
		{"--mesh-nat-group-id", runtimeCfg.MeshNATGroupID},
	}
	for _, candidate := range candidates {
		trimmed := strings.TrimSpace(candidate.value)
		if trimmed == "" {
			continue
		}
		args = append(args, candidate.flag, trimmed)
	}
	return args
}
@@ -18,11 +18,14 @@ func TestInstallUpdateServiceWritesSystemdUnit(t *testing.T) {
binaryPath := filepath.Join(dir, "bin", "rap-host-agent")
result, err := (DockerManager{}).InstallUpdateService(context.Background(), UpdateServiceConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
NodeName: "node-a",
ContainerName: "rap-node-agent-node-a",
StateDir: "/var/lib/rap/nodes/node-a",
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
NodeName: "node-a",
ContainerName: "rap-node-agent-node-a",
StateDir: "/var/lib/rap/nodes/node-a",
MeshSiteID: "home",
MeshLocalityGroupID: "home-lan",
},
CurrentVersion: "0.1.0-current",
IntervalSeconds: 60,
@@ -51,8 +54,11 @@ func TestInstallUpdateServiceWritesSystemdUnit(t *testing.T) {
for _, want := range []string{
"ExecStart=",
" update-loop",
"--backend-url http://control/api/v1",
`--fabric-registry-records-json "[{\"schema\":\"rap.fabric.registry.gossip_record.v1\",\"service_class\":\"control-api\"}]"`,
`--cluster-authority-public-key authority-key-b64`,
"--cluster-id cluster-1",
"--mesh-site-id home",
"--mesh-locality-group-id home-lan",
"--state-dir /var/lib/rap/nodes/node-a",
"--container-name rap-node-agent-node-a",
"--current-version 0.1.0-current",
@@ -76,6 +82,9 @@ func TestInstallUpdateServiceWritesSystemdUnit(t *testing.T) {
if text := string(selfUnit); !strings.Contains(text, "update-host-agent-loop") || !strings.Contains(text, "--current-version 0.1.0-host") {
t.Fatalf("unexpected self unit:\n%s", text)
}
if text := string(selfUnit); !strings.Contains(text, "--fabric-registry-records-json") {
t.Fatalf("unexpected self updater unit structured args:\n%s", text)
}
if result.MonitorUnitName == "" || result.MonitorUnitPath == "" {
t.Fatalf("monitor result = %+v", result)
}
@@ -95,13 +104,57 @@ func TestInstallUpdateServiceWritesSystemdUnit(t *testing.T) {
t.Fatalf("monitor unit missing %q:\n%s", want, monitorText)
}
}
if !strings.Contains(monitorText, "--fabric-registry-records-json") {
t.Fatalf("unexpected monitor unit structured args:\n%s", monitorText)
}
}
func TestInstallUpdateServiceDefaultsToRescuePollInterval(t *testing.T) {
dir := t.TempDir()
source := filepath.Join(dir, "rap-host-agent-src")
if err := os.WriteFile(source, []byte("binary"), 0o755); err != nil {
t.Fatalf("write source: %v", err)
}
result, err := (DockerManager{}).InstallUpdateService(context.Background(), UpdateServiceConfig{
RuntimeConfig: RuntimeConfig{
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
ContainerName: "rap-node-agent-node-a",
StateDir: "/var/lib/rap/nodes/node-a",
},
CurrentVersion: "0.1.0-current",
SourceBinaryPath: source,
BinaryInstallPath: filepath.Join(dir, "bin", "rap-host-agent"),
UnitDir: filepath.Join(dir, "systemd"),
ManageSystemd: false,
InstallSelfUpdater: true,
})
if err != nil {
t.Fatalf("install update service: %v", err)
}
unit, err := os.ReadFile(result.UnitPath)
if err != nil {
t.Fatalf("read update unit: %v", err)
}
if !strings.Contains(string(unit), "--interval-seconds 120") {
t.Fatalf("update unit should default to rescue poll interval:\n%s", unit)
}
selfUnit, err := os.ReadFile(result.SelfUnitPath)
if err != nil {
t.Fatalf("read self update unit: %v", err)
}
if !strings.Contains(string(selfUnit), "--interval-seconds 120") {
t.Fatalf("self update unit should default to rescue poll interval:\n%s", selfUnit)
}
}
func TestWindowsHostAgentUpdateScriptTargetsWindowsService(t *testing.T) {
cfg := WindowsInstallConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
},
NodeID: "node-1",
AutoUpdateCurrentVersion: "0.1.2",
@@ -117,10 +170,11 @@ func TestWindowsHostAgentUpdateScriptTargetsWindowsService(t *testing.T) {
}
script := windowsHostAgentUpdateScript(`C:\Program Files\RAP\win-a\rap-host-agent.exe`, cfg, result)
for _, want := range []string{
":loop",
"RAP_HOST_AGENT_UPDATE_LOCK_DIR",
"rap-host-agent.exe.next",
"update-loop --cluster-id",
"--backend-url \"http://control/api/v1\"",
"update-loop --max-runs 1 --cluster-id",
`--fabric-registry-records-json [{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
"--cluster-authority-public-key authority-key-b64",
"--cluster-id \"cluster-1\"",
"--node-id \"node-1\"",
"--state-dir \"C:\\ProgramData\\RAP\\nodes\\win-a\"",
@@ -131,7 +185,7 @@ func TestWindowsHostAgentUpdateScriptTargetsWindowsService(t *testing.T) {
"--current-version 0.1.2",
"--host-agent-current-version 0.1.2",
"--interval-seconds 120",
"timeout /t 120",
"wake-interval-seconds 120",
} {
if !strings.Contains(script, want) {
t.Fatalf("script missing %q:\n%s", want, script)
@@ -139,12 +193,12 @@ func TestWindowsHostAgentUpdateScriptTargetsWindowsService(t *testing.T) {
}
}
func TestWindowsHostAgentUpdateScriptOmitsEmptyBackendURL(t *testing.T) {
func TestWindowsHostAgentUpdateScriptIncludesFabricRegistry(t *testing.T) {
cfg := WindowsInstallConfig{
RuntimeConfig: RuntimeConfig{
ClusterID: "cluster-1",
FabricRegistryRecordsJSON: `[{"record_id":"r1"}]`,
MeshRegion: "ru-msk",
ClusterID: "cluster-1",
FabricRegistryRecordsJSON: `[{"record_id":"r1"}]`,
MeshRegion: "ru-msk",
},
AutoUpdateCurrentVersion: "0.1.2",
}
@@ -155,9 +209,6 @@ func TestWindowsHostAgentUpdateScriptOmitsEmptyBackendURL(t *testing.T) {
TaskName: "RAP Node Agent win-a",
}
script := windowsHostAgentUpdateScript(`C:\Program Files\RAP\win-a\rap-host-agent.exe`, cfg, result)
if strings.Contains(script, "--backend-url") {
t.Fatalf("script must not include backend-url when it is empty:\n%s", script)
}
for _, want := range []string{
`--fabric-registry-records-json [{"record_id":"r1"}]`,
"--mesh-region ru-msk",
@@ -171,9 +222,10 @@ func TestWindowsHostAgentUpdateScriptOmitsEmptyBackendURL(t *testing.T) {
func TestWindowsInstallReplaceAllowsExistingNodeWithoutJoinToken(t *testing.T) {
result, err := (WindowsManager{}).Install(context.Background(), WindowsInstallConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
NodeName: "win-a",
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
NodeName: "win-a",
},
InstallDir: `C:\Program Files\RAP\win-a`,
Replace: true,
@@ -202,8 +254,9 @@ func TestWindowsRepairUpdaterStartsFromUnknownVersion(t *testing.T) {
StartupMode: "user-task",
}, WindowsInstallConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: "http://control/api/v1",
ClusterID: "cluster-1",
ClusterID: "cluster-1",
ClusterAuthorityPublicKey: "authority-key-b64",
FabricRegistryRecordsJSON: `[{"schema":"rap.fabric.registry.gossip_record.v1","service_class":"control-api"}]`,
},
Replace: true,
AutoUpdateEnabled: true,
@@ -219,4 +272,57 @@ func TestWindowsRepairUpdaterStartsFromUnknownVersion(t *testing.T) {
if !strings.Contains(string(script), "--current-version 0.0.0") {
t.Fatalf("repair updater should force unknown current version:\n%s", script)
}
if !strings.Contains(string(script), "--max-runs 1") {
t.Fatalf("repair updater should run one-shot update-loop:\n%s", script)
}
if !strings.Contains(string(script), "RAP_HOST_AGENT_UPDATE_LOCK_DIR") {
t.Fatalf("repair updater should guard against overlapping runs:\n%s", script)
}
if !strings.Contains(string(script), "--interval-seconds 120") {
t.Fatalf("repair updater should use rescue poll interval:\n%s", script)
}
if !strings.Contains(string(script), "wake-interval-seconds 120") {
t.Fatalf("repair updater should document wake interval:\n%s", script)
}
}
// TestWindowsRepairUpdaterUsesRecurringScheduledTask installs the Windows
// host-agent updater with a 21600-second update interval and verifies that the
// recorded commands include a "schtasks /Create" call scheduling the task
// every 5 minutes ("/SC MINUTE" with "/MO 5").
func TestWindowsRepairUpdaterUsesRecurringScheduledTask(t *testing.T) {
	workDir := t.TempDir()
	hostAgentSource := filepath.Join(workDir, "rap-host-agent.exe")
	if err := os.WriteFile(hostAgentSource, []byte("binary"), 0o755); err != nil {
		t.Fatalf("write source: %v", err)
	}

	runner := &recordingRunner{}
	installed := WindowsInstallResult{
		NodeName:      "win-a",
		InstallDir:    workDir,
		StateDir:      workDir,
		NodeAgentPath: filepath.Join(workDir, "rap-node-agent.exe"),
		TaskName:      "RAP Node Agent win-a",
		StartupMode:   "user-task",
	}
	installCfg := WindowsInstallConfig{
		RuntimeConfig: RuntimeConfig{
			ClusterID: "cluster-1",
		},
		Replace:                   true,
		AutoUpdateEnabled:         true,
		AutoUpdateIntervalSeconds: 21600,
		HostAgentSourcePath:       hostAgentSource,
	}
	if _, err := installWindowsHostAgentUpdater(context.Background(), WindowsManager{Runner: runner}, installed, installCfg); err != nil {
		t.Fatalf("install updater: %v", err)
	}

	for _, call := range runner.calls {
		// Only sufficiently long "schtasks /Create" invocations are candidates.
		if len(call) < 8 || call[0] != "schtasks" || call[1] != "/Create" {
			continue
		}
		commandLine := strings.Join(call, " ")
		if strings.Contains(commandLine, "/SC MINUTE") && strings.Contains(commandLine, "/MO 5") {
			return
		}
	}
	t.Fatalf("expected recurring minute task, got %#v", runner.calls)
}
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -58,13 +58,14 @@ func WindowsInstallConfigFromProfile(profile WindowsInstallProfile) WindowsInsta
stateDir := firstNonEmpty(profile.StateDir, filepath.Join(DefaultWindowsStateRoot, safeUnitSlug(profile.NodeName)))
return WindowsInstallConfig{
RuntimeConfig: RuntimeConfig{
BackendURL: profile.BackendURL,
ClusterAuthorityPublicKey: strings.TrimSpace(profile.ClusterAuthorityPublicKey),
FabricRegistryRecordsJSON: strings.TrimSpace(string(profile.FabricRegistryRecords)),
ClusterID: profile.ClusterID,
JoinToken: profile.JoinToken,
NodeName: profile.NodeName,
StateDir: stateDir,
WorkloadSupervisionEnabled: profile.WorkloadSupervisionEnabled,
MeshSyntheticRuntimeEnabled: profile.MeshSyntheticRuntimeEnabled,
FabricRuntimeEnabled: profile.FabricRuntimeEnabled,
MeshProductionForwardingEnabled: profile.MeshProductionForwardingEnabled,
VPNFabricSessionTransportEnabled: profile.VPNFabricSessionTransportEnabled,
MeshQUICFabricEnabled: profile.MeshQUICFabricEnabled,
@@ -72,15 +73,18 @@ func WindowsInstallConfigFromProfile(profile WindowsInstallProfile) WindowsInsta
VPNFabricSessionStreamShards: profile.VPNFabricSessionStreamShards,
VPNFabricQUICMaxStreamsPerConn: profile.VPNFabricQUICMaxStreamsPerConn,
VPNFabricQUICIdleTTLSeconds: profile.VPNFabricQUICIdleTTLSeconds,
MeshListenAddr: profile.MeshListenAddr,
MeshListenPortMode: profile.MeshListenPortMode,
MeshListenAutoPortStart: profile.MeshListenAutoPortStart,
MeshListenAutoPortEnd: profile.MeshListenAutoPortEnd,
FabricListenAddr: profile.FabricListenAddr,
FabricListenPortMode: profile.FabricListenPortMode,
FabricListenAutoPortStart: profile.FabricListenAutoPortStart,
FabricListenAutoPortEnd: profile.FabricListenAutoPortEnd,
MeshAdvertiseEndpoint: profile.MeshAdvertiseEndpoint,
MeshAdvertiseEndpointsJSON: string(profile.MeshAdvertiseEndpointsJSON),
MeshAdvertiseTransport: profile.MeshAdvertiseTransport,
MeshConnectivityMode: profile.MeshConnectivityMode,
MeshNATType: profile.MeshNATType,
MeshSiteID: profile.MeshSiteID,
MeshLocalityGroupID: firstNonEmpty(profile.MeshLocalityGroupID, profile.MeshSiteID),
MeshNATGroupID: profile.MeshNATGroupID,
MeshRegion: profile.MeshRegion,
HeartbeatIntervalSeconds: profile.HeartbeatIntervalSeconds,
EnrollmentPollIntervalSeconds: profile.EnrollmentPollIntervalSeconds,
@@ -2,10 +2,12 @@ package hostagent
import (
"context"
"encoding/json"
"errors"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"time"
)
@@ -42,12 +44,22 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
}
if plan.Action != "update" {
if !req.DryRun {
restarted, err := rewriteWindowsControlPlaneRuntime(ctx, runner, m, req, plan)
if err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "rewrite_runtime", "failed", err))
return result, err
}
result.RestartNeeded = restarted
}
if !req.DryRun {
_ = saveUpdatePlanState(req, plan, req.CurrentVersion, req.WindowsTaskName, req.BinaryPath)
status := statusFromNoopPlan(req, plan)
if status.Payload == nil {
status.Payload = map[string]any{}
}
status.Payload["task"] = req.WindowsTaskName
status.Payload["binary_path"] = req.BinaryPath
status.Payload["restart_needed"] = result.RestartNeeded
_ = ReportNodeUpdateStatusForRequest(ctx, req, status)
}
return result, nil
@@ -78,9 +90,8 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
Status: "accepted",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"strategy": plan.Strategy, "reason": plan.Reason, "task": req.WindowsTaskName},
Payload: updatePlanStatusPayload(plan),
})
urls := artifactURLsForBackend(*plan.Artifact, req.BackendURL)
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
Product: req.Product,
CurrentVersion: req.CurrentVersion,
@@ -89,14 +100,24 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
Status: "started",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"artifact_url": plan.Artifact.URL, "artifact_urls": urls, "binary_path": req.BinaryPath},
Payload: map[string]any{"artifact_id": plan.Artifact.ID, "binary_path": req.BinaryPath, "transport": updateArtifactTransport(req, plan)},
})
path, err := downloadFirstArtifact(ctx, urls, plan.Artifact.SHA256, plan.Artifact.SizeBytes)
path, distributors, err := downloadUpdateArtifact(ctx, req, plan)
if err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "download", "failed", err))
return result, err
}
defer os.Remove(path)
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
Product: req.Product,
CurrentVersion: req.CurrentVersion,
TargetVersion: plan.TargetVersion,
Phase: "download",
Status: "succeeded",
AttemptID: updateAttemptID(plan),
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"artifact_id": plan.Artifact.ID, "binary_path": req.BinaryPath, "fabric_distributors": distributors, "transport": updateArtifactTransport(req, plan)},
})
m.stopExistingNodeAgent(ctx, req.WindowsTaskName, req.BinaryPath)
if err := copyFile(path, req.BinaryPath, 0o755); err != nil {
m.stopExistingNodeAgent(ctx, req.WindowsTaskName, req.BinaryPath)
@@ -106,10 +127,18 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
}
}
result.Replaced = true
if _, err := runner.Run(ctx, "schtasks", "/Run", "/TN", req.WindowsTaskName); err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "restart", "failed", err))
restartedByRewrite, err := rewriteWindowsControlPlaneRuntime(ctx, runner, m, req, plan)
if err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "rewrite_runtime", "failed", err))
return result, err
}
result.RestartNeeded = restartedByRewrite
if !restartedByRewrite {
if _, err := runner.Run(ctx, "schtasks", "/Run", "/TN", req.WindowsTaskName); err != nil {
_ = ReportNodeUpdateStatusForRequest(ctx, req, statusFromError(req, plan, "restart", "failed", err))
return result, err
}
}
_ = ReportNodeUpdateStatusForRequest(ctx, req, NodeUpdateStatusRequest{
Product: req.Product,
CurrentVersion: req.CurrentVersion,
@@ -120,16 +149,105 @@ func (m WindowsManager) ApplyUpdate(ctx context.Context, req UpdateRequest) (Upd
ObservedAt: time.Now().UTC(),
Payload: map[string]any{"task": req.WindowsTaskName, "binary_path": req.BinaryPath},
})
_ = saveUpdateState(req.StateDir, UpdateState{
Product: req.Product,
CurrentVersion: plan.TargetVersion,
TargetVersion: plan.TargetVersion,
Image: req.BinaryPath,
UpdatedAt: time.Now().UTC(),
})
_ = saveUpdatePlanState(req, plan, plan.TargetVersion, req.WindowsTaskName, req.BinaryPath)
return result, nil
}
// rewriteWindowsControlPlaneRuntime records the control-plane data carried by
// plan and pushes the plan's fabric registry records into the files that sit
// next to req.BinaryPath:
//
//   - rap-node-agent.env.cmd: the RAP_FABRIC_REGISTRY_RECORDS_JSON "set" line
//     is inserted or replaced (only when the file already exists);
//   - rap-host-agent-update.cmd: the --fabric-registry-records-json argument
//     is rewritten via replaceCLIArg (only when the file already exists).
//
// When at least one file was modified, the existing node agent is stopped and
// the scheduled task req.WindowsTaskName is started again.
//
// It returns true when a file changed and the task was re-run successfully,
// false when nothing had to change. Any read, write or task-start failure is
// returned with false.
func rewriteWindowsControlPlaneRuntime(ctx context.Context, runner CommandRunner, manager WindowsManager, req UpdateRequest, plan NodeUpdatePlan) (bool, error) {
	// Best effort: failing to persist the runtime state does not block the
	// rewrite of the launcher files, so the error is deliberately ignored.
	_ = saveControlPlaneRuntimeState(req.StateDir, ControlPlaneRuntimeState{
		SchemaVersion:         "rap.control_plane_runtime_state.v1",
		ClusterID:             strings.TrimSpace(plan.ClusterID),
		NodeID:                strings.TrimSpace(plan.NodeID),
		Product:               strings.TrimSpace(plan.Product),
		FabricRegistryRecords: append(json.RawMessage(nil), plan.FabricRegistryRecords...),
		AuthorityPayload:      append(json.RawMessage(nil), plan.AuthorityPayload...),
		AuthoritySignature:    append(json.RawMessage(nil), plan.AuthoritySignature...),
		AuthorityQuorum:       plan.AuthorityQuorum,
		UpdatedAt:             time.Now().UTC(),
	})

	// filepath.Dir never returns "": an empty path yields ".". The empty case
	// therefore has to be detected on the path itself, otherwise the files in
	// the current working directory would be rewritten.
	binaryPath := strings.TrimSpace(req.BinaryPath)
	if binaryPath == "" {
		return false, nil
	}
	installDir := filepath.Dir(binaryPath)

	envRegistry := strings.TrimSpace(string(plan.FabricRegistryRecords))
	if envRegistry == "" {
		// Nothing to propagate; leave both files untouched.
		return false, nil
	}

	changed := false

	envPath := filepath.Join(installDir, "rap-node-agent.env.cmd")
	if fileExists(envPath) {
		current, err := os.ReadFile(envPath)
		if err != nil {
			return false, err
		}
		updatedEnv := upsertWindowsEnvValue(string(current), "RAP_FABRIC_REGISTRY_RECORDS_JSON", envRegistry)
		if updatedEnv != string(current) {
			if err := os.WriteFile(envPath, []byte(updatedEnv), 0o600); err != nil {
				return false, err
			}
			changed = true
		}
	}

	wrapperPath := filepath.Join(installDir, "rap-host-agent-update.cmd")
	if fileExists(wrapperPath) {
		script, err := os.ReadFile(wrapperPath)
		if err != nil {
			return false, err
		}
		updated := replaceCLIArg(string(script), "--fabric-registry-records-json", envRegistry, true)
		if updated != string(script) {
			if err := os.WriteFile(wrapperPath, []byte(updated), 0o755); err != nil {
				return false, err
			}
			changed = true
		}
	}

	if !changed {
		return false, nil
	}

	// A file changed: stop the running agent and start the task again so the
	// new registry records are picked up.
	manager.stopExistingNodeAgent(ctx, req.WindowsTaskName, req.BinaryPath)
	if _, err := runner.Run(ctx, "schtasks", "/Run", "/TN", req.WindowsTaskName); err != nil {
		return false, err
	}
	return true, nil
}
// upsertWindowsEnvValue sets, replaces or removes a "set KEY=value" line in a
// Windows batch script and returns the resulting script text.
//
//   - If a line starting with "set KEY=" exists (matched case-insensitively,
//     ignoring surrounding whitespace), the first such line is replaced with
//     "set KEY=value", or removed entirely when value is empty.
//   - Otherwise, when value is non-empty, a new line is inserted directly
//     after the first "@echo off" line, or appended at the end when the script
//     has no such line.
//   - When value is empty and no matching line exists, script is returned
//     unchanged.
//
// Lines are split on "\n". A trailing "\r" on the replaced line (or on the
// "@echo off" line an insertion follows) is carried over to the written line,
// so CRLF scripts keep consistent line endings and re-applying an unchanged
// value returns the input byte-for-byte.
func upsertWindowsEnvValue(script string, key string, value string) string {
	prefix := "set " + key + "="
	lowerPrefix := strings.ToLower(prefix)
	lines := strings.Split(script, "\n")

	for i, line := range lines {
		content := strings.TrimRight(line, "\r")
		if !strings.HasPrefix(strings.ToLower(strings.TrimSpace(content)), lowerPrefix) {
			continue
		}
		if value == "" {
			lines = append(lines[:i], lines[i+1:]...)
		} else {
			// line[len(content):] is the original carriage-return suffix.
			lines[i] = prefix + value + line[len(content):]
		}
		return strings.Join(lines, "\n")
	}

	if value == "" {
		return script
	}

	insertAt := len(lines)
	lineEnding := ""
	for i, line := range lines {
		content := strings.TrimRight(line, "\r")
		if strings.EqualFold(strings.TrimSpace(content), "@echo off") {
			insertAt = i + 1
			lineEnding = line[len(content):]
			break
		}
	}

	entry := prefix + value
	if insertAt < len(lines) {
		// A following line exists, so Join supplies the "\n"; only the
		// carriage return (if the script uses one) has to be added here.
		entry += lineEnding
	}
	lines = append(lines[:insertAt], append([]string{entry}, lines[insertAt:]...)...)
	return strings.Join(lines, "\n")
}
func (m WindowsManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig) error {
req := cfg.Request
if strings.TrimSpace(req.InstallType) == "" || req.InstallType == DefaultUpdateInstallType {
@@ -141,6 +259,9 @@ func (m WindowsManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig)
if err := req.Validate(); err != nil {
return err
}
if err := ReconcileSignedUpdateState(req.StateDir); err != nil {
return err
}
if cfg.Interval == 0 {
cfg.Interval = time.Hour
}
@@ -179,6 +300,7 @@ func (m WindowsManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig)
continue
}
logf("windows_update_loop run=%d status=failed error=%v", runs, err)
saveUpdateLoopRescueState(req, "windows_node_agent_update_failed", err)
if cfg.StopOnError {
return err
}
@@ -197,10 +319,12 @@ func (m WindowsManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig)
}
if cfg.HostAgentUpdateEnabled {
hostReq := cfg.HostAgentUpdateRequest
hostReq.BackendURL = firstNonEmpty(hostReq.BackendURL, req.BackendURL)
hostReq.ClusterID = firstNonEmpty(hostReq.ClusterID, req.ClusterID)
hostReq.NodeID = firstNonEmpty(hostReq.NodeID, req.NodeID)
hostReq.StateDir = firstNonEmpty(hostReq.StateDir, req.StateDir)
hostReq.ClusterAuthorityPublicKey = firstNonEmpty(hostReq.ClusterAuthorityPublicKey, req.ClusterAuthorityPublicKey)
hostReq.FabricRegistryRecordsJSON = firstNonEmpty(hostReq.FabricRegistryRecordsJSON, req.FabricRegistryRecordsJSON)
hostReq.MeshRegion = firstNonEmpty(hostReq.MeshRegion, req.MeshRegion)
hostReq.Channel = firstNonEmpty(hostReq.Channel, req.Channel)
hostReq.OS = firstNonEmpty(hostReq.OS, "windows")
hostReq.Arch = firstNonEmpty(hostReq.Arch, "amd64")
@@ -211,6 +335,7 @@ func (m WindowsManager) RunUpdateLoop(ctx context.Context, cfg UpdateLoopConfig)
logf("windows_host_agent_update_loop run=%d status=waiting_for_node_identity state_dir=%s", runs, hostReq.StateDir)
} else {
logf("windows_host_agent_update_loop run=%d status=failed error=%v", runs, hostErr)
saveUpdateLoopRescueState(req, "windows_host_agent_update_failed", hostErr)
if cfg.StopOnError {
return hostErr
}
@@ -257,7 +382,7 @@ func installWindowsHostAgentUpdater(ctx context.Context, m WindowsManager, resul
if err := os.WriteFile(wrapperPath, []byte(script), 0o755); err != nil {
return result, err
}
started, fallback, mode, err := m.installStartupTask(ctx, taskName, wrapperPath, logPath, cfg.StartupMode)
started, fallback, mode, err := m.installRecurringUpdaterTask(ctx, taskName, wrapperPath, logPath, cfg.StartupMode, windowsUpdaterWakeIntervalSeconds(cfg.AutoUpdateIntervalSeconds))
if err != nil {
return result, err
}
@@ -277,7 +402,7 @@ func windowsHostAgentUpdateScript(hostAgentPath string, cfg WindowsInstallConfig
currentVersion := firstNonEmpty(cfg.AutoUpdateCurrentVersion, "0.0.0")
interval := cfg.AutoUpdateIntervalSeconds
if interval == 0 {
interval = 21600
interval = DefaultUpdateIntervalSec
}
initialDelay := cfg.AutoUpdateInitialDelaySeconds
if initialDelay == 0 {
@@ -290,6 +415,7 @@ func windowsHostAgentUpdateScript(hostAgentPath string, cfg WindowsInstallConfig
updateLoopArgs := []string{
`"` + hostAgentPath + `"`,
"update-loop",
"--max-runs", "1",
"--cluster-id", `"` + cfg.RuntimeConfig.ClusterID + `"`,
"--state-dir", `"` + result.StateDir + `"`,
"--current-version", currentVersion,
@@ -305,10 +431,7 @@ func windowsHostAgentUpdateScript(hostAgentPath string, cfg WindowsInstallConfig
"--host-agent-current-version", currentVersion,
"--host-agent-binary-path", `"` + hostAgentPath + `"`,
}
if strings.TrimSpace(cfg.RuntimeConfig.BackendURL) != "" {
updateLoopArgs = append(updateLoopArgs, "--backend-url", `"`+strings.TrimSpace(cfg.RuntimeConfig.BackendURL)+`"`)
}
updateLoopArgs = appendFabricUpdateArgs(updateLoopArgs, cfg.RuntimeConfig)
updateLoopArgs = appendFabricUpdateArgs(updateLoopArgs, cfg.RuntimeConfig, true)
if strings.TrimSpace(cfg.NodeID) != "" {
updateLoopArgs = append(updateLoopArgs, "--node-id", `"`+strings.TrimSpace(cfg.NodeID)+`"`)
}
@@ -320,21 +443,70 @@ func windowsHostAgentUpdateScript(hostAgentPath string, cfg WindowsInstallConfig
"setlocal",
"set RAP_HOST_AGENT=" + `"` + hostAgentPath + `"`,
"set RAP_HOST_AGENT_NEXT=" + `"` + hostAgentPath + `.next"`,
}
if initialDelay > 0 {
lines = append(lines, "timeout /t "+fmt.Sprintf("%d", initialDelay)+" /nobreak >NUL")
"set RAP_HOST_AGENT_UPDATE_LOCK_DIR=" + `"` + filepath.Join(result.StateDir, "rap-host-agent-update.lock") + `"`,
}
lines = append(lines, []string{
":loop",
"2>nul mkdir %RAP_HOST_AGENT_UPDATE_LOCK_DIR%",
"if errorlevel 1 goto :eof",
"if exist %RAP_HOST_AGENT_NEXT% (",
" copy /Y %RAP_HOST_AGENT_NEXT% %RAP_HOST_AGENT% >NUL",
" if %ERRORLEVEL% EQU 0 del /F /Q %RAP_HOST_AGENT_NEXT%",
")",
}...)
if initialDelay > 0 {
lines = append(lines, "timeout /t "+fmt.Sprintf("%d", initialDelay)+" /nobreak >NUL")
}
lines = append(lines, []string{
strings.Join(updateLoopArgs, " "),
"timeout /t " + fmt.Sprintf("%d", interval) + " /nobreak >NUL",
"goto loop",
"endlocal",
"set RAP_HOST_AGENT_UPDATE_EXIT_CODE=%ERRORLEVEL%",
"rmdir /S /Q %RAP_HOST_AGENT_UPDATE_LOCK_DIR% >NUL 2>&1",
"endlocal & exit /b %RAP_HOST_AGENT_UPDATE_EXIT_CODE%",
"rem initial-delay-seconds " + fmt.Sprintf("%d", initialDelay),
"rem wake-interval-seconds " + strconv.Itoa(windowsUpdaterWakeIntervalSeconds(interval)),
}...)
return strings.Join(lines, "\r\n") + "\r\n"
}
// windowsUpdaterWakeIntervalSeconds returns how often, in seconds, the
// updater task should wake up. The configured interval is used as-is when it
// lies in 1..300; non-positive values and values above 300 both yield 300.
func windowsUpdaterWakeIntervalSeconds(intervalSeconds int) int {
	const maxWakeSeconds = 300
	if intervalSeconds <= 0 || intervalSeconds > maxWakeSeconds {
		return maxWakeSeconds
	}
	return intervalSeconds
}
// installRecurringUpdaterTask registers taskName as a scheduled task that
// runs the updater wrapper every N minutes, where N is intervalSeconds rounded
// up to whole minutes (minimum 1), and then triggers it once.
//
// mode selects how the task is created:
//   - "none" (any case): nothing is created;
//   - "system-task": created to run as SYSTEM with highest privileges;
//   - "auto": SYSTEM task first, falling back to a task without /RU on failure;
//   - anything else: created without /RU.
//
// The results are, in order: whether the task was created, whether the "auto"
// fallback path was taken, the effective mode, and the creation error if any.
// Errors from the immediate "schtasks /Run" are ignored.
func (m WindowsManager) installRecurringUpdaterTask(ctx context.Context, taskName, wrapperPath, logPath, mode string, intervalSeconds int) (bool, bool, string, error) {
	if strings.EqualFold(mode, "none") {
		return false, false, mode, nil
	}

	runner := m.Runner
	if runner == nil {
		runner = ExecRunner{}
	}

	// Round the interval up to whole minutes, never below one minute.
	minutes := intervalSeconds / 60
	if intervalSeconds%60 != 0 {
		minutes++
	}
	if minutes <= 0 {
		minutes = 1
	}
	every := strconv.Itoa(minutes)
	action := windowsTaskAction(wrapperPath, logPath)
	autoMode := mode == "auto"

	if autoMode || mode == "system-task" {
		_, err := runner.Run(ctx, "schtasks", "/Create", "/TN", taskName, "/SC", "MINUTE", "/MO", every, "/RU", "SYSTEM", "/RL", "HIGHEST", "/TR", action, "/F")
		switch {
		case err == nil:
			_, _ = runner.Run(ctx, "schtasks", "/Run", "/TN", taskName)
			return true, false, "system-task", nil
		case !autoMode:
			// An explicit "system-task" request does not fall back.
			return false, false, mode, err
		}
	}

	if _, err := runner.Run(ctx, "schtasks", "/Create", "/TN", taskName, "/SC", "MINUTE", "/MO", every, "/TR", action, "/F"); err != nil {
		return false, autoMode, "user-task", err
	}
	_, _ = runner.Run(ctx, "schtasks", "/Run", "/TN", taskName)
	return true, autoMode, "user-task", nil
}
@@ -1,111 +0,0 @@
package mesh
import (
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"time"
)
// Client sends mesh messages to a peer as JSON over HTTP POST.
type Client struct {
// BaseURL is prepended, unmodified, to the endpoint path of every request
// (for example BaseURL + "/mesh/v1/health").
BaseURL string
// HTTPClient performs the requests. When it is nil the Send* methods fall
// back to http.DefaultClient.
HTTPClient *http.Client
}
// NewClient returns a Client for baseURL whose HTTP client uses a
// five-second request timeout.
func NewClient(baseURL string) Client {
	httpClient := &http.Client{Timeout: 5 * time.Second}
	return Client{BaseURL: baseURL, HTTPClient: httpClient}
}
// SendHealth POSTs message as JSON to BaseURL + "/mesh/v1/health" and decodes
// the JSON response into a HealthAck.
//
// A zero HealthAck and an error are returned when encoding, building or
// sending the request fails, when the response status is outside 2xx, or when
// the response body cannot be decoded.
func (c Client) SendHealth(ctx context.Context, message HealthMessage) (HealthAck, error) {
	var none HealthAck

	body, err := json.Marshal(message)
	if err != nil {
		return none, err
	}
	request, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/mesh/v1/health", bytes.NewReader(body))
	if err != nil {
		return none, err
	}
	request.Header.Set("Content-Type", "application/json")

	// Fall back to the default client when none was configured.
	doer := http.DefaultClient
	if c.HTTPClient != nil {
		doer = c.HTTPClient
	}
	response, err := doer.Do(request)
	if err != nil {
		return none, err
	}
	defer response.Body.Close()

	if code := response.StatusCode; code < 200 || code >= 300 {
		return none, fmt.Errorf("mesh health rejected with status %d", code)
	}
	var ack HealthAck
	if err := json.NewDecoder(response.Body).Decode(&ack); err != nil {
		return none, err
	}
	return ack, nil
}
// SendSynthetic POSTs envelope as JSON to BaseURL + "/mesh/v1/synthetic/probe"
// and decodes the JSON response into a SyntheticEnvelope.
//
// A zero SyntheticEnvelope and an error are returned when encoding, building
// or sending the request fails, when the response status is outside 2xx, or
// when the response body cannot be decoded.
func (c Client) SendSynthetic(ctx context.Context, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
	var none SyntheticEnvelope

	body, err := json.Marshal(envelope)
	if err != nil {
		return none, err
	}
	request, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/mesh/v1/synthetic/probe", bytes.NewReader(body))
	if err != nil {
		return none, err
	}
	request.Header.Set("Content-Type", "application/json")

	// Fall back to the default client when none was configured.
	doer := http.DefaultClient
	if c.HTTPClient != nil {
		doer = c.HTTPClient
	}
	response, err := doer.Do(request)
	if err != nil {
		return none, err
	}
	defer response.Body.Close()

	if code := response.StatusCode; code < 200 || code >= 300 {
		return none, fmt.Errorf("mesh synthetic probe rejected with status %d", code)
	}
	var reply SyntheticEnvelope
	if err := json.NewDecoder(response.Body).Decode(&reply); err != nil {
		return none, err
	}
	return reply, nil
}
// SendProduction POSTs envelope as JSON to BaseURL + "/mesh/v1/forward" and
// decodes the JSON response into a ProductionForwardResult.
//
// A zero ProductionForwardResult and an error are returned when encoding,
// building or sending the request fails, when the response status is outside
// 2xx, or when the response body cannot be decoded.
func (c Client) SendProduction(ctx context.Context, envelope ProductionEnvelope) (ProductionForwardResult, error) {
	var none ProductionForwardResult

	body, err := json.Marshal(envelope)
	if err != nil {
		return none, err
	}
	request, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/mesh/v1/forward", bytes.NewReader(body))
	if err != nil {
		return none, err
	}
	request.Header.Set("Content-Type", "application/json")

	// Fall back to the default client when none was configured.
	doer := http.DefaultClient
	if c.HTTPClient != nil {
		doer = c.HTTPClient
	}
	response, err := doer.Do(request)
	if err != nil {
		return none, err
	}
	defer response.Body.Close()

	if code := response.StatusCode; code < 200 || code >= 300 {
		return none, fmt.Errorf("mesh production forward rejected with status %d", code)
	}
	var forwarded ProductionForwardResult
	if err := json.NewDecoder(response.Body).Decode(&forwarded); err != nil {
		return none, err
	}
	return forwarded, nil
}
@@ -70,7 +70,7 @@ const (
FabricServiceChannelReliable = "reliable"
FabricServiceChannelDroppable = "droppable"
MaxProductionEnvelopePayloadBytes = 4096
MaxProductionVPNPacketPayloadBytes = 256 * 1024
MaxProductionVPNPacketPayloadBytes = 8 * 1024 * 1024
MaxProductionEnvelopeFutureSkew = time.Minute
ProductionForwardQUICStreamID = 1
WebIngressForwardQUICStreamID = 2
@@ -203,22 +203,6 @@ type SyntheticRelayQueueMetrics struct {
QueueDepths map[string]int `json:"queue_depths"`
}
// HealthMessage is the JSON body that Client.SendHealth posts to the
// "/mesh/v1/health" endpoint.
type HealthMessage struct {
ProtocolVersion string `json:"protocol_version"`
From PeerIdentity `json:"from"`
To PeerIdentity `json:"to"`
ObservedAt time.Time `json:"observed_at"`
LinkStatus string `json:"link_status"`
// LatencyMs and QualityScore are optional: a nil pointer is left out of the
// encoded JSON (omitempty).
LatencyMs *int `json:"latency_ms,omitempty"`
QualityScore *int `json:"quality_score,omitempty"`
}
// HealthAck is the JSON response that Client.SendHealth decodes after a
// successful (2xx) health request.
type HealthAck struct {
ProtocolVersion string `json:"protocol_version"`
Accepted bool `json:"accepted"`
By PeerIdentity `json:"by"`
}
type ProductionEnvelope struct {
FabricProtocolVersion string `json:"fabric_protocol_version"`
MessageID string `json:"message_id"`
@@ -1,6 +1,7 @@
package mesh
import (
"encoding/json"
"sort"
"strings"
"time"
@@ -9,6 +10,9 @@ import (
type EndpointCandidateScoreOptions struct {
ChannelClass string
PreferredRegion string
SiteID string
LocalityGroupID string
LocalNATGroupID string
Now time.Time
MaxVerificationAge time.Duration
Observations map[string]EndpointCandidateHealthObservation
@@ -21,6 +25,7 @@ type EndpointCandidateHealthObservation struct {
EndpointID string `json:"endpoint_id"`
Source string `json:"source,omitempty"`
ReporterNodeID string `json:"reporter_node_id,omitempty"`
ReporterRegion string `json:"reporter_region,omitempty"`
LastLatencyMs int64 `json:"last_latency_ms,omitempty"`
SuccessCount uint64 `json:"success_count,omitempty"`
FailureCount uint64 `json:"failure_count,omitempty"`
@@ -114,6 +119,9 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
case "direct":
score += 30
reasons = append(reasons, "connectivity:direct")
case "private_lan":
score += 36
reasons = append(reasons, "connectivity:private_lan")
case "outbound_only":
score += 5
reasons = append(reasons, "connectivity:outbound_only")
@@ -167,6 +175,7 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
score += 18
reasons = append(reasons, "policy:private-lan")
}
score, reasons = applyLocalityPreferences(candidate, opts, score, reasons)
if hasPolicyTag(candidate.PolicyTags, "costly") {
score -= 10
reasons = append(reasons, "policy:costly")
@@ -193,7 +202,7 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
}
}
if observation, ok := opts.Observations[candidate.EndpointID]; ok {
observationScore, observationReasons := scoreEndpointCandidateObservation(observation, opts)
observationScore, observationReasons := scoreEndpointCandidateObservation(candidate, observation, opts)
score += observationScore
reasons = append(reasons, observationReasons...)
}
@@ -225,7 +234,7 @@ func scoreEndpointCandidateCapacityPressure(pressure EndpointCandidateCapacityPr
return -penalty, []string{"capacity:pressure"}
}
func scoreEndpointCandidateObservation(observation EndpointCandidateHealthObservation, opts EndpointCandidateScoreOptions) (int, []string) {
func scoreEndpointCandidateObservation(candidate PeerEndpointCandidate, observation EndpointCandidateHealthObservation, opts EndpointCandidateScoreOptions) (int, []string) {
score := 0
reasons := []string{"observation:present"}
if !opts.Now.IsZero() && !observation.ObservedAt.IsZero() && opts.MaxObservationAge > 0 {
@@ -236,6 +245,18 @@ func scoreEndpointCandidateObservation(observation EndpointCandidateHealthObserv
score += 6
reasons = append(reasons, "observation:fresh")
}
observationScope := endpointCandidateObservationScope(candidate, observation, opts)
if observationScope != "" {
reasons = append(reasons, "observation_scope:"+observationScope)
}
if endpointRequiresExternalNetworkVerification(candidate) && (observationScope == "self" || observationScope == "same_area") {
reasons = append(reasons, "observation:non_authoritative_same_area_public")
if strings.TrimSpace(observation.LastFailureReason) == "capacity_limited" {
score -= 4
reasons = append(reasons, "capacity:limited")
}
return score, reasons
}
switch {
case observation.LastLatencyMs > 0 && observation.LastLatencyMs <= 50:
score += 24
@@ -286,6 +307,118 @@ func scoreEndpointCandidateObservation(observation EndpointCandidateHealthObserv
return score, reasons
}
// endpointCandidateObservationScope classifies where an observation was made
// relative to the candidate it describes. It returns:
//
//   - "self" when the reporter node ID equals the candidate node ID
//     (case-insensitive, both non-empty);
//   - "same_area" when the reporter region equals the candidate region;
//   - "cross_area" when both regions are known and differ;
//   - "" when either region is unknown.
//
// When the observation has no reporter region and its source is
// "local_vpn_fabric_session", opts.PreferredRegion is used as the reporter
// region.
func endpointCandidateObservationScope(candidate PeerEndpointCandidate, observation EndpointCandidateHealthObservation, opts EndpointCandidateScoreOptions) string {
	reporterNodeID := strings.TrimSpace(observation.ReporterNodeID)
	candidateNodeID := strings.TrimSpace(candidate.NodeID)
	if reporterNodeID != "" && candidateNodeID != "" && strings.EqualFold(reporterNodeID, candidateNodeID) {
		return "self"
	}

	reporterRegion := strings.TrimSpace(observation.ReporterRegion)
	if reporterRegion == "" {
		source := strings.TrimSpace(observation.Source)
		if strings.EqualFold(source, "local_vpn_fabric_session") {
			reporterRegion = strings.TrimSpace(opts.PreferredRegion)
		}
	}
	candidateRegion := strings.TrimSpace(candidate.Region)

	switch {
	case reporterRegion == "" || candidateRegion == "":
		return ""
	case strings.EqualFold(reporterRegion, candidateRegion):
		return "same_area"
	default:
		return "cross_area"
	}
}
// endpointRequiresExternalNetworkVerification reports whether candidate is a
// "public" endpoint whose JSON metadata sets "verification_scope" to
// "external-network-required" (compared case-insensitively after trimming).
// Missing, invalid or undecodable metadata yields false.
func endpointRequiresExternalNetworkVerification(candidate PeerEndpointCandidate) bool {
	isPublic := strings.EqualFold(strings.TrimSpace(candidate.Reachability), "public")
	if !isPublic || len(candidate.Metadata) == 0 || !json.Valid(candidate.Metadata) {
		return false
	}

	var decoded struct {
		VerificationScope string `json:"verification_scope,omitempty"`
	}
	if json.Unmarshal(candidate.Metadata, &decoded) != nil {
		return false
	}
	scope := strings.TrimSpace(decoded.VerificationScope)
	return strings.EqualFold(scope, "external-network-required")
}
// applyLocalityPreferences adjusts score according to the locality class
// computed by endpointCandidateLocality and appends a "locality:<class>"
// reason. Unrecognised classes (including "") leave score and reasons as they
// were.
//
// Adjustments: local_segment +65, same_nat +45, private_scoped +20,
// private_unscoped -35, private_foreign -90, public_fallback -5.
func applyLocalityPreferences(candidate PeerEndpointCandidate, opts EndpointCandidateScoreOptions, score int, reasons []string) (int, []string) {
	locality := endpointCandidateLocality(candidate, opts)

	var adjustment int
	switch locality {
	case "local_segment":
		adjustment = 65
	case "same_nat":
		adjustment = 45
	case "private_scoped":
		adjustment = 20
	case "private_unscoped":
		adjustment = -35
	case "private_foreign":
		adjustment = -90
	case "public_fallback":
		adjustment = -5
	default:
		return score, reasons
	}
	return score + adjustment, append(reasons, "locality:"+locality)
}
// endpointCandidateLocality assigns a locality tier to a candidate.
//
// A candidate counts as private when its reachability is "private", its
// connectivity mode is "private_lan", or endpointHasPrivateHost accepts its
// address. Non-private candidates return "public_fallback" when they are
// public and require external-network verification, otherwise "".
//
// Private candidates are matched, in order, against the local scope in opts:
// locality group ("local_segment"), NAT group ("same_nat"), site
// ("private_scoped"). Failing those, a private-lan / corp-lan / same-site
// policy tag also yields "private_scoped". A candidate that advertises any
// scope identifier without matching is "private_foreign"; one with no scope
// information at all is "private_unscoped".
func endpointCandidateLocality(candidate PeerEndpointCandidate, opts EndpointCandidateScoreOptions) string {
	reach := strings.ToLower(strings.TrimSpace(candidate.Reachability))
	mode := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))

	if reach != "private" && mode != "private_lan" && !endpointHasPrivateHost(candidate.Address) {
		if reach == "public" && endpointRequiresExternalNetworkVerification(candidate) {
			return "public_fallback"
		}
		return ""
	}

	scope := decodeEndpointCandidateLocalityMetadata(candidate.Metadata)

	// sameScope is true only when both identifiers are non-blank and equal
	// ignoring case and surrounding whitespace.
	sameScope := func(local, advertised string) bool {
		local = strings.TrimSpace(local)
		advertised = strings.TrimSpace(advertised)
		return local != "" && advertised != "" && strings.EqualFold(advertised, local)
	}

	switch {
	case sameScope(opts.LocalityGroupID, scope.LocalityGroupID):
		return "local_segment"
	case sameScope(opts.LocalNATGroupID, scope.NATGroupID):
		return "same_nat"
	case sameScope(opts.SiteID, scope.SiteID):
		return "private_scoped"
	}

	for _, tag := range []string{"private-lan", "corp-lan", "same-site"} {
		if hasPolicyTag(candidate.PolicyTags, tag) {
			return "private_scoped"
		}
	}

	advertisesScope := scope.LocalityGroupID != "" || scope.SiteID != "" || scope.NATGroupID != ""
	if advertisesScope {
		return "private_foreign"
	}
	return "private_unscoped"
}
// endpointCandidateLocalityMetadata is the subset of a candidate's JSON
// metadata that is decoded for locality classification.
type endpointCandidateLocalityMetadata struct {
// SiteID is read from the "site_id" key.
SiteID string `json:"site_id,omitempty"`
// LocalityGroupID is read from the "locality_group_id" key.
LocalityGroupID string `json:"locality_group_id,omitempty"`
// NATGroupID is read from the "nat_group_id" key.
NATGroupID string `json:"nat_group_id,omitempty"`
}
// decodeEndpointCandidateLocalityMetadata extracts the locality identifiers
// from raw candidate metadata. Empty, invalid or undecodable input produces
// the zero value; on success every identifier is whitespace-trimmed.
func decodeEndpointCandidateLocalityMetadata(raw json.RawMessage) endpointCandidateLocalityMetadata {
	var decoded endpointCandidateLocalityMetadata
	if len(raw) == 0 || !json.Valid(raw) {
		return decoded
	}
	if err := json.Unmarshal(raw, &decoded); err != nil {
		// Discard anything partially filled in before the failure.
		return endpointCandidateLocalityMetadata{}
	}
	for _, field := range []*string{&decoded.SiteID, &decoded.LocalityGroupID, &decoded.NATGroupID} {
		*field = strings.TrimSpace(*field)
	}
	return decoded
}
func hasPolicyTag(tags []string, needle string) bool {
for _, tag := range tags {
if strings.EqualFold(strings.TrimSpace(tag), needle) {
@@ -1,6 +1,7 @@
package mesh
import (
"encoding/json"
"testing"
"time"
)
@@ -526,6 +527,161 @@ func TestRankPeerEndpointCandidatesSpreadsFreshCapacityPressure(t *testing.T) {
}
}
// TestRankPeerEndpointCandidatesIgnoresSameAreaPublicVerificationFailures
// verifies that failures reported from the candidate's own region against a
// public endpoint requiring external-network verification are marked
// non-authoritative and do not add failure reasons.
func TestRankPeerEndpointCandidatesIgnoresSameAreaPublicVerificationFailures(t *testing.T) {
	now := time.Date(2026, 5, 19, 12, 0, 0, 0, time.UTC)

	publicEndpoint := PeerEndpointCandidate{
		EndpointID:       "test-1-public",
		NodeID:           "test-1",
		Transport:        "direct_quic",
		Address:          "quic://94.141.118.222:19191",
		Reachability:     "public",
		NATType:          "port_restricted",
		ConnectivityMode: "direct",
		Region:           "home-test",
		Priority:         2,
		Metadata:         json.RawMessage(`{"verification_scope":"external-network-required"}`),
	}
	// Reporter sits in the same region as the candidate.
	sameAreaFailure := EndpointCandidateHealthObservation{
		EndpointID:        "test-1-public",
		ReporterNodeID:    "home-1",
		ReporterRegion:    "home-test",
		FailureCount:      4,
		LastFailureReason: "context_deadline_exceeded",
		ReliabilityScore:  20,
		ObservedAt:        now,
	}
	scoring := EndpointCandidateScoreOptions{
		PreferredRegion:   "home-test",
		Now:               now,
		MaxObservationAge: time.Minute,
		Observations: map[string]EndpointCandidateHealthObservation{
			"test-1-public": sameAreaFailure,
		},
	}

	ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{publicEndpoint}, scoring)
	if got := len(ranked); got != 1 {
		t.Fatalf("ranked length = %d, want 1", got)
	}

	reasons := ranked[0].Reasons
	if !containsReason(reasons, "observation:non_authoritative_same_area_public") {
		t.Fatalf("same-area public observation should be non-authoritative: %+v", reasons)
	}
	for _, demotion := range []string{"history:failure", "failure:recent"} {
		if containsReason(reasons, demotion) {
			t.Fatalf("same-area public failures should not demote candidate: %+v", reasons)
		}
	}
}
// TestRankPeerEndpointCandidatesUsesCrossAreaPublicVerificationFailures
// verifies that failures reported from a different region than the
// candidate's are scoped as cross-area and do add failure reasons.
func TestRankPeerEndpointCandidatesUsesCrossAreaPublicVerificationFailures(t *testing.T) {
	now := time.Date(2026, 5, 19, 12, 0, 0, 0, time.UTC)

	publicEndpoint := PeerEndpointCandidate{
		EndpointID:       "test-1-public",
		NodeID:           "test-1",
		Transport:        "direct_quic",
		Address:          "quic://94.141.118.222:19191",
		Reachability:     "public",
		NATType:          "port_restricted",
		ConnectivityMode: "direct",
		Region:           "home-test",
		Priority:         2,
		Metadata:         json.RawMessage(`{"verification_scope":"external-network-required"}`),
	}
	// Reporter sits in a different region than the candidate.
	crossAreaFailure := EndpointCandidateHealthObservation{
		EndpointID:        "test-1-public",
		ReporterNodeID:    "usa-los-1",
		ReporterRegion:    "usa",
		FailureCount:      4,
		LastFailureReason: "context_deadline_exceeded",
		ReliabilityScore:  20,
		ObservedAt:        now,
	}
	scoring := EndpointCandidateScoreOptions{
		PreferredRegion:   "usa",
		Now:               now,
		MaxObservationAge: time.Minute,
		Observations: map[string]EndpointCandidateHealthObservation{
			"test-1-public": crossAreaFailure,
		},
	}

	ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{publicEndpoint}, scoring)
	if got := len(ranked); got != 1 {
		t.Fatalf("ranked length = %d, want 1", got)
	}

	reasons := ranked[0].Reasons
	if !containsReason(reasons, "observation_scope:cross_area") {
		t.Fatalf("cross-area scope missing: %+v", reasons)
	}
	for _, demotion := range []string{"history:failure", "failure:recent"} {
		if !containsReason(reasons, demotion) {
			t.Fatalf("cross-area public failures should demote candidate: %+v", reasons)
		}
	}
}
// TestRankPeerEndpointCandidatesPrefersScopedPrivateLANOverPublic verifies
// that a private LAN endpoint whose locality group matches the local one
// outranks a public endpoint of the same node and carries the
// "locality:local_segment" reason.
func TestRankPeerEndpointCandidatesPrefersScopedPrivateLANOverPublic(t *testing.T) {
	now := time.Date(2026, 5, 19, 13, 0, 0, 0, time.UTC)
	ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{
		{
			EndpointID:       "node-b-public",
			NodeID:           "node-b",
			Transport:        "direct_quic",
			Address:          "quic://94.141.118.222:19191",
			Reachability:     "public",
			ConnectivityMode: "direct",
			NATType:          "port_restricted",
			Priority:         2,
		},
		{
			EndpointID:       "node-b-private",
			NodeID:           "node-b",
			Transport:        "lan_quic",
			Address:          "quic://192.168.200.61:19134",
			Reachability:     "private",
			ConnectivityMode: "private_lan",
			Priority:         1,
			Metadata:         json.RawMessage(`{"locality_group_id":"home-test","nat_group_id":"home-router"}`),
		},
	}, EndpointCandidateScoreOptions{
		PreferredRegion: "home-test",
		LocalityGroupID: "home-test",
		LocalNATGroupID: "home-router",
		Now:             now,
	})
	// Fail with a readable message instead of an index-out-of-range panic
	// when the ranker returns nothing.
	if len(ranked) == 0 {
		t.Fatalf("ranked is empty, want node-b-private as top endpoint")
	}
	if ranked[0].Candidate.EndpointID != "node-b-private" {
		t.Fatalf("top endpoint = %q, want node-b-private: %+v", ranked[0].Candidate.EndpointID, ranked)
	}
	if !containsReason(ranked[0].Reasons, "locality:local_segment") {
		t.Fatalf("missing locality group reason: %+v", ranked[0].Reasons)
	}
}
// TestRankPeerEndpointCandidatesPenalizesForeignPrivateEndpoint verifies that
// a private endpoint advertising a locality/NAT group different from the local
// one is ranked below the public endpoint and carries the
// "locality:private_foreign" reason.
func TestRankPeerEndpointCandidatesPenalizesForeignPrivateEndpoint(t *testing.T) {
	now := time.Date(2026, 5, 19, 13, 0, 0, 0, time.UTC)
	ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{
		{
			EndpointID:       "node-b-public",
			NodeID:           "node-b",
			Transport:        "direct_quic",
			Address:          "quic://94.141.118.222:19191",
			Reachability:     "public",
			ConnectivityMode: "direct",
			Priority:         2,
		},
		{
			EndpointID:       "node-b-private-foreign",
			NodeID:           "node-b",
			Transport:        "lan_quic",
			Address:          "quic://10.24.10.20:19443",
			Reachability:     "private",
			ConnectivityMode: "private_lan",
			Priority:         1,
			Metadata:         json.RawMessage(`{"locality_group_id":"other-site","nat_group_id":"other-nat"}`),
		},
	}, EndpointCandidateScoreOptions{
		PreferredRegion: "home-test",
		LocalityGroupID: "home-test",
		LocalNATGroupID: "home-router",
		Now:             now,
	})
	// Both ranked[0] and ranked[1] are inspected below; check the length
	// first so a short result fails the test instead of panicking.
	if len(ranked) != 2 {
		t.Fatalf("ranked length = %d, want 2", len(ranked))
	}
	if ranked[0].Candidate.EndpointID != "node-b-public" {
		t.Fatalf("top endpoint = %q, want node-b-public: %+v", ranked[0].Candidate.EndpointID, ranked)
	}
	if !containsReason(ranked[1].Reasons, "locality:private_foreign") {
		t.Fatalf("missing foreign private reason: %+v", ranked[1].Reasons)
	}
}
func containsReason(reasons []string, reason string) bool {
for _, item := range reasons {
if item == reason {
@@ -23,7 +23,7 @@ func FabricTransportTargetFromRegistryEndpoint(endpoint FabricRegistryEndpoint)
return FabricTransportTarget{
EndpointID: strings.TrimSpace(endpoint.EndpointID),
PeerID: strings.TrimSpace(endpoint.EndpointID),
Endpoint: strings.TrimSpace(endpoint.Address),
Endpoint: fabricControlEndpointAddress(endpoint),
Transport: strings.TrimSpace(endpoint.Transport),
PeerCertSHA256: strings.TrimSpace(endpoint.PeerCertSHA256),
Timeout: 5 * time.Second,
@@ -32,6 +32,28 @@ func FabricTransportTargetFromRegistryEndpoint(endpoint FabricRegistryEndpoint)
}
}
// fabricControlEndpointAddress picks the address to dial for a registry
// endpoint. A non-empty "maps_to" string in the endpoint metadata takes
// precedence over endpoint.Address; if it carries no "://" scheme separator it
// is prefixed with "quic://". Otherwise the trimmed endpoint.Address is used.
func fabricControlEndpointAddress(endpoint FabricRegistryEndpoint) string {
	mapped := fabricControlMetadataString(endpoint.Metadata, "maps_to")
	switch {
	case mapped == "":
		return strings.TrimSpace(endpoint.Address)
	case strings.Contains(mapped, "://"):
		return mapped
	default:
		return "quic://" + mapped
	}
}
func fabricControlMetadataString(raw json.RawMessage, key string) string {
if len(raw) == 0 {
return ""
}
var metadata map[string]any
if err := json.Unmarshal(raw, &metadata); err != nil {
return ""
}
value, _ := metadata[key].(string)
return strings.TrimSpace(value)
}
func SendFabricControlForward(ctx context.Context, transport FabricTransport, endpoint FabricRegistryEndpoint, payload []byte, timeout time.Duration) (FabricControlForwardResult, error) {
if transport == nil {
return FabricControlForwardResult{}, fmt.Errorf("fabric control transport is unavailable")
@@ -137,7 +137,7 @@ type FabricAdjacency struct {
PressurePercent int
Healthy bool
PassiveOutbound bool
LocalSegmentID string
LocalityGroupID string
NATGroupID string
LastObservedAt time.Time
LastFailureReason string
@@ -0,0 +1,74 @@
package mesh
import (
"context"
"fmt"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// ProbeFabricTarget connects to target, sends one reliable ping frame and
// waits for the pong with the same sequence number.
//
// Unset (non-positive) Timeout, InboundBuffer and ErrorBuffer on target default
// to 2s, 2 and 2. The returned duration is measured from just before the ping
// is sent until the matching pong is received, so connection setup time is not
// included. Frames other than the matching pong are ignored.
//
// The session and transport created for the probe are closed before returning.
//
// If ctx carries no deadline, the ping/pong exchange is bounded by
// target.Timeout so that a peer that never answers cannot block the caller
// forever. A deadline already present on ctx is left untouched.
func ProbeFabricTarget(ctx context.Context, target FabricTransportTarget) (time.Duration, error) {
	target.Timeout = positiveDurationOr(target.Timeout, 2*time.Second)
	target.InboundBuffer = positiveIntOr(target.InboundBuffer, 2)
	target.ErrorBuffer = positiveIntOr(target.ErrorBuffer, 2)
	transport, normalizedTarget, err := FabricTransportForTarget(target, nil)
	if err != nil {
		return 0, err
	}
	session, err := transport.Connect(ctx, normalizedTarget)
	if err != nil {
		_ = transport.Close()
		return 0, err
	}
	defer func() {
		_ = session.Close()
		_ = transport.Close()
	}()

	// Without a caller-supplied deadline the wait loop below would only end
	// on a pong, a session error or cancellation; fall back to the probe
	// timeout in that case.
	if _, hasDeadline := ctx.Deadline(); !hasDeadline {
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(ctx, target.Timeout)
		defer cancel()
	}

	startedAt := time.Now()
	// The send timestamp doubles as the sequence used to match the pong.
	sequence := uint64(startedAt.UnixNano())
	if err := session.Send(ctx, fabricproto.Frame{
		Type:         fabricproto.FramePing,
		TrafficClass: fabricproto.TrafficClassReliable,
		Sequence:     sequence,
		Payload:      []byte("fabric-live-probe"),
	}); err != nil {
		return 0, err
	}
	for {
		select {
		case frame, ok := <-session.Frames():
			if !ok {
				return 0, fmt.Errorf("fabric live probe session closed")
			}
			if frame.Type == fabricproto.FramePong && frame.Sequence == sequence {
				return time.Since(startedAt), nil
			}
		case err, ok := <-session.Errors():
			if !ok {
				return 0, fmt.Errorf("fabric live probe error channel closed")
			}
			if err != nil {
				return 0, err
			}
		case <-ctx.Done():
			return 0, ctx.Err()
		}
	}
}
func positiveDurationOr(value time.Duration, fallback time.Duration) time.Duration {
if value > 0 {
return value
}
return fallback
}
// positiveIntOr returns value when it is strictly positive and fallback
// otherwise.
func positiveIntOr(value int, fallback int) int {
	if value <= 0 {
		return fallback
	}
	return value
}
@@ -59,7 +59,7 @@ func StartQUICFabricServer(ctx context.Context, cfg QUICFabricServerConfig) (*QU
if len(tlsConfig.NextProtos) == 0 {
tlsConfig.NextProtos = []string{fabricQUICNextProto}
}
listener, err := quic.ListenAddr(cfg.ListenAddr, tlsConfig, cfg.QUICConfig)
listener, err := quic.ListenAddr(cfg.ListenAddr, tlsConfig, defaultQUICFabricConfig(cfg.QUICConfig))
if err != nil {
return nil, err
}
@@ -132,7 +132,7 @@ func (s *QUICFabricServer) handleConn(ctx context.Context, conn *quic.Conn) {
func (s *QUICFabricServer) handleStream(ctx context.Context, conn *quic.Conn, stream *quic.Stream) {
session := fabricproto.NewSession(fabricproto.SessionConfig{})
sender := quicStreamFrameSender{stream: stream}
sender := &quicStreamFrameSender{stream: stream}
defer func() { _ = stream.Close() }()
s.logFabricSession(FabricSessionEventLogEntry{
Event: "fabric_session_quic_stream_opened",
@@ -207,7 +207,7 @@ type quicStreamFrameSender struct {
mu sync.Mutex
}
func (s quicStreamFrameSender) SendFrame(ctx context.Context, frame fabricproto.Frame) error {
func (s *quicStreamFrameSender) SendFrame(ctx context.Context, frame fabricproto.Frame) error {
if s.stream == nil {
return fmt.Errorf("quic fabric stream is closed")
}
@@ -22,6 +22,9 @@ const fabricQUICNextProto = "rap-fabric-data-session-v1"
const fabricQUICReverseHelloPrefix = "rap-fabric-reverse-hello-v1:"
const defaultQUICFabricConnIdleTTL = 5 * time.Minute
const defaultQUICFabricMaxStreamsPerConn = 64
const defaultQUICFabricHandshakeIdleTimeout = 8 * time.Second
const defaultQUICFabricMaxIdleTimeout = 90 * time.Second
const defaultQUICFabricKeepAlivePeriod = 15 * time.Second
const ErrQUICFabricStreamLimitReached = quicFabricError("quic fabric stream limit reached")
type quicFabricError string
@@ -31,20 +34,20 @@ func (e quicFabricError) Error() string {
}
type QUICFabricTransport struct {
Config *quic.Config
LocalPeerID string
IdleTTL time.Duration
MaxStreamsPerConn int
DialAddr func(context.Context, string, *tls.Config, *quic.Config) (*quic.Conn, error)
mu sync.Mutex
conns map[string]*quicFabricConnEntry
reverseConns map[string]*quicFabricConnEntry
inboundProductionHandler func(context.Context, ProductionEnvelope) (ProductionForwardResult, error)
inboundWebIngressHandler func(context.Context, []byte) ([]byte, error)
Config *quic.Config
LocalPeerID string
IdleTTL time.Duration
MaxStreamsPerConn int
DialAddr func(context.Context, string, *tls.Config, *quic.Config) (*quic.Conn, error)
mu sync.Mutex
conns map[string]*quicFabricConnEntry
reverseConns map[string]*quicFabricConnEntry
inboundProductionHandler func(context.Context, ProductionEnvelope) (ProductionForwardResult, error)
inboundWebIngressHandler func(context.Context, []byte) ([]byte, error)
inboundFabricControlHandler func(context.Context, []byte) ([]byte, error)
inboundSyntheticHandler func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error)
logger FabricSessionEventLogger
stats QUICFabricTransportStats
inboundSyntheticHandler func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error)
logger FabricSessionEventLogger
stats QUICFabricTransportStats
}
type QUICFabricTransportStats struct {
@@ -109,7 +112,25 @@ type quicFabricConnEntry struct {
}
func NewQUICFabricTransport(config *quic.Config) *QUICFabricTransport {
return &QUICFabricTransport{Config: config, IdleTTL: defaultQUICFabricConnIdleTTL, MaxStreamsPerConn: defaultQUICFabricMaxStreamsPerConn, conns: map[string]*quicFabricConnEntry{}, reverseConns: map[string]*quicFabricConnEntry{}}
return &QUICFabricTransport{Config: defaultQUICFabricConfig(config), IdleTTL: defaultQUICFabricConnIdleTTL, MaxStreamsPerConn: defaultQUICFabricMaxStreamsPerConn, conns: map[string]*quicFabricConnEntry{}, reverseConns: map[string]*quicFabricConnEntry{}}
}
// defaultQUICFabricConfig returns a new QUIC config with the fabric defaults
// filled in. The caller's config, if any, is shallow-copied and never
// modified. HandshakeIdleTimeout, MaxIdleTimeout and KeepAlivePeriod are set
// to the fabric defaults only when they are not already positive.
func defaultQUICFabricConfig(config *quic.Config) *quic.Config {
	var effective quic.Config
	if config != nil {
		effective = *config
	}

	applyDefault := func(current *time.Duration, fallback time.Duration) {
		if *current <= 0 {
			*current = fallback
		}
	}
	applyDefault(&effective.HandshakeIdleTimeout, defaultQUICFabricHandshakeIdleTimeout)
	applyDefault(&effective.MaxIdleTimeout, defaultQUICFabricMaxIdleTimeout)
	applyDefault(&effective.KeepAlivePeriod, defaultQUICFabricKeepAlivePeriod)

	return &effective
}
func (t *QUICFabricTransport) SetInboundHandlers(production func(context.Context, ProductionEnvelope) (ProductionForwardResult, error), synthetic func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error), logger FabricSessionEventLogger) {
@@ -150,6 +171,7 @@ func quicTLSConfigForTarget(target FabricTransportTarget) *tls.Config {
expectedFingerprint := normalizeCertSHA256(target.PeerCertSHA256)
config := &tls.Config{NextProtos: []string{fabricQUICNextProto}}
if expectedFingerprint == "" {
config.InsecureSkipVerify = true
return config
}
config.InsecureSkipVerify = true
@@ -198,9 +220,12 @@ func (t *QUICFabricTransport) Connect(ctx context.Context, target FabricTranspor
stream, err := conn.OpenStreamSync(ctx)
if err != nil {
t.releaseStream(connKey)
t.evictConnByKey(connKey, conn)
t.evictConn(target, conn)
if closeConn {
_ = conn.CloseWithError(1, "open stream failed")
} else {
_ = conn.CloseWithError(1, "cached stream open failed")
}
return nil, err
}
@@ -680,8 +705,28 @@ func (t *QUICFabricTransport) evictConn(target FabricTransportTarget, conn *quic
t.mu.Unlock()
}
// evictConnByKey drops the cached connection entry stored under key, but only
// if that entry still refers to conn. Keys starting with "reverse\x00" address
// the reverse-connection map using the remainder of the key; all other keys
// address the regular connection map. A successful eviction increments
// stats.ClosedEvicted. The connection itself is not closed here.
//
// A nil receiver, empty key or nil conn is a no-op.
func (t *QUICFabricTransport) evictConnByKey(key string, conn *quic.Conn) {
	if t == nil || key == "" || conn == nil {
		return
	}
	const reversePrefix = "reverse\x00"

	t.mu.Lock()
	defer t.mu.Unlock()

	pool, lookup := t.conns, key
	if strings.HasPrefix(key, reversePrefix) {
		pool, lookup = t.reverseConns, strings.TrimPrefix(key, reversePrefix)
	}

	cached := pool[lookup]
	if cached == nil || cached.conn != conn {
		// Entry is gone or was replaced by a newer connection.
		return
	}
	delete(pool, lookup)
	t.stats.ClosedEvicted++
}
func (t *QUICFabricTransport) pruneIdleLocked(now time.Time) {
if t == nil || len(t.conns) == 0 {
if t == nil {
return
}
ttl := t.IdleTTL
@@ -897,7 +942,13 @@ func (s *quicFabricSession) Send(ctx context.Context, frame fabricproto.Frame) e
s.writeMu.Lock()
defer s.writeMu.Unlock()
s.applyWriteDeadline(ctx)
return fabricproto.WriteFrame(s.stream, frame)
if err := fabricproto.WriteFrame(s.stream, frame); err != nil {
if s.transport != nil && s.conn != nil {
s.transport.evictConnByKey(s.connKey, s.conn)
}
return err
}
return nil
}
func (s *quicFabricSession) Frames() <-chan fabricproto.Frame {
@@ -21,7 +21,7 @@ const (
type FabricRoutePlannerConfig struct {
ClusterID string
LocalNodeID string
LocalSegmentID string
LocalityGroupID string
LocalNATGroupID string
DefaultCapacity int
RelayCapacity int
@@ -34,13 +34,13 @@ type FabricRoutePlannerConfig struct {
}
type FabricCandidateMetadata struct {
LocalSegmentID string `json:"local_segment_id,omitempty"`
NATGroupID string `json:"nat_group_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
ViaNodeID string `json:"via_node_id,omitempty"`
STUNServer string `json:"stun_server,omitempty"`
ICEFoundation string `json:"ice_foundation,omitempty"`
LocalityGroupID string `json:"locality_group_id,omitempty"`
NATGroupID string `json:"nat_group_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
ViaNodeID string `json:"via_node_id,omitempty"`
STUNServer string `json:"stun_server,omitempty"`
ICEFoundation string `json:"ice_foundation,omitempty"`
}
func FabricRouteSetForPeerEndpointCandidates(targetNodeID string, candidates []PeerEndpointCandidate, cfg FabricRoutePlannerConfig) FabricRouteSet {
@@ -141,7 +141,7 @@ func fabricRouteModeForPeerEndpointCandidate(candidate PeerEndpointCandidate, me
}
reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability))
connectivity := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
if sameLocalSegment(metadata, cfg) || sameNATGroup(metadata, cfg) {
if sameLocalityGroup(metadata, cfg) || sameNATGroup(metadata, cfg) {
return FabricRouteLAN
}
if reachability == FabricCandidateReachabilityRelay || connectivity == FabricConnectivityRelayRequired || strings.TrimSpace(metadata.RelayEndpoint) != "" {
@@ -240,12 +240,12 @@ func candidatePressureCount(endpointID string, cfg FabricRoutePlannerConfig) int
return 0
}
func sameLocalSegment(metadata FabricCandidateMetadata, cfg FabricRoutePlannerConfig) bool {
localSegment := strings.TrimSpace(cfg.LocalSegmentID)
if localSegment == "" {
func sameLocalityGroup(metadata FabricCandidateMetadata, cfg FabricRoutePlannerConfig) bool {
localityGroup := strings.TrimSpace(cfg.LocalityGroupID)
if localityGroup == "" {
return false
}
return strings.EqualFold(strings.TrimSpace(metadata.LocalSegmentID), localSegment)
return strings.EqualFold(strings.TrimSpace(metadata.LocalityGroupID), localityGroup)
}
func sameNATGroup(metadata FabricCandidateMetadata, cfg FabricRoutePlannerConfig) bool {
@@ -7,7 +7,7 @@ import (
)
func TestFabricRouteSetForPeerEndpointCandidatesPrefersLocalLAN(t *testing.T) {
metadata, _ := json.Marshal(FabricCandidateMetadata{LocalSegmentID: "site-a", NATGroupID: "nat-a"})
metadata, _ := json.Marshal(FabricCandidateMetadata{LocalityGroupID: "home-lan", NATGroupID: "nat-a"})
routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{
{
EndpointID: "node-b-public",
@@ -31,7 +31,7 @@ func TestFabricRouteSetForPeerEndpointCandidatesPrefersLocalLAN(t *testing.T) {
}, FabricRoutePlannerConfig{
ClusterID: "cluster-1",
LocalNodeID: "node-a",
LocalSegmentID: "site-a",
LocalityGroupID: "home-lan",
DefaultCapacity: 200,
Now: time.Unix(100, 0).UTC(),
})
@@ -172,7 +172,7 @@ func TestFabricRouteSetForPeerEndpointCandidatesRejectsNonQUIC(t *testing.T) {
ConnectivityMode: "direct",
},
{
EndpointID: "node-b-legacy-relay",
EndpointID: "node-b-compat-relay",
NodeID: "node-b",
Transport: "relay",
Address: "quic://node-r:19443",
@@ -180,7 +180,7 @@ func TestFabricRouteSetForPeerEndpointCandidatesRejectsNonQUIC(t *testing.T) {
ConnectivityMode: "relay_required",
},
{
EndpointID: "node-b-legacy-reverse",
EndpointID: "node-b-compat-reverse",
NodeID: "node-b",
Transport: "outbound_reverse",
Address: "quic://node-b:19443",
@@ -4,7 +4,6 @@ import (
"context"
"crypto/tls"
"fmt"
"net/http"
"strings"
"time"
@@ -30,7 +29,6 @@ type FabricTransportTarget struct {
Endpoint string
Transport string
Token string
Header http.Header
TLSConfig *tls.Config
PeerCertSHA256 string
Timeout time.Duration
@@ -11,6 +11,8 @@ const DefaultWarmPeerLimit = 8
type PeerCacheConfig struct {
Local PeerIdentity
LocalityGroupID string
LocalNATGroupID string
PeerEndpoints map[string]string
PeerEndpointCandidates map[string][]PeerEndpointCandidate
PeerEndpointObservations map[string]EndpointCandidateHealthObservation
@@ -59,11 +61,12 @@ type PeerCacheEntry struct {
BestCandidateScore int `json:"best_candidate_score,omitempty"`
BestScoreReasons []string `json:"best_score_reasons,omitempty"`
BestPeerCertSHA256 string `json:"best_peer_cert_sha256,omitempty"`
PublicIngressCount int `json:"public_ingress_count,omitempty"`
EndpointCandidates []PeerEndpointCandidate `json:"endpoint_candidates,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
RelayControl bool `json:"relay_control"`
RelayQUIC bool `json:"relay_quic"`
}
type peerCacheBuildEntry struct {
@@ -119,6 +122,8 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
scored := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
ChannelClass: SyntheticChannelFabricControl,
PreferredRegion: cfg.PreferredRegion,
LocalityGroupID: cfg.LocalityGroupID,
LocalNATGroupID: cfg.LocalNATGroupID,
Now: now,
MaxVerificationAge: time.Hour,
Observations: cfg.PeerEndpointObservations,
@@ -129,6 +134,7 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
for _, scoredCandidate := range scored {
entry.EndpointCandidates = append(entry.EndpointCandidates, scoredCandidate.Candidate)
}
entry.PublicIngressCount = publicIngressCountFromCandidates(entry.EndpointCandidates)
entry.BestCandidateID = scored[0].Candidate.EndpointID
entry.BestCandidateAddr = scored[0].Candidate.Address
entry.BestTransport = scored[0].Candidate.Transport
@@ -197,9 +203,9 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
entry.RendezvousLeaseID = lease.LeaseID
entry.RelayNodeID = lease.RelayNodeID
entry.RelayEndpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
entry.RelayControl = true
entry.RelayQUIC = true
entry.CandidateCount = maxInt(entry.CandidateCount, 1)
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{firstNonEmpty(lease.ConnectivityMode, "relay_required"), "relay_control"})
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{firstNonEmpty(lease.ConnectivityMode, "relay_required"), "relay_quic"})
if useLeaseEndpoint {
if localRelay {
entry.BestTransport = "reverse_quic"
@@ -225,7 +231,7 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
entry.Endpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
}
entry.EndpointCount = maxInt(entry.EndpointCount, 1)
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{"relay_control"})
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{"relay_quic"})
}
}
out := make([]peerCacheBuildEntry, 0, len(entries))
@@ -334,13 +340,37 @@ func warmPeerPriority(entry peerCacheBuildEntry) int {
if entry.bestScore > 0 {
score += entry.bestScore
}
if entry.RelayControl {
if entry.RelayQUIC {
score += 300
}
if entry.PublicIngressCount > 0 {
score += entry.PublicIngressCount * 75
}
score += entry.CandidateCount
return score
}
// publicIngressCountFromCandidates counts the distinct, non-empty addresses
// among candidates whose reachability is "public" and whose transport name
// contains "quic" (both compared case-insensitively after trimming).
// Addresses are compared exactly after trimming surrounding whitespace.
func publicIngressCountFromCandidates(candidates []PeerEndpointCandidate) int {
	seen := make(map[string]struct{}, len(candidates))
	for i := range candidates {
		current := &candidates[i]
		reach := strings.ToLower(strings.TrimSpace(current.Reachability))
		transport := strings.ToLower(strings.TrimSpace(current.Transport))
		addr := strings.TrimSpace(current.Address)
		if reach == "public" && strings.Contains(transport, "quic") && addr != "" {
			seen[addr] = struct{}{}
		}
	}
	return len(seen)
}
func warmPeerReason(entry peerCacheBuildEntry) string {
if entry.adjacentRoutePeer {
return "route_adjacent"
@@ -348,7 +378,7 @@ func warmPeerReason(entry peerCacheBuildEntry) string {
if entry.RecoverySeed {
return "recovery_seed"
}
if entry.RelayControl {
if entry.RelayQUIC {
return "rendezvous_lease"
}
if entry.BestCandidateID != "" {
@@ -98,6 +98,9 @@ func TestPeerCacheUsesBestEndpointCandidate(t *testing.T) {
if entry.BestCandidateID != "node-b-public" || !entry.Warm {
t.Fatalf("unexpected candidate selection: %+v", entry)
}
if entry.PublicIngressCount != 1 {
t.Fatalf("public ingress count = %d, want 1", entry.PublicIngressCount)
}
}
func TestPeerCacheAppliesEndpointHealthObservations(t *testing.T) {
@@ -224,3 +227,12 @@ func peerCacheEntryByID(snapshot PeerCacheSnapshot, nodeID string) (PeerCacheEnt
}
return PeerCacheEntry{}, false
}
// containsString reports whether want occurs in values (exact match).
func containsString(values []string, want string) bool {
	for i := 0; i < len(values); i++ {
		if values[i] == want {
			return true
		}
	}
	return false
}
@@ -21,7 +21,7 @@ const (
PeerTransportModeCorporateLAN = "corporate_lan"
PeerTransportModeOutboundOnly = "outbound_only"
PeerTransportModeRelayRequired = "relay_required"
PeerTransportModeRelayControl = "relay_control"
PeerTransportModeRelayQUIC = "relay_quic"
PeerTransportModeUnknown = "unknown"
)
@@ -44,7 +44,7 @@ type PeerConnectionIntentPlan struct {
CorporateLANCount int `json:"corporate_lan_count"`
OutboundOnlyCount int `json:"outbound_only_count"`
RelayRequiredCount int `json:"relay_required_count"`
RelayControlCount int `json:"relay_control_count"`
RelayQUICCount int `json:"relay_quic_count"`
RendezvousRequiredCount int `json:"rendezvous_required_count"`
RendezvousResolvedCount int `json:"rendezvous_resolved_count"`
RendezvousLeaseCount int `json:"rendezvous_lease_count"`
@@ -113,8 +113,8 @@ func PlanPeerConnectionIntents(cfg PeerConnectionIntentPlanConfig) PeerConnectio
RendezvousLeaseID: entry.RendezvousLeaseID,
RelayNodeID: entry.RelayNodeID,
RelayEndpoint: entry.RelayEndpoint,
RelayCandidate: entry.RelayControl,
ControlPlaneOnly: entry.RelayControl,
RelayCandidate: entry.RelayQUIC,
ControlPlaneOnly: entry.RelayQUIC,
RecoverySeed: candidate.RecoverySeed || entry.RecoverySeed,
Priority: candidate.Priority,
GeneratedAt: now,
@@ -163,8 +163,8 @@ func PlanPeerConnectionIntents(cfg PeerConnectionIntentPlanConfig) PeerConnectio
plan.OutboundOnlyCount++
case PeerTransportModeRelayRequired:
plan.RelayRequiredCount++
case PeerTransportModeRelayControl:
plan.RelayControlCount++
case PeerTransportModeRelayQUIC:
plan.RelayQUICCount++
}
if intent.RequiresRendezvous {
plan.RendezvousRequiredCount++
@@ -266,7 +266,7 @@ func applyRendezvousLease(intent *PeerConnectionIntent, lease PeerRendezvousLeas
} else {
intent.Transport = firstNonEmpty(lease.Transport, "relay_quic")
}
intent.TransportMode = PeerTransportModeRelayControl
intent.TransportMode = PeerTransportModeRelayQUIC
intent.RequiresRendezvous = false
intent.RendezvousResolved = true
intent.DirectCandidate = false
@@ -170,11 +170,11 @@ func TestPeerConnectionIntentsResolveRendezvousWithRelayLease(t *testing.T) {
Now: now,
})
if plan.IntentCount != 1 || plan.RelayControlCount != 1 || plan.RendezvousResolvedCount != 1 || plan.RendezvousRequiredCount != 0 {
if plan.IntentCount != 1 || plan.RelayQUICCount != 1 || plan.RendezvousResolvedCount != 1 || plan.RendezvousRequiredCount != 0 {
t.Fatalf("unexpected relay-control plan counts: %+v", plan)
}
intent := plan.Intents[0]
if intent.TransportMode != PeerTransportModeRelayControl ||
if intent.TransportMode != PeerTransportModeRelayQUIC ||
intent.Endpoint != "quic://node-r:19443" ||
intent.RelayNodeID != "node-r" ||
intent.RendezvousLeaseID != "lease-node-b-via-node-r" ||
@@ -239,7 +239,7 @@ func TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect(t *testing.T
Now: now,
})
if plan.RendezvousResolvedCount != 1 || plan.RelayControlCount != 1 || plan.RendezvousRequiredCount != 0 {
if plan.RendezvousResolvedCount != 1 || plan.RelayQUICCount != 1 || plan.RendezvousRequiredCount != 0 {
t.Fatalf("unexpected reselected plan counts: %+v", plan)
}
intent := plan.Intents[0]
@@ -3,7 +3,6 @@ package mesh
import (
"context"
"fmt"
"net/http"
"strings"
"sync"
"time"
@@ -25,7 +24,6 @@ type PeerConnectionManagerConfig struct {
PeerCache *PeerCache
Tracker *PeerConnectionTracker
RendezvousLeases []PeerRendezvousLease
HTTPClient *http.Client
QUICTransport *QUICFabricTransport
PreferredRegion string
ProbeTimeout time.Duration
@@ -37,7 +35,6 @@ type PeerConnectionManager struct {
peerCache *PeerCache
tracker *PeerConnectionTracker
rendezvousLeases []PeerRendezvousLease
httpClient *http.Client
quicTransport *QUICFabricTransport
preferredRegion string
probeTimeout time.Duration
@@ -60,7 +57,7 @@ type PeerConnectionManagerCycle struct {
Skipped int `json:"skipped"`
RendezvousRequiredCount int `json:"rendezvous_required_count"`
RendezvousResolvedCount int `json:"rendezvous_resolved_count"`
RelayControlCount int `json:"relay_control_count"`
RelayQUICCount int `json:"relay_quic_count"`
RecoveryPlan PeerRecoveryPlan `json:"recovery_plan"`
IntentPlan PeerConnectionIntentPlan `json:"intent_plan"`
Results []PeerConnectionProbeResult `json:"results,omitempty"`
@@ -117,17 +114,6 @@ func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionMa
if probeTimeout <= 0 {
probeTimeout = DefaultPeerConnectionProbeTimeout
}
httpClient := cfg.HTTPClient
if httpClient == nil {
httpClient = &http.Client{
Transport: &http.Transport{
MaxIdleConns: 64,
MaxIdleConnsPerHost: 8,
IdleConnTimeout: 90 * time.Second,
},
Timeout: probeTimeout + time.Second,
}
}
now := cfg.Now
if now == nil {
now = func() time.Time { return time.Now().UTC() }
@@ -137,7 +123,6 @@ func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionMa
peerCache: cfg.PeerCache,
tracker: cfg.Tracker,
rendezvousLeases: append([]PeerRendezvousLease{}, cfg.RendezvousLeases...),
httpClient: httpClient,
quicTransport: cfg.QUICTransport,
preferredRegion: strings.TrimSpace(cfg.PreferredRegion),
probeTimeout: probeTimeout,
@@ -157,6 +142,7 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
Connections: m.tracker.Snapshot(),
TargetReadyPeers: DefaultStablePeerTarget,
MaxProbeCandidates: DefaultRecoveryProbeLimit,
PreferredRegion: m.preferredRegion,
Now: startedAt,
})
intentPlan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
@@ -177,7 +163,7 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
IntentCount: intentPlan.IntentCount,
RendezvousRequiredCount: intentPlan.RendezvousRequiredCount,
RendezvousResolvedCount: intentPlan.RendezvousResolvedCount,
RelayControlCount: intentPlan.RelayControlCount,
RelayQUICCount: intentPlan.RelayQUICCount,
RecoveryPlan: recoveryPlan,
IntentPlan: intentPlan,
Results: make([]PeerConnectionProbeResult, 0, len(intentPlan.Intents)),
@@ -270,7 +256,7 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
RendezvousLeaseID: intent.RendezvousLeaseID,
RelayNodeID: intent.RelayNodeID,
RelayEndpoint: intent.RelayEndpoint,
RelayControl: intent.RelayCandidate,
RelayQUIC: intent.RelayCandidate,
BestPeerCertSHA256: firstNonEmpty(intent.BestPeerCertSHA256, cacheEntry.BestPeerCertSHA256),
}
if intent.RequiresRendezvous {
@@ -385,7 +371,7 @@ func peerConnectionProbeTargetNodeID(intent PeerConnectionIntent, localNodeID st
func (m *PeerConnectionManager) probePeerTarget(ctx context.Context, probePeer PeerCacheEntry, target PeerIdentity) error {
endpoint := strings.TrimRight(strings.TrimSpace(probePeer.Endpoint), "/")
transport := strings.TrimSpace(probePeer.BestTransport)
if hasLegacyEndpointScheme(endpoint) {
if hasUnsupportedEndpointScheme(endpoint) {
return fmt.Errorf("non_quic_probe_rejected")
}
if peerConnectionTargetIsQUIC(transport, endpoint) {
@@ -445,7 +431,7 @@ func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCach
}
add(candidate.EndpointID, candidate.Address, candidate.Transport, candidatePeerCertSHA256(candidate))
}
add(intent.BestCandidateID, intent.Endpoint, intent.Transport, cacheEntry.BestPeerCertSHA256)
add(intent.BestCandidateID, intent.Endpoint, intent.Transport, intent.BestPeerCertSHA256)
return out
}
@@ -455,7 +441,7 @@ func peerConnectionShouldProbeDirectUpgrade(intent PeerConnectionIntent, cacheEn
}
if strings.TrimSpace(intent.ConnectionState) != PeerConnectionRelayReady &&
!intent.RelayCandidate &&
strings.TrimSpace(intent.TransportMode) != PeerTransportModeRelayControl {
strings.TrimSpace(intent.TransportMode) != PeerTransportModeRelayQUIC {
return false
}
for _, candidate := range cacheEntry.EndpointCandidates {
@@ -509,8 +495,3 @@ func (m *PeerConnectionManager) connectionState(nodeID string) PeerConnectionSta
}
return PeerConnectionState{NodeID: nodeID, State: PeerConnectionDisconnected}
}
// withHTTPClient returns a copy of the client whose HTTPClient field is set
// to httpClient. The receiver is passed by value, so the caller's Client is
// not modified.
func (c Client) withHTTPClient(httpClient *http.Client) Client {
	updated := c
	updated.HTTPClient = httpClient
	return updated
}
@@ -3,7 +3,6 @@ package mesh
import (
"context"
"encoding/json"
"net/http"
"testing"
"time"
)
@@ -90,7 +89,7 @@ func TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff(t *testin
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpoints: map[string]string{
"node-b": "http://127.0.0.1:1",
"node-b": "quic://127.0.0.1:1",
},
WarmPeerLimit: 1,
Now: now,
@@ -100,7 +99,6 @@ func TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff(t *testin
Local: local,
PeerCache: cache,
Tracker: tracker,
HTTPClient: &http.Client{Timeout: 20 * time.Millisecond},
ProbeTimeout: 20 * time.Millisecond,
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
@@ -121,7 +119,7 @@ func TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff(t *testin
}
}
func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
func TestPeerConnectionManagerProbesRelayQUICLease(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
current := now
tlsConfig := testQUICTLSConfig(t)
@@ -188,7 +186,7 @@ func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
if cycle.Attempted != 1 ||
cycle.Succeeded != 1 ||
cycle.Deferred != 0 ||
cycle.RelayControlCount != 1 ||
cycle.RelayQUICCount != 1 ||
cycle.RendezvousResolvedCount != 1 ||
cycle.RendezvousRequiredCount != 0 {
t.Fatalf("unexpected relay-control cycle: %+v", cycle)
@@ -227,11 +225,11 @@ func TestPeerConnectionProbeTargetsFallsBackToBestPeerCertSHA256(t *testing.T) {
BestPeerCertSHA256: "intent-cert",
}
cacheEntry := PeerCacheEntry{
NodeID: "node-b",
BestPeerCertSHA256: "cache-cert",
BestCandidateID: "node-b-best",
BestTransport: "direct_quic",
Endpoint: "quic://94.141.118.222:19199",
NodeID: "node-b",
BestPeerCertSHA256: "cache-cert",
BestCandidateID: "node-b-best",
BestTransport: "direct_quic",
Endpoint: "quic://94.141.118.222:19199",
EndpointCandidates: []PeerEndpointCandidate{
{
EndpointID: "node-b-public",
@@ -259,6 +257,49 @@ func TestPeerConnectionProbeTargetsFallsBackToBestPeerCertSHA256(t *testing.T) {
}
}
func TestPeerConnectionProbeTargetsUsesRelayLeaseCertForRelayEndpoint(t *testing.T) {
intent := PeerConnectionIntent{
NodeID: "node-b",
BestCandidateID: "lease-node-b-via-node-r",
Endpoint: "quic://195.123.240.88:19131",
Transport: "relay_quic",
BestPeerCertSHA256: "relay-cert",
RelayCandidate: true,
ConnectionState: PeerConnectionBackoff,
}
cacheEntry := PeerCacheEntry{
NodeID: "node-b",
BestPeerCertSHA256: "direct-cert",
EndpointCandidates: []PeerEndpointCandidate{
{
EndpointID: "node-b-private",
NodeID: "node-b",
Transport: "direct_quic",
Address: "quic://192.168.200.61:19132",
Reachability: "private",
ConnectivityMode: "private_lan",
Priority: 1,
Metadata: peerConnectionProbeMetadata(t, "direct-cert"),
},
},
}
targets := peerConnectionProbeTargets(intent, cacheEntry)
if len(targets) != 2 {
t.Fatalf("target count = %d, want 2", len(targets))
}
for _, target := range targets {
if target.Endpoint != "quic://195.123.240.88:19131" {
continue
}
if target.PeerCertSHA256 != "relay-cert" {
t.Fatalf("relay endpoint cert = %q, want relay-cert", target.PeerCertSHA256)
}
return
}
t.Fatalf("relay endpoint target not found: %+v", targets)
}
func TestPeerConnectionProbeTargetsUpgradeRelayReadyPeerToDirectQUIC(t *testing.T) {
now := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC)
current := now
@@ -36,7 +36,7 @@ type PeerConnectionState struct {
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
RelayControl bool `json:"relay_control"`
RelayQUIC bool `json:"relay_quic"`
ConsecutiveSuccesses int `json:"consecutive_successes"`
ConsecutiveFailures int `json:"consecutive_failures"`
LastLatencyMs int `json:"last_latency_ms,omitempty"`
@@ -287,7 +287,7 @@ func (t *PeerConnectionTracker) entry(peer PeerCacheEntry, now time.Time) PeerCo
entry.RendezvousLeaseID = peer.RendezvousLeaseID
entry.RelayNodeID = peer.RelayNodeID
entry.RelayEndpoint = peer.RelayEndpoint
entry.RelayControl = peer.RelayControl
entry.RelayQUIC = peer.RelayQUIC
return entry
}
@@ -21,6 +21,7 @@ type PeerRecoveryPlanConfig struct {
Connections PeerConnectionSnapshot
TargetReadyPeers int
MaxProbeCandidates int
PreferredRegion string
Now time.Time
}
@@ -42,6 +43,7 @@ type PeerRecoveryPlan struct {
type PeerRecoveryCandidate struct {
NodeID string `json:"node_id"`
Endpoint string `json:"endpoint,omitempty"`
Region string `json:"region,omitempty"`
Warm bool `json:"warm"`
WarmReason string `json:"warm_reason,omitempty"`
RecoverySeed bool `json:"recovery_seed"`
@@ -57,6 +59,7 @@ type PeerRecoveryCandidate struct {
type peerRecoveryCandidateBuild struct {
PeerRecoveryCandidate
PublicIngressCount int
}
func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
@@ -96,6 +99,7 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
ready := 0
degraded := 0
backoff := 0
readyExternalRegions := map[string]struct{}{}
for nodeID, connection := range connectionByNode {
entry, ok := entryByNode[nodeID]
if !ok || strings.TrimSpace(entry.Endpoint) == "" {
@@ -104,6 +108,10 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
switch connection.State {
case PeerConnectionReady:
ready++
region := strings.TrimSpace(entry.BestRegion)
if region != "" && (strings.TrimSpace(cfg.PreferredRegion) == "" || !strings.EqualFold(region, cfg.PreferredRegion)) {
readyExternalRegions[strings.ToLower(region)] = struct{}{}
}
case PeerConnectionRelayReady:
// Relay-ready peers remain valuable for control-plane reachability,
// but they do not satisfy the target for direct-ready transport paths.
@@ -125,6 +133,7 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
if mode == PeerRecoveryModeSteady {
limit = target
}
missingExternalRegions := missingPeerRecoveryExternalRegions(cfg.PeerCache, cfg.PreferredRegion, readyExternalRegions, target)
candidates := make([]peerRecoveryCandidateBuild, 0, len(cfg.PeerCache.Entries))
for _, entry := range cfg.PeerCache.Entries {
@@ -138,13 +147,14 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
if connection.State == PeerConnectionBackoff && connection.BackoffUntil.After(now) {
continue
}
reason, ok := peerRecoveryCandidateReason(mode, entry, connection)
reason, ok := peerRecoveryCandidateReason(mode, entry, connection, missingExternalRegions, cfg.PreferredRegion)
if !ok {
continue
}
candidate := PeerRecoveryCandidate{
NodeID: entry.NodeID,
Endpoint: strings.TrimSpace(entry.Endpoint),
Region: strings.TrimSpace(entry.BestRegion),
Warm: entry.Warm,
WarmReason: entry.WarmReason,
RecoverySeed: entry.RecoverySeed,
@@ -155,9 +165,12 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
LastLatencyMs: connection.LastLatencyMs,
BackoffUntil: connection.BackoffUntil,
Reason: reason,
Priority: peerRecoveryCandidatePriority(entry, connection, reason),
Priority: peerRecoveryCandidatePriority(entry, connection, reason, cfg.PreferredRegion),
}
candidates = append(candidates, peerRecoveryCandidateBuild{PeerRecoveryCandidate: candidate})
candidates = append(candidates, peerRecoveryCandidateBuild{
PeerRecoveryCandidate: candidate,
PublicIngressCount: entry.PublicIngressCount,
})
}
sort.SliceStable(candidates, func(i, j int) bool {
if candidates[i].Priority != candidates[j].Priority {
@@ -166,7 +179,7 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
return candidates[i].NodeID < candidates[j].NodeID
})
if len(candidates) > limit {
candidates = candidates[:limit]
candidates = trimPeerRecoveryCandidates(candidates, limit, cfg.PreferredRegion)
}
outCandidates := make([]PeerRecoveryCandidate, 0, len(candidates))
@@ -194,11 +207,143 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
}
}
func peerRecoveryCandidateReason(mode string, entry PeerCacheEntry, connection PeerConnectionState) (string, bool) {
// missingPeerRecoveryExternalRegions returns the set of lowercased external
// regions (regions other than preferredRegion) that appear in the peer cache
// snapshot but are absent from readyExternalRegions.
//
// It returns nil when the snapshot has no external regions, when the number
// of ready external regions already reaches the desired count, or when no
// region is missing. The desired count is the number of available external
// regions, capped at 2 and, when target is positive, additionally at target.
func missingPeerRecoveryExternalRegions(snapshot PeerCacheSnapshot, preferredRegion string, readyExternalRegions map[string]struct{}, target int) map[string]struct{} {
	home := strings.TrimSpace(preferredRegion)

	// Collect every non-empty region that is not the preferred one.
	known := make(map[string]struct{})
	for i := range snapshot.Entries {
		name := strings.TrimSpace(snapshot.Entries[i].BestRegion)
		isHome := home != "" && strings.EqualFold(name, home)
		if name != "" && !isHome {
			known[strings.ToLower(name)] = struct{}{}
		}
	}
	if len(known) == 0 {
		return nil
	}

	wanted := len(known)
	if wanted > 2 {
		wanted = 2
	}
	if target > 0 && target < wanted {
		wanted = target
	}
	if len(readyExternalRegions) >= wanted {
		return nil
	}

	// Allocate lazily so that "nothing missing" yields a nil map.
	var gaps map[string]struct{}
	for name := range known {
		if _, covered := readyExternalRegions[name]; covered {
			continue
		}
		if gaps == nil {
			gaps = make(map[string]struct{})
		}
		gaps[name] = struct{}{}
	}
	return gaps
}
// trimPeerRecoveryCandidates reduces an already ordered candidate list to at
// most limit entries while keeping region diversity.
//
// When fewer than two distinct external regions (regions other than
// preferredRegion) are present, the first limit candidates are returned.
// Otherwise selection happens in three passes over the input order:
//  1. the first candidate of each distinct external region;
//  2. if nothing selected so far has public ingress, the first unselected
//     candidate that does;
//  3. the remaining unselected candidates until limit is reached.
//
// NOTE(review): when limit <= 0 the input is returned untrimmed — confirm
// that callers never pass a non-positive limit expecting an empty result.
func trimPeerRecoveryCandidates(candidates []peerRecoveryCandidateBuild, limit int, preferredRegion string) []peerRecoveryCandidateBuild {
	if limit <= 0 || len(candidates) <= limit {
		return candidates
	}
	home := strings.TrimSpace(preferredRegion)

	// externalKey yields the lowercased region of c and whether that region
	// counts as external (non-empty and different from the preferred one).
	externalKey := func(c peerRecoveryCandidateBuild) (string, bool) {
		name := strings.TrimSpace(c.Region)
		if name == "" {
			return "", false
		}
		if home != "" && strings.EqualFold(name, home) {
			return "", false
		}
		return strings.ToLower(name), true
	}

	distinct := map[string]struct{}{}
	for _, c := range candidates {
		if key, external := externalKey(c); external {
			distinct[key] = struct{}{}
		}
	}
	if len(distinct) < 2 {
		return candidates[:limit]
	}

	picked := make([]peerRecoveryCandidateBuild, 0, limit)
	pickedNodes := map[string]struct{}{}
	take := func(c peerRecoveryCandidateBuild) {
		picked = append(picked, c)
		pickedNodes[c.NodeID] = struct{}{}
	}
	taken := func(c peerRecoveryCandidateBuild) bool {
		_, seen := pickedNodes[c.NodeID]
		return seen
	}

	// Pass 1: one representative per external region.
	pickedRegions := map[string]struct{}{}
	for i := 0; i < len(candidates) && len(picked) < limit; i++ {
		key, external := externalKey(candidates[i])
		if !external {
			continue
		}
		if _, dup := pickedRegions[key]; dup {
			continue
		}
		take(candidates[i])
		pickedRegions[key] = struct{}{}
	}

	// Pass 2: guarantee at most one extra pick that offers public ingress.
	if len(picked) < limit && !selectedHasPublicIngress(picked) {
		for _, c := range candidates {
			if taken(c) || candidatePublicIngressCount(c) <= 0 {
				continue
			}
			take(c)
			break
		}
	}

	// Pass 3: fill the remaining slots in the original order.
	for _, c := range candidates {
		if len(picked) >= limit {
			break
		}
		if !taken(c) {
			take(c)
		}
	}

	if len(picked) > limit {
		picked = picked[:limit]
	}
	return picked
}
// selectedHasPublicIngress reports whether at least one of the given
// candidates has a positive public ingress count. An empty or nil slice
// yields false.
func selectedHasPublicIngress(candidates []peerRecoveryCandidateBuild) bool {
	for i := range candidates {
		if candidatePublicIngressCount(candidates[i]) >= 1 {
			return true
		}
	}
	return false
}
// candidatePublicIngressCount returns the PublicIngressCount recorded on the
// candidate build entry.
func candidatePublicIngressCount(candidate peerRecoveryCandidateBuild) int {
	count := candidate.PublicIngressCount
	return count
}
func peerRecoveryCandidateReason(mode string, entry PeerCacheEntry, connection PeerConnectionState, missingExternalRegions map[string]struct{}, preferredRegion string) (string, bool) {
if mode == PeerRecoveryModeSteady {
if connection.State == PeerConnectionReady || connection.State == PeerConnectionRelayReady {
return "maintain_ready", true
}
region := strings.ToLower(strings.TrimSpace(entry.BestRegion))
if region != "" && len(missingExternalRegions) > 0 {
if _, ok := missingExternalRegions[region]; ok {
if preferredRegion == "" || !strings.EqualFold(strings.TrimSpace(entry.BestRegion), preferredRegion) {
if connection.State == PeerConnectionDegraded {
return "recover_external_area", true
}
if entry.Warm || entry.RecoverySeed || connection.State == PeerConnectionDisconnected || connection.State == PeerConnectionConnecting {
return "recover_external_area", true
}
}
}
}
return "", false
}
if connection.State == PeerConnectionReady || connection.State == PeerConnectionRelayReady {
@@ -216,7 +361,7 @@ func peerRecoveryCandidateReason(mode string, entry PeerCacheEntry, connection P
return "recover_peer", true
}
func peerRecoveryCandidatePriority(entry PeerCacheEntry, connection PeerConnectionState, reason string) int {
func peerRecoveryCandidatePriority(entry PeerCacheEntry, connection PeerConnectionState, reason string, preferredRegion string) int {
score := 0
if entry.Warm {
score += 1000
@@ -237,6 +382,17 @@ func peerRecoveryCandidatePriority(entry PeerCacheEntry, connection PeerConnecti
if entry.BestCandidateID != "" {
score += 150
}
if entry.PublicIngressCount > 0 {
score += entry.PublicIngressCount * 90
}
preferredRegion = strings.TrimSpace(preferredRegion)
entryRegion := strings.TrimSpace(entry.BestRegion)
switch {
case preferredRegion != "" && entryRegion != "" && !strings.EqualFold(entryRegion, preferredRegion):
score += 275
case preferredRegion != "" && entryRegion != "" && strings.EqualFold(entryRegion, preferredRegion):
score += 25
}
score += entry.BestCandidateScore / 10
switch connection.State {
case PeerConnectionReady, PeerConnectionRelayReady:
@@ -251,6 +407,8 @@ func peerRecoveryCandidatePriority(entry PeerCacheEntry, connection PeerConnecti
switch reason {
case "maintain_ready":
score += 500
case "recover_external_area":
score += 450
case "recover_degraded":
score += 300
case "recover_seed":
@@ -82,7 +82,7 @@ func TestPeerRecoveryPlanTreatsRelayReadyPeersAsRecoveryGap(t *testing.T) {
RendezvousLeaseID: "lease-1",
RelayNodeID: "node-r",
RelayEndpoint: "quic://relay:19443",
RelayControl: true,
RelayQUIC: true,
},
},
},
@@ -121,6 +121,129 @@ func TestPeerRecoveryPlanCapsTargetByConnectablePeers(t *testing.T) {
}
}
func TestPeerRecoveryPlanPrefersExternalRegionsWhenTrimmingReadyPeers(t *testing.T) {
now := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-home-a", Endpoint: "quic://node-home-a:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-home-b", Endpoint: "quic://node-home-b:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-usa", Endpoint: "quic://node-usa:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "usa"},
{NodeID: "node-ifcm", Endpoint: "quic://node-ifcm:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "ifcm"},
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-home-a", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-home-b", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-usa", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-ifcm", State: PeerConnectionReady, LastLatencyMs: 20},
}},
PreferredRegion: "home",
Now: now,
})
if len(plan.Candidates) != DefaultStablePeerTarget {
t.Fatalf("candidate count = %d, want %d", len(plan.Candidates), DefaultStablePeerTarget)
}
if !recoveryPlanHasCandidate(plan, "node-usa", "maintain_ready") || !recoveryPlanHasCandidate(plan, "node-ifcm", "maintain_ready") {
t.Fatalf("expected external-region peers to be retained: %+v", plan.Candidates)
}
}
func TestPeerRecoveryPlanPrefersPublicIngressAtSameRegion(t *testing.T) {
now := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-home-private-a", Endpoint: "quic://10.0.0.2:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-home-private-b", Endpoint: "quic://10.0.0.3:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-home-public", Endpoint: "quic://94.141.118.222:19199", Warm: true, WarmReason: "route_adjacent", BestRegion: "home", PublicIngressCount: 1},
{NodeID: "node-usa", Endpoint: "quic://195.123.240.88:19131", Warm: true, WarmReason: "route_adjacent", BestRegion: "usa", PublicIngressCount: 1},
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-home-private-a", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-home-private-b", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-home-public", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-usa", State: PeerConnectionReady, LastLatencyMs: 20},
}},
PreferredRegion: "home",
Now: now,
})
if len(plan.Candidates) != DefaultStablePeerTarget {
t.Fatalf("candidate count = %d, want %d", len(plan.Candidates), DefaultStablePeerTarget)
}
if !recoveryPlanHasCandidate(plan, "node-home-public", "maintain_ready") {
t.Fatalf("expected public-ingress home peer to be retained: %+v", plan.Candidates)
}
}
func TestPeerRecoveryPlanRetainsDistinctExternalRegionsWhenAvailable(t *testing.T) {
now := time.Date(2026, 5, 19, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-home-a", Endpoint: "quic://node-home-a:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-home-b", Endpoint: "quic://node-home-b:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-home-c", Endpoint: "quic://node-home-c:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-usa-a", Endpoint: "quic://node-usa-a:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "usa", PublicIngressCount: 1},
{NodeID: "node-usa-b", Endpoint: "quic://node-usa-b:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "usa", PublicIngressCount: 1},
{NodeID: "node-ifcm", Endpoint: "quic://node-ifcm:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "ifcm", PublicIngressCount: 1},
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-home-a", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-home-b", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-home-c", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-usa-a", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-usa-b", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-ifcm", State: PeerConnectionReady, LastLatencyMs: 20},
}},
PreferredRegion: "home",
Now: now,
})
if len(plan.Candidates) != DefaultStablePeerTarget {
t.Fatalf("candidate count = %d, want %d", len(plan.Candidates), DefaultStablePeerTarget)
}
if !recoveryPlanHasCandidate(plan, "node-usa-a", "maintain_ready") && !recoveryPlanHasCandidate(plan, "node-usa-b", "maintain_ready") {
t.Fatalf("expected at least one usa candidate to be retained: %+v", plan.Candidates)
}
if !recoveryPlanHasCandidate(plan, "node-ifcm", "maintain_ready") {
t.Fatalf("expected ifcm candidate to be retained for area diversity: %+v", plan.Candidates)
}
}
func TestPeerRecoveryPlanSteadyModeAddsMissingExternalAreaCandidate(t *testing.T) {
now := time.Date(2026, 5, 19, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-test-a", Endpoint: "quic://node-test-a:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "test"},
{NodeID: "node-test-b", Endpoint: "quic://node-test-b:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "test"},
{NodeID: "node-usa", Endpoint: "quic://node-usa:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "usa", PublicIngressCount: 1},
{NodeID: "node-home", Endpoint: "quic://node-home:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home", PublicIngressCount: 1},
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-test-a", State: PeerConnectionReady, LastLatencyMs: 10},
{NodeID: "node-test-b", State: PeerConnectionReady, LastLatencyMs: 10},
{NodeID: "node-usa", State: PeerConnectionReady, LastLatencyMs: 10},
{NodeID: "node-home", State: PeerConnectionDegraded, LastLatencyMs: 20},
}},
PreferredRegion: "test",
Now: now,
})
if len(plan.Candidates) != DefaultStablePeerTarget {
t.Fatalf("candidate count = %d, want %d", len(plan.Candidates), DefaultStablePeerTarget)
}
if !recoveryPlanHasCandidate(plan, "node-home", "recover_external_area") {
t.Fatalf("expected missing external area candidate to be retained: %+v", plan.Candidates)
}
}
func recoveryPlanPeer(nodeID string, warm bool, recoverySeed bool, warmReason string) PeerCacheEntry {
return PeerCacheEntry{
NodeID: nodeID,
@@ -280,6 +280,9 @@ func (t *QUICProductionForwardTransport) sendProductionOnSession(ctx context.Con
return fabricproto.Frame{}, 0, ErrForwardPeerUnavailable
}
if err != nil {
if frame, ok := drainProductionResponseFrame(session, sequence); ok {
return frame, time.Since(started).Milliseconds(), nil
}
return fabricproto.Frame{}, 0, err
}
case frame, ok := <-session.Frames():
@@ -294,6 +297,25 @@ func (t *QUICProductionForwardTransport) sendProductionOnSession(ctx context.Con
}
}
// drainProductionResponseFrame performs a non-blocking scan of the frames
// currently available on the session and returns the first data frame on
// ProductionForwardQUICStreamID whose Sequence equals sequence.
//
// It returns (zero frame, false) when session is nil, when the frame channel
// is closed, or as soon as no further frame is immediately available.
// Frames received during the scan that do not match are not returned to the
// caller or put back.
func drainProductionResponseFrame(session FabricTransportSession, sequence uint64) (fabricproto.Frame, bool) {
	var none fabricproto.Frame
	if session == nil {
		return none, false
	}
	for {
		var (
			candidate fabricproto.Frame
			open      bool
		)
		select {
		case candidate, open = <-session.Frames():
		default:
			// Nothing buffered right now: stop instead of waiting.
			return none, false
		}
		if !open {
			return none, false
		}
		isData := candidate.Type == fabricproto.FrameData
		onStream := candidate.StreamID == ProductionForwardQUICStreamID
		if isData && onStream && candidate.Sequence == sequence {
			return candidate, true
		}
	}
}
func decodeQUICProductionForwardResponse(payload []byte) (ProductionForwardResult, error) {
var response quicProductionForwardResponse
if err := json.Unmarshal(payload, &response); err != nil {
@@ -283,12 +283,28 @@ func (r *FabricRegistry) ResolveService(req FabricRegistryResolveRequest) Fabric
return FabricRegistryResolvedService{Found: false, Reason: "service_required"}
}
scopeOrder := fabricRegistryScopeResolutionOrder(req.Scope, req.OrganizationID)
if resolved := r.resolveServiceFromRecords(req, service, scopeOrder, false); resolved.Found || resolved.Reason == "no_usable_endpoints" {
return resolved
}
if resolved := r.resolveServiceFromRecords(req, service, scopeOrder, true); resolved.Found || resolved.Reason == "no_usable_endpoints" {
return resolved
}
return FabricRegistryResolvedService{Found: false, Service: service, Reason: "no_active_record"}
}
func (r *FabricRegistry) resolveServiceFromRecords(req FabricRegistryResolveRequest, service string, scopeOrder []string, candidateOnly bool) FabricRegistryResolvedService {
for _, scope := range scopeOrder {
organizationID := strings.TrimSpace(req.OrganizationID)
if scope != FabricRegistryScopeOrganization {
organizationID = ""
}
record, ok := r.Active(req.ClusterID, service, scope, organizationID, req.Now)
var record FabricRegistryGossipRecord
var ok bool
if candidateOnly {
record, ok = r.Candidate(req.ClusterID, service, scope, organizationID, req.Now)
} else {
record, ok = r.Active(req.ClusterID, service, scope, organizationID, req.Now)
}
if !ok {
continue
}
@@ -306,9 +322,28 @@ func (r *FabricRegistry) ResolveService(req FabricRegistryResolveRequest) Fabric
RecordEpoch: record.Epoch,
RecordHash: hex.EncodeToString(sum[:]),
Endpoints: endpoints,
Reason: fabricRegistryResolveReason(candidateOnly),
}
}
return FabricRegistryResolvedService{Found: false, Service: service, Reason: "no_active_record"}
return FabricRegistryResolvedService{Found: false, Service: service}
}
// Candidate looks up the candidate entry stored under the key derived from
// clusterID, service, scope and organizationID.
//
// It returns the entry's record and true only when the entry exists, its
// State is FabricRegistryCandidate, and its record's ExpiresAt is strictly
// after registryNow(now). In every other case, including a nil receiver, it
// returns a zero record and false.
//
// NOTE(review): the candidates map is read here without any visible locking —
// confirm against how the other accessors guard it.
func (r *FabricRegistry) Candidate(clusterID, service, scope, organizationID string, now time.Time) (FabricRegistryGossipRecord, bool) {
	var none FabricRegistryGossipRecord
	if r == nil {
		return none, false
	}
	key := fabricRegistryKey(clusterID, service, scope, organizationID)
	entry, found := r.candidates[key]
	if !found {
		return none, false
	}
	if entry.State != FabricRegistryCandidate {
		return none, false
	}
	if !entry.Record.ExpiresAt.After(registryNow(now)) {
		return none, false
	}
	return entry.Record, true
}
// fabricRegistryResolveReason returns the reason string attached to a
// resolved service: "candidate_record_pending_live_verification" when the
// result came from a candidate record, and the empty string otherwise.
func fabricRegistryResolveReason(candidateOnly bool) string {
	reason := ""
	if candidateOnly {
		reason = "candidate_record_pending_live_verification"
	}
	return reason
}
func (r *FabricRegistry) Snapshot(now time.Time) FabricRegistrySnapshot {
@@ -507,7 +542,7 @@ func validateFabricRegistryGossipRecord(record FabricRegistryGossipRecord, polic
if strings.TrimSpace(endpoint.EndpointID) == "" || strings.TrimSpace(endpoint.Address) == "" || strings.TrimSpace(endpoint.Transport) == "" {
return fmt.Errorf("fabric registry gossip record contains invalid endpoint")
}
if !isQUICOnlyCandidateTransport(endpoint.Transport) || hasLegacyEndpointScheme(endpoint.Address) {
if !isQUICOnlyCandidateTransport(endpoint.Transport) || hasUnsupportedEndpointScheme(endpoint.Address) {
return fmt.Errorf("fabric registry gossip endpoint must be QUIC-only")
}
if len(endpoint.Metadata) > 0 && !json.Valid(endpoint.Metadata) {
@@ -605,7 +640,7 @@ func selectFabricRegistryEndpoints(endpoints []FabricRegistryEndpoint, preferred
preferredRegion = strings.TrimSpace(preferredRegion)
out := make([]FabricRegistryEndpoint, 0, len(endpoints))
for _, endpoint := range endpoints {
if strings.TrimSpace(endpoint.Address) == "" || !isQUICOnlyCandidateTransport(endpoint.Transport) || hasLegacyEndpointScheme(endpoint.Address) {
if strings.TrimSpace(endpoint.Address) == "" || !isQUICOnlyCandidateTransport(endpoint.Transport) || hasUnsupportedEndpointScheme(endpoint.Address) {
continue
}
out = append(out, endpoint)
@@ -636,16 +671,10 @@ func probeFabricRegistryEndpoint(ctx context.Context, transport FabricTransport,
if timeout <= 0 {
timeout = 2 * time.Second
}
target := FabricTransportTarget{
EndpointID: endpoint.EndpointID,
PeerID: endpoint.EndpointID,
Endpoint: endpoint.Address,
Transport: endpoint.Transport,
PeerCertSHA256: endpoint.PeerCertSHA256,
Timeout: timeout,
InboundBuffer: 2,
ErrorBuffer: 2,
}
target := FabricTransportTargetFromRegistryEndpoint(endpoint)
target.Timeout = timeout
target.InboundBuffer = 2
target.ErrorBuffer = 2
startedAt := time.Now()
session, err := transport.Connect(ctx, target)
if err != nil {
@@ -45,7 +45,7 @@ func TestFabricRegistryGossipRecordRequiresTrustedSignature(t *testing.T) {
}
}
func TestFabricRegistryRejectsLegacyEndpointAndExpiredRecord(t *testing.T) {
func TestFabricRegistryRejectsDisallowedEndpointAndExpiredRecord(t *testing.T) {
now := time.Date(2026, 5, 18, 10, 0, 0, 0, time.UTC)
publicKey, privateKey, err := ed25519.GenerateKey(nil)
if err != nil {
@@ -65,7 +65,7 @@ func TestFabricRegistryRejectsLegacyEndpointAndExpiredRecord(t *testing.T) {
},
Now: now,
}); err == nil {
t.Fatal("legacy HTTP endpoint was accepted")
t.Fatal("compat HTTP endpoint was accepted")
}
expired := testFabricRegistryGossipRecord(now.Add(-2*time.Hour), 11)
expired.ExpiresAt = now.Add(-time.Minute)
@@ -523,7 +523,7 @@ func (s *RemoteWorkspaceFrameProbeSink) AcceptRemoteWorkspaceFrameBatchProbe(_ c
AckedFrames: acceptedFrames,
Backpressure: false,
DropPolicy: "drop_droppable_overflow_ack_accepted",
DeliverySequence: s.sequence,
DeliverySequence: uint64(s.sequence),
DeliveredAt: now.Format(time.RFC3339Nano),
}
s.last = receipt
@@ -695,6 +695,24 @@ func isValidRemoteWorkspaceAdapterSessionID(adapterSessionID string) bool {
return true
}
// isValidRemoteWorkspaceAdapterMailboxConsumerID reports whether consumerID,
// after trimming surrounding whitespace, is between 1 and 128 bytes long and
// consists solely of ASCII letters, ASCII digits, and the characters
// '-', '_', '.' and ':'.
func isValidRemoteWorkspaceAdapterMailboxConsumerID(consumerID string) bool {
	id := strings.TrimSpace(consumerID)
	if n := len(id); n == 0 || n > 128 {
		return false
	}
	for _, r := range id {
		lower := 'a' <= r && r <= 'z'
		upper := 'A' <= r && r <= 'Z'
		digit := '0' <= r && r <= '9'
		punct := r == '-' || r == '_' || r == '.' || r == ':'
		if !(lower || upper || digit || punct) {
			return false
		}
	}
	return true
}
func actionToAdapterSessionState(action string) string {
switch action {
case "expire":
@@ -106,7 +106,7 @@ func (cfg ScopedSyntheticConfig) Validate(local PeerIdentity) error {
if strings.TrimSpace(nodeID) == "" || strings.TrimSpace(endpoint) == "" {
return fmt.Errorf("scoped synthetic mesh config contains empty peer endpoint")
}
if hasLegacyEndpointScheme(endpoint) {
if hasUnsupportedEndpointScheme(endpoint) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC peer endpoint")
}
}
@@ -124,7 +124,7 @@ func (cfg ScopedSyntheticConfig) Validate(local PeerIdentity) error {
strings.TrimSpace(candidate.ConnectivityMode) == "" {
return fmt.Errorf("scoped synthetic mesh config contains invalid peer endpoint candidate")
}
if !isQUICOnlyCandidateTransport(candidate.Transport) || hasLegacyEndpointScheme(candidate.Address) {
if !isQUICOnlyCandidateTransport(candidate.Transport) || hasUnsupportedEndpointScheme(candidate.Address) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC peer endpoint candidate")
}
}
@@ -185,12 +185,12 @@ func validatePeerDirectory(entries []PeerDirectoryEntry, localNodeID string) err
return nil
}
func hasLegacyEndpointScheme(endpoint string) bool {
func hasUnsupportedEndpointScheme(endpoint string) bool {
endpoint = strings.ToLower(strings.TrimSpace(endpoint))
return strings.HasPrefix(endpoint, "http://") ||
strings.HasPrefix(endpoint, "https://") ||
strings.HasPrefix(endpoint, "ws://") ||
strings.HasPrefix(endpoint, "wss://")
if endpoint == "" || !strings.Contains(endpoint, "://") {
return false
}
return !strings.HasPrefix(endpoint, "quic://")
}
func validateRecoverySeeds(seeds []PeerRecoverySeed) error {
@@ -205,7 +205,7 @@ func validateRecoverySeeds(seeds []PeerRecoverySeed) error {
strings.TrimSpace(seed.Transport) == "" {
return fmt.Errorf("scoped synthetic mesh config contains invalid recovery seed")
}
if !isQUICOnlyCandidateTransport(seed.Transport) || hasLegacyEndpointScheme(seed.Endpoint) {
if !isQUICOnlyCandidateTransport(seed.Transport) || hasUnsupportedEndpointScheme(seed.Endpoint) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC recovery seed")
}
if _, duplicate := seen[key]; duplicate {
@@ -241,7 +241,7 @@ func validateRendezvousLeases(leases []PeerRendezvousLease, routes []SyntheticRo
(len(lease.Metadata) > 0 && !json.Valid(lease.Metadata)) {
return fmt.Errorf("scoped synthetic mesh config contains invalid rendezvous lease")
}
if !isQUICOnlyCandidateTransport(lease.Transport) || hasLegacyEndpointScheme(lease.RelayEndpoint) {
if !isQUICOnlyCandidateTransport(lease.Transport) || hasUnsupportedEndpointScheme(lease.RelayEndpoint) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC rendezvous lease")
}
if _, duplicate := seen[lease.LeaseID]; duplicate {
@@ -174,7 +174,7 @@ func TestLoadScopedSyntheticConfigRejectsInvalidPeerEndpointCandidate(t *testing
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpoint(t *testing.T) {
func TestLoadScopedSyntheticConfigRejectsDisallowedPeerEndpoint(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
@@ -189,7 +189,7 @@ func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpoint(t *testing.T) {
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpointCandidateTransport(t *testing.T) {
func TestLoadScopedSyntheticConfigRejectsDisallowedPeerEndpointCandidateTransport(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
@@ -215,7 +215,7 @@ func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpointCandidateTransport(t
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpointCandidateScheme(t *testing.T) {
func TestLoadScopedSyntheticConfigRejectsDisallowedPeerEndpointCandidateScheme(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
@@ -295,7 +295,7 @@ func TestLoadScopedSyntheticConfigRejectsInvalidRecoverySeed(t *testing.T) {
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyRecoverySeed(t *testing.T) {
func TestLoadScopedSyntheticConfigRejectsDisallowedRecoverySeed(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
@@ -337,7 +337,7 @@ func TestLoadScopedSyntheticConfigRejectsInvalidRendezvousLease(t *testing.T) {
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyRendezvousLease(t *testing.T) {
func TestLoadScopedSyntheticConfigRejectsDisallowedRendezvousLease(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17z12.synthetic.v1",
ClusterID: "cluster-1",
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -12,6 +12,21 @@ import (
type VPNPacketBatchPayload struct {
SchemaVersion string `json:"schema_version"`
VPNConnectionID string `json:"vpn_connection_id"`
TunnelID string `json:"tunnel_id,omitempty"`
PoolID string `json:"pool_id,omitempty"`
ServiceID string `json:"service_id,omitempty"`
LocalServiceID string `json:"local_service_id,omitempty"`
RemoteServiceID string `json:"remote_service_id,omitempty"`
ServiceKind string `json:"service_kind,omitempty"`
ServiceClass string `json:"service_class,omitempty"`
ServiceRole string `json:"service_role,omitempty"`
RouteLeaseID string `json:"route_lease_id,omitempty"`
RouteGeneration string `json:"route_generation,omitempty"`
DataPlane string `json:"data_plane,omitempty"`
TransportOwner string `json:"transport_owner,omitempty"`
RouteVisibility string `json:"route_visibility,omitempty"`
TrafficClasses []string `json:"traffic_classes,omitempty"`
StreamShards int `json:"stream_shards,omitempty"`
Direction string `json:"direction"`
Packets [][]byte `json:"packets"`
SentAt time.Time `json:"sent_at"`
@@ -70,7 +70,7 @@ func (s StubSupervisor) applyOne(workload client.DesiredWorkload) client.Workloa
StatusPayload: payload,
}
}
if serviceType == "core-mesh" || serviceType == "mesh-listener" {
if serviceType == "core-mesh" || serviceType == "fabric-listener" {
payload["reason"] = "builtin_node_agent_service_ready"
payload["execution_mode"] = "builtin"
payload["traffic"] = serviceTrafficMode(serviceType)
@@ -143,7 +143,7 @@ func (s StubSupervisor) applyOne(workload client.DesiredWorkload) client.Workloa
StatusPayload: payload,
}
}
if (serviceType == "vpn-exit" || serviceType == "ipv4-egress" || serviceType == "vpn-client") && runtimeMode == "native" {
if (serviceType == "vpn-exit" || serviceType == "ipv4-egress" || serviceType == "vpn-client" || serviceType == "ipv4-ingress") && runtimeMode == "native" {
for key, value := range vpnFabricOnlyContract(serviceType, workload.Config) {
payload[key] = value
}
@@ -151,7 +151,7 @@ func (s StubSupervisor) applyOne(workload client.DesiredWorkload) client.Workloa
payload["fabric_transport"] = "quic_only"
payload["fabric_service_channel_required"] = true
payload["backend_relay_fallback"] = false
payload["legacy_protocol_compatibility"] = false
payload["compat_protocol_compatibility"] = false
payload["traffic"] = "fabric_service_channel_only"
return client.WorkloadStatusRequest{
ReportedState: "running",
@@ -202,8 +202,8 @@ func (s StubSupervisor) applyOne(workload client.DesiredWorkload) client.Workloa
}
func vpnFabricOnlyContract(serviceType string, config map[string]any) map[string]any {
role := "vpn-client"
reason := "vpn_client_node_contract_ready"
role := "ipv4-ingress"
reason := "ipv4_ingress_node_contract_ready"
serviceClass := "vpn_packets"
internetEgress := false
if serviceType == "vpn-exit" || serviceType == "ipv4-egress" {
@@ -222,7 +222,12 @@ func vpnFabricOnlyContract(serviceType string, config map[string]any) map[string
"allowed_cidrs": stringSliceConfig(config, "allowed_cidrs"),
"dns_servers": stringSliceConfig(config, "dns_servers"),
"client_policy_source": stringConfig(config, "client_policy_source", "fabric_access_policy"),
"android_node_supported": serviceType == "vpn-client",
"legacy_role_alias": "vpn-client",
"node_core": "same_fabric_core_all_platforms",
"platform_adapter_scope": "local_packet_io_only",
"android_node_supported": serviceType == "vpn-client" || serviceType == "ipv4-ingress",
"linux_node_supported": serviceType == "vpn-client" || serviceType == "ipv4-ingress",
"windows_node_supported": serviceType == "vpn-client" || serviceType == "ipv4-ingress",
"ipv4_exit_supported": internetEgress,
"fabric_service_channel_required": true,
"packet_runtime_status": "fabric_channel_binding_pending_runtime",
@@ -237,7 +242,7 @@ func vpnServiceBindingContract(serviceType string, config map[string]any) map[st
"type": "ipv4_egress",
"accepts_service_class": "vpn_packets",
"accepts_from_fabric_only": true,
"legacy_protocol_listener": false,
"compat_protocol_listener": false,
"exit_pool_id": stringConfig(config, "pool_id", ""),
"region": stringConfig(config, "region", ""),
"allowed_cidrs": stringSliceConfig(config, "allowed_cidrs"),
@@ -248,7 +253,7 @@ func vpnServiceBindingContract(serviceType string, config map[string]any) map[st
}
return map[string]any{
"type": "local_ipv4_ingress",
"accepts_from": []string{"android_vpnservice_tun", "linux_tun", "host_service_port"},
"accepts_from": []string{"android_vpnservice_tun", "linux_tun", "windows_wintun", "host_service_port"},
"service_class": "vpn_packets",
"exit_selection": "pool",
"preferred_exit_pool_id": stringConfig(config, "exit_pool_id", ""),
@@ -256,8 +261,10 @@ func vpnServiceBindingContract(serviceType string, config map[string]any) map[st
"listen_udp_ports": intSliceConfig(config, "listen_udp_ports"),
"tun_required": true,
"route_authority": "fabric_farm",
"legacy_protocol_listener": false,
"compat_protocol_listener": false,
"requires_fabric_node_runtime": true,
"traffic_visibility": "opaque_ipv4_packets",
"flow_distribution": "opaque_packet_hash_shards",
}
}
@@ -266,12 +273,10 @@ func webIngressListenerConfig(serviceType string, config map[string]any) webingr
RuntimeConfig: webingress.RuntimeConfig{
ServiceType: serviceType,
Scope: stringConfig(config, "scope", ""),
ServiceClasses: stringSliceConfig(config, "service_classes"),
ServiceClasses: webIngressServiceClasses(serviceType, config),
TLSMode: stringConfig(config, "tls_mode", "terminate"),
HTTPPort: intConfig(config, "listen_http_port", 80),
HTTPSPort: intConfig(config, "listen_https_port", 443),
},
HTTPAddr: stringConfig(config, "listen_http_addr", ":80"),
HTTPSAddr: stringConfig(config, "listen_https_addr", ":443"),
TLSCertFile: stringConfig(config, "tls_cert_file", ""),
TLSKeyFile: stringConfig(config, "tls_key_file", ""),
@@ -279,17 +284,13 @@ func webIngressListenerConfig(serviceType string, config map[string]any) webingr
}
func (s StubSupervisor) webIngressContract(serviceType string, config map[string]any) map[string]any {
httpPort := intConfig(config, "listen_http_port", 80)
httpsPort := intConfig(config, "listen_https_port", 443)
tlsMode := strings.TrimSpace(stringConfig(config, "tls_mode", "terminate"))
serviceClasses := stringSliceConfig(config, "service_classes")
serviceClasses := webIngressServiceClasses(serviceType, config)
scope := strings.TrimSpace(stringConfig(config, "scope", ""))
realListenerRequested := boolConfig(config, "real_listener_enabled")
allowedClasses := webIngressAllowedServiceClasses(serviceType)
missing := []string{}
if httpPort != 80 {
missing = append(missing, "listen_http_port_must_be_80")
}
if httpsPort != 443 {
missing = append(missing, "listen_https_port_must_be_443")
}
@@ -315,14 +316,13 @@ func (s StubSupervisor) webIngressContract(serviceType string, config map[string
"authority_service": false,
"fabric_transport": "quic_only",
"http_between_fabric_nodes": false,
"listen_http_port": httpPort,
"listen_https_port": httpsPort,
"tls_mode": tlsMode,
"scope": scope,
"service_classes": serviceClasses,
"allowed_service_classes": allowedClasses,
"fabric_service_channel_required": true,
"runtime_roles_required": webIngressRuntimeRoles(serviceClasses),
"runtime_fabric_functions": webIngressFabricFunctions(serviceType, serviceClasses),
"payload_forwarding": "contract_only",
"real_listener_requested": realListenerRequested,
"real_listener_runtime_enabled": s.WebIngressRuntimeEnabled,
@@ -346,26 +346,41 @@ func (s StubSupervisor) webIngressContract(serviceType string, config map[string
func webIngressAllowedServiceClasses(serviceType string) []string {
if serviceType == "admin-ingress" {
return []string{"platform_admin", "cluster_admin"}
return []string{"admin-ingress"}
}
return []string{"organization_portal", "user_portal"}
return []string{"public-ingress"}
}
func webIngressRuntimeRoles(serviceClasses []string) []string {
roles := []string{}
for _, serviceClass := range serviceClasses {
func webIngressServiceClasses(serviceType string, config map[string]any) []string {
raw := stringSliceConfig(config, "service_classes")
if len(raw) == 0 {
return webIngressAllowedServiceClasses(serviceType)
}
out := []string{}
for _, serviceClass := range raw {
serviceClass = strings.TrimSpace(serviceClass)
switch serviceClass {
case "platform_admin":
roles = append(roles, "global-admin-runtime", "identity-runtime", "policy-authority", "audit-sink")
case "cluster_admin":
roles = append(roles, "cluster-admin-runtime", "identity-runtime", "policy-authority", "audit-sink")
case "organization_portal":
roles = append(roles, "organization-portal-runtime", "identity-runtime", "policy-authority", "audit-sink")
case "user_portal":
roles = append(roles, "user-portal-runtime", "identity-runtime", "policy-authority", "audit-sink")
case "admin-ingress", "public-ingress":
out = append(out, serviceClass)
}
}
return dedupeStrings(roles)
if len(out) == 0 {
return webIngressAllowedServiceClasses(serviceType)
}
return dedupeStrings(out)
}
// webIngressFabricFunctions lists the fabric functions reported for a web
// ingress workload. The service type always comes first; every service class
// equal to "admin-ingress" or "public-ingress" is appended after it, and any
// other class is ignored. The collected list is passed through dedupeStrings
// before being returned.
func webIngressFabricFunctions(serviceType string, serviceClasses []string) []string {
	collected := []string{serviceType}
	for _, class := range serviceClasses {
		// Only the two ingress classes map to a fabric function, and each
		// maps to a function with the same name as the class.
		if class == "admin-ingress" || class == "public-ingress" {
			collected = append(collected, class)
		}
	}
	return dedupeStrings(collected)
}
func boolConfig(values map[string]any, key string) bool {
@@ -732,7 +747,7 @@ func serviceTrafficMode(serviceType string) string {
switch serviceType {
case "core-mesh":
return "fabric_control"
case "mesh-listener":
case "fabric-listener":
return "entry_listener"
default:
return "unknown"
@@ -2,7 +2,16 @@ package supervisor
import (
"context"
"crypto/rand"
"crypto/rsa"
"crypto/x509"
"crypto/x509/pkix"
"encoding/pem"
"math/big"
"os"
"path/filepath"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/webingress"
@@ -56,7 +65,7 @@ func TestStubSupervisorRunsInternalSyntheticEchoWorkload(t *testing.T) {
func TestStubSupervisorReportsBuiltinFabricServicesRunning(t *testing.T) {
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
{ServiceType: "core-mesh", DesiredState: "enabled", RuntimeMode: "container"},
{ServiceType: "mesh-listener", DesiredState: "enabled", RuntimeMode: "container"},
{ServiceType: "fabric-listener", DesiredState: "enabled", RuntimeMode: "container"},
})
if err != nil {
t.Fatalf("apply desired workload: %v", err)
@@ -88,7 +97,7 @@ func TestStubSupervisorReportsVPNFabricOnlyContractsRunning(t *testing.T) {
},
},
{
ServiceType: "vpn-client",
ServiceType: "ipv4-ingress",
DesiredState: "enabled",
RuntimeMode: "native",
Config: map[string]any{
@@ -117,14 +126,18 @@ func TestStubSupervisorReportsVPNFabricOnlyContractsRunning(t *testing.T) {
if status.StatusPayload["backend_relay_fallback"] != false {
t.Fatalf("backend_relay_fallback = %v", status.StatusPayload["backend_relay_fallback"])
}
if status.StatusPayload["legacy_protocol_compatibility"] != false {
t.Fatalf("legacy_protocol_compatibility = %v", status.StatusPayload["legacy_protocol_compatibility"])
if status.StatusPayload["compat_protocol_compatibility"] != false {
t.Fatalf("compat_protocol_compatibility = %v", status.StatusPayload["compat_protocol_compatibility"])
}
}
if statuses[0].StatusPayload["role"] != "ipv4-egress" || statuses[0].StatusPayload["internet_egress"] != true {
t.Fatalf("ipv4 egress payload = %#v", statuses[0].StatusPayload)
}
if statuses[1].StatusPayload["role"] != "vpn-client" || statuses[1].StatusPayload["android_node_supported"] != true {
if statuses[1].StatusPayload["role"] != "ipv4-ingress" ||
statuses[1].StatusPayload["legacy_role_alias"] != "vpn-client" ||
statuses[1].StatusPayload["android_node_supported"] != true ||
statuses[1].StatusPayload["linux_node_supported"] != true ||
statuses[1].StatusPayload["windows_node_supported"] != true {
t.Fatalf("vpn client payload = %#v", statuses[1].StatusPayload)
}
exitBinding := statuses[0].StatusPayload["service_binding"].(map[string]any)
@@ -132,9 +145,12 @@ func TestStubSupervisorReportsVPNFabricOnlyContractsRunning(t *testing.T) {
t.Fatalf("ipv4 egress binding = %#v", exitBinding)
}
clientBinding := statuses[1].StatusPayload["service_binding"].(map[string]any)
if clientBinding["type"] != "local_ipv4_ingress" || clientBinding["preferred_exit_pool_id"] != "us-los-angeles-ipv4" || clientBinding["legacy_protocol_listener"] != false {
if clientBinding["type"] != "local_ipv4_ingress" || clientBinding["preferred_exit_pool_id"] != "us-los-angeles-ipv4" || clientBinding["compat_protocol_listener"] != false {
t.Fatalf("vpn client binding = %#v", clientBinding)
}
if clientBinding["traffic_visibility"] != "opaque_ipv4_packets" || clientBinding["flow_distribution"] != "opaque_packet_hash_shards" {
t.Fatalf("ipv4 ingress binding should be opaque: %#v", clientBinding)
}
if got := clientBinding["listen_tcp_ports"].([]int); len(got) != 2 || got[0] != 443 || got[1] != 8443 {
t.Fatalf("listen_tcp_ports = %#v", got)
}
@@ -150,11 +166,10 @@ func TestStubSupervisorReportsWebIngressContractReady(t *testing.T) {
DesiredState: "enabled",
RuntimeMode: "native",
Config: map[string]any{
"listen_http_port": 80,
"listen_https_port": 443,
"tls_mode": "terminate",
"scope": "platform",
"service_classes": []any{"platform_admin", "cluster_admin"},
"service_classes": []any{"admin-ingress", "admin-ingress"},
},
},
})
@@ -175,9 +190,9 @@ func TestStubSupervisorReportsWebIngressContractReady(t *testing.T) {
payload["ports_opened_by_stub"] != false {
t.Fatalf("unexpected payload: %#v", payload)
}
roles, ok := payload["runtime_roles_required"].([]string)
if !ok || !containsString(roles, "global-admin-runtime") || !containsString(roles, "policy-authority") {
t.Fatalf("runtime roles = %#v", payload["runtime_roles_required"])
functions, ok := payload["runtime_fabric_functions"].([]string)
if !ok || !containsString(functions, "admin-ingress") {
t.Fatalf("runtime fabric functions = %#v", payload["runtime_fabric_functions"])
}
}
@@ -188,11 +203,10 @@ func TestStubSupervisorBlocksWebIngressRealListenerWithoutRuntimeGate(t *testing
DesiredState: "enabled",
RuntimeMode: "native",
Config: map[string]any{
"listen_http_port": 80,
"listen_https_port": 443,
"tls_mode": "terminate",
"scope": "platform",
"service_classes": []any{"platform_admin"},
"service_classes": []any{"admin-ingress"},
"real_listener_enabled": true,
},
},
@@ -220,11 +234,10 @@ func TestStubSupervisorAllowsWebIngressRealListenerGateButDoesNotOpenPorts(t *te
DesiredState: "enabled",
RuntimeMode: "native",
Config: map[string]any{
"listen_http_port": 80,
"listen_https_port": 443,
"tls_mode": "terminate",
"scope": "platform",
"service_classes": []any{"platform_admin"},
"service_classes": []any{"admin-ingress"},
"real_listener_enabled": true,
},
},
@@ -245,6 +258,8 @@ func TestStubSupervisorAllowsWebIngressRealListenerGateButDoesNotOpenPorts(t *te
}
func TestStubSupervisorStartsWebIngressManagerWhenRealListenerAllowed(t *testing.T) {
dir := t.TempDir()
certFile, keyFile := writeSelfSignedCert(t, dir)
manager := webingress.NewManager()
statuses, err := (StubSupervisor{Version: "test", WebIngressRuntimeEnabled: true, WebIngressManager: manager}).Apply(context.Background(), []client.DesiredWorkload{
{
@@ -252,13 +267,13 @@ func TestStubSupervisorStartsWebIngressManagerWhenRealListenerAllowed(t *testing
DesiredState: "enabled",
RuntimeMode: "native",
Config: map[string]any{
"listen_http_port": 80,
"listen_https_port": 443,
"listen_http_addr": "127.0.0.1:0",
"listen_https_addr": "127.0.0.1:0",
"tls_mode": "terminate",
"tls_cert_file": certFile,
"tls_key_file": keyFile,
"scope": "platform",
"service_classes": []any{"platform_admin"},
"service_classes": []any{"admin-ingress"},
"real_listener_enabled": true,
},
},
@@ -266,7 +281,7 @@ func TestStubSupervisorStartsWebIngressManagerWhenRealListenerAllowed(t *testing
if err != nil {
t.Fatalf("apply desired workload: %v", err)
}
if statuses[0].ReportedState != "degraded" {
if statuses[0].ReportedState != "running" {
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
}
payload := statuses[0].StatusPayload
@@ -274,15 +289,44 @@ func TestStubSupervisorStartsWebIngressManagerWhenRealListenerAllowed(t *testing
if !ok {
t.Fatalf("listener_status = %#v", payload["listener_status"])
}
if !listenerStatus.HTTPRunning || listenerStatus.HTTPSRunning || listenerStatus.HTTPAddr == "" {
if !listenerStatus.HTTPSRunning || listenerStatus.HTTPSAddr == "" {
t.Fatalf("listener status = %+v", listenerStatus)
}
if payload["reason"] != "web_ingress_listener_partial" || payload["ports_opened_by_runtime"] != true || payload["ports_opened_by_stub"] != false {
if payload["reason"] != "web_ingress_contract_ready" || payload["ports_opened_by_runtime"] != true || payload["ports_opened_by_stub"] != false {
t.Fatalf("payload = %#v", payload)
}
_ = manager.Stop(context.Background())
}
// writeSelfSignedCert creates a fresh 2048-bit RSA key and a self-signed
// certificate for "localhost", writes them PEM-encoded to cert.pem and key.pem
// inside dir (mode 0600), and returns the certificate path followed by the key
// path. Any failure aborts the calling test via t.Fatalf.
func writeSelfSignedCert(t *testing.T, dir string) (string, string) {
	t.Helper()

	privateKey, err := rsa.GenerateKey(rand.Reader, 2048)
	if err != nil {
		t.Fatalf("generate key: %v", err)
	}

	// Validity window spans one hour on either side of "now".
	certTemplate := x509.Certificate{
		SerialNumber: big.NewInt(1),
		Subject:      pkix.Name{CommonName: "localhost"},
		NotBefore:    time.Now().Add(-time.Hour),
		NotAfter:     time.Now().Add(time.Hour),
		KeyUsage:     x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature,
		DNSNames:     []string{"localhost"},
	}

	// Self-signed: the template is passed as both the certificate and its parent.
	certDER, err := x509.CreateCertificate(rand.Reader, &certTemplate, &certTemplate, &privateKey.PublicKey, privateKey)
	if err != nil {
		t.Fatalf("create cert: %v", err)
	}

	certPath := filepath.Join(dir, "cert.pem")
	keyPath := filepath.Join(dir, "key.pem")

	certPEM := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: certDER})
	if err := os.WriteFile(certPath, certPEM, 0o600); err != nil {
		t.Fatalf("write cert: %v", err)
	}

	keyPEM := pem.EncodeToMemory(&pem.Block{Type: "RSA PRIVATE KEY", Bytes: x509.MarshalPKCS1PrivateKey(privateKey)})
	if err := os.WriteFile(keyPath, keyPEM, 0o600); err != nil {
		t.Fatalf("write key: %v", err)
	}

	return certPath, keyPath
}
func TestStubSupervisorBlocksInvalidWebIngressContract(t *testing.T) {
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
{
@@ -290,10 +334,9 @@ func TestStubSupervisorBlocksInvalidWebIngressContract(t *testing.T) {
DesiredState: "enabled",
RuntimeMode: "native",
Config: map[string]any{
"listen_http_port": 8080,
"listen_https_port": 443,
"listen_https_port": 444,
"scope": "organization",
"service_classes": []any{"platform_admin"},
"service_classes": []any{"admin-ingress"},
},
},
})
@@ -308,7 +351,7 @@ func TestStubSupervisorBlocksInvalidWebIngressContract(t *testing.T) {
t.Fatalf("unexpected payload: %#v", payload)
}
missing, ok := payload["missing_checks"].([]string)
if !ok || !containsString(missing, "listen_http_port_must_be_80") || !containsString(missing, "service_class_not_allowed:platform_admin") {
if !ok || !containsString(missing, "listen_https_port_must_be_443") || !containsString(missing, "service_class_not_allowed:admin-ingress") {
t.Fatalf("missing checks = %#v", payload["missing_checks"])
}
}
@@ -3,6 +3,7 @@ package vpnruntime
import (
"context"
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"time"
@@ -12,10 +13,11 @@ import (
)
const (
fabricVPNPacketPayloadMagic uint32 = 0x52565042 // RVPB
fabricVPNPacketPayloadVersion uint8 = 1
fabricVPNPacketPayloadHeader = 24
fabricVPNPacketMaxPacketCount = 2048
fabricVPNPacketPayloadMagic uint32 = 0x52565042 // RVPB
fabricVPNPacketPayloadVersion uint8 = 2
fabricVPNPacketPayloadHeader = 24
fabricVPNPacketMaxPacketCount = 2048
fabricVPNPacketMaxMetadataBytes = 64 * 1024
fabricVPNPacketDirectionClientToGateway uint8 = 1
fabricVPNPacketDirectionGatewayToClient uint8 = 2
@@ -32,6 +34,7 @@ type FabricVPNPacketFrameInput struct {
VPNConnectionID string
Direction string
TrafficClass string
ServiceTunnel FabricServiceTunnel
Packets [][]byte
Now time.Time
}
@@ -60,6 +63,26 @@ func NewFabricVPNPacketDataFrame(input FabricVPNPacketFrameInput) (fabricproto.F
}, nil
}
// NewFabricVPNSessionHelloFrame builds a DATA frame for the given stream whose
// payload is encoded from input with no packets (nil is passed as the packet
// list to encodeFabricVPNPacketPayload).
//
// It returns an error wrapping ErrFabricVPNPacketFrameInvalid when the stream
// id is zero or when the VPN connection id or direction is empty, and passes
// through any error reported by the payload encoder.
func NewFabricVPNSessionHelloFrame(input FabricVPNPacketFrameInput) (fabricproto.Frame, error) {
	var none fabricproto.Frame

	switch {
	case input.StreamID == 0:
		return none, fmt.Errorf("%w: missing stream id", ErrFabricVPNPacketFrameInvalid)
	case input.VPNConnectionID == "" || input.Direction == "":
		return none, fmt.Errorf("%w: missing vpn identity", ErrFabricVPNPacketFrameInvalid)
	}

	encoded, err := encodeFabricVPNPacketPayload(input, nil)
	if err != nil {
		return none, err
	}

	// The traffic class is derived from the requested class alone; there are
	// no packets to inspect for a hello frame.
	hello := fabricproto.Frame{
		Type:         fabricproto.FrameData,
		TrafficClass: fabricFrameTrafficClass(input.TrafficClass, nil),
		StreamID:     input.StreamID,
		Sequence:     input.Sequence,
		Payload:      encoded,
	}
	return hello, nil
}
func DecodeFabricVPNPacketDataFrame(frame fabricproto.Frame) (mesh.VPNPacketBatchPayload, error) {
if frame.Type != fabricproto.FrameData || frame.StreamID == 0 {
return mesh.VPNPacketBatchPayload{}, fmt.Errorf("%w: expected DATA stream frame", ErrFabricVPNPacketFrameInvalid)
@@ -94,11 +117,19 @@ func encodeFabricVPNPacketPayload(input FabricVPNPacketFrameInput, packets [][]b
if len(vpnID) > 0xffff {
return nil, fmt.Errorf("%w: vpn connection id too long", ErrFabricVPNPacketPayload)
}
var metadata []byte
if len(packets) == 0 {
var err error
metadata, err = encodeFabricVPNPacketServiceMetadata(input)
if err != nil {
return nil, err
}
}
now := input.Now.UTC()
if now.IsZero() {
now = time.Now().UTC()
}
total := fabricVPNPacketPayloadHeader + len(vpnID)
total := fabricVPNPacketPayloadHeader + len(vpnID) + len(metadata)
for _, packet := range packets {
total += 4 + len(packet)
}
@@ -108,10 +139,13 @@ func encodeFabricVPNPacketPayload(input FabricVPNPacketFrameInput, packets [][]b
out[5] = directionCode
binary.BigEndian.PutUint16(out[6:8], uint16(len(packets)))
binary.BigEndian.PutUint16(out[8:10], uint16(len(vpnID)))
binary.BigEndian.PutUint16(out[10:12], uint16(len(metadata)))
binary.BigEndian.PutUint64(out[12:20], uint64(now.UnixNano()))
offset := fabricVPNPacketPayloadHeader
copy(out[offset:], vpnID)
offset += len(vpnID)
copy(out[offset:], metadata)
offset += len(metadata)
for _, packet := range packets {
binary.BigEndian.PutUint32(out[offset:offset+4], uint32(len(packet)))
offset += 4
@@ -128,7 +162,8 @@ func decodeFabricVPNPacketPayload(payload []byte) (mesh.VPNPacketBatchPayload, e
if binary.BigEndian.Uint32(payload[0:4]) != fabricVPNPacketPayloadMagic {
return mesh.VPNPacketBatchPayload{}, fmt.Errorf("%w: bad magic", ErrFabricVPNPacketPayload)
}
if payload[4] != fabricVPNPacketPayloadVersion {
version := payload[4]
if version != 1 && version != fabricVPNPacketPayloadVersion {
return mesh.VPNPacketBatchPayload{}, fmt.Errorf("%w: unsupported version %d", ErrFabricVPNPacketPayload, payload[4])
}
direction, err := fabricVPNPacketDirectionName(payload[5])
@@ -137,7 +172,11 @@ func decodeFabricVPNPacketPayload(payload []byte) (mesh.VPNPacketBatchPayload, e
}
packetCount := int(binary.BigEndian.Uint16(payload[6:8]))
vpnIDLength := int(binary.BigEndian.Uint16(payload[8:10]))
if packetCount <= 0 || packetCount > fabricVPNPacketMaxPacketCount {
metadataLength := 0
if version >= 2 {
metadataLength = int(binary.BigEndian.Uint16(payload[10:12]))
}
if packetCount < 0 || packetCount > fabricVPNPacketMaxPacketCount {
return mesh.VPNPacketBatchPayload{}, fmt.Errorf("%w: invalid packet count %d", ErrFabricVPNPacketPayload, packetCount)
}
offset := fabricVPNPacketPayloadHeader
@@ -149,6 +188,16 @@ func decodeFabricVPNPacketPayload(payload []byte) (mesh.VPNPacketBatchPayload, e
if vpnID == "" {
return mesh.VPNPacketBatchPayload{}, fmt.Errorf("%w: empty vpn id", ErrFabricVPNPacketPayload)
}
metadata := fabricVPNPacketServiceMetadata{}
if metadataLength > 0 {
if metadataLength > fabricVPNPacketMaxMetadataBytes || len(payload) < offset+metadataLength {
return mesh.VPNPacketBatchPayload{}, fmt.Errorf("%w: truncated service metadata", ErrFabricVPNPacketPayload)
}
if err := json.Unmarshal(payload[offset:offset+metadataLength], &metadata); err != nil {
return mesh.VPNPacketBatchPayload{}, fmt.Errorf("%w: invalid service metadata: %v", ErrFabricVPNPacketPayload, err)
}
offset += metadataLength
}
packets := make([][]byte, 0, packetCount)
for index := 0; index < packetCount; index++ {
if len(payload) < offset+4 {
@@ -169,12 +218,74 @@ func decodeFabricVPNPacketPayload(payload []byte) (mesh.VPNPacketBatchPayload, e
return mesh.VPNPacketBatchPayload{
SchemaVersion: "rap.vpn_packet_batch.fabric.v1",
VPNConnectionID: vpnID,
TunnelID: firstNonEmptyTunnelString(metadata.TunnelID, vpnID),
PoolID: metadata.PoolID,
ServiceID: metadata.ServiceID,
LocalServiceID: metadata.LocalServiceID,
RemoteServiceID: metadata.RemoteServiceID,
ServiceKind: metadata.ServiceKind,
ServiceClass: metadata.ServiceClass,
ServiceRole: metadata.ServiceRole,
RouteLeaseID: metadata.RouteLeaseID,
RouteGeneration: metadata.RouteGeneration,
DataPlane: metadata.DataPlane,
TransportOwner: metadata.TransportOwner,
RouteVisibility: metadata.RouteVisibility,
TrafficClasses: metadata.TrafficClasses,
StreamShards: metadata.StreamShards,
Direction: direction,
Packets: packets,
SentAt: sentAt,
}, nil
}
// fabricVPNPacketServiceMetadata is the JSON document carried in the metadata
// section of a fabric VPN packet payload. The encoder fills it from a
// normalized service tunnel and the decoder copies its fields into
// mesh.VPNPacketBatchPayload. Every field is tagged omitempty, so zero values
// are left out of the encoded JSON.
type fabricVPNPacketServiceMetadata struct {
TunnelID string `json:"tunnel_id,omitempty"`
PoolID string `json:"pool_id,omitempty"`
ServiceID string `json:"service_id,omitempty"`
LocalServiceID string `json:"local_service_id,omitempty"`
RemoteServiceID string `json:"remote_service_id,omitempty"`
ServiceKind string `json:"service_kind,omitempty"`
ServiceClass string `json:"service_class,omitempty"`
ServiceRole string `json:"service_role,omitempty"`
RouteLeaseID string `json:"route_lease_id,omitempty"`
RouteGeneration string `json:"route_generation,omitempty"`
DataPlane string `json:"data_plane,omitempty"`
TransportOwner string `json:"transport_owner,omitempty"`
RouteVisibility string `json:"route_visibility,omitempty"`
TrafficClasses []string `json:"traffic_classes,omitempty"`
StreamShards int `json:"stream_shards,omitempty"`
}
// encodeFabricVPNPacketServiceMetadata serializes the service tunnel attached
// to input as JSON for the metadata section of a fabric VPN packet payload.
//
// The tunnel is first passed through NormalizeServiceTunnel together with the
// VPN connection id; the tunnel id is then chosen by
// firstNonEmptyTunnelString(tunnel id, VPN connection id). The traffic class
// slice is copied so the encoded value does not alias the caller's slice.
//
// It returns the json.Marshal error unchanged, or an error wrapping
// ErrFabricVPNPacketPayload when the encoded document exceeds
// fabricVPNPacketMaxMetadataBytes or does not fit in 16 bits.
func encodeFabricVPNPacketServiceMetadata(input FabricVPNPacketFrameInput) ([]byte, error) {
	normalized := NormalizeServiceTunnel(input.ServiceTunnel, input.VPNConnectionID)
	classes := append([]string(nil), normalized.TrafficClasses...)

	encoded, err := json.Marshal(fabricVPNPacketServiceMetadata{
		TunnelID:        firstNonEmptyTunnelString(normalized.TunnelID, input.VPNConnectionID),
		PoolID:          normalized.PoolID,
		ServiceID:       normalized.ServiceID,
		LocalServiceID:  normalized.LocalServiceID,
		RemoteServiceID: normalized.RemoteServiceID,
		ServiceKind:     normalized.ServiceKind,
		ServiceClass:    normalized.ServiceClass,
		ServiceRole:     normalized.ServiceRole,
		RouteLeaseID:    normalized.RouteLeaseID,
		RouteGeneration: normalized.RouteGeneration,
		DataPlane:       normalized.DataPlane,
		TransportOwner:  normalized.TransportOwner,
		RouteVisibility: normalized.RouteVisibility,
		TrafficClasses:  classes,
		StreamShards:    normalized.StreamShards,
	})
	if err != nil {
		return nil, err
	}

	// Enforce both the configured cap and the 16-bit ceiling on the length.
	size := len(encoded)
	if size > fabricVPNPacketMaxMetadataBytes || size > 0xffff {
		return nil, fmt.Errorf("%w: service metadata too large", ErrFabricVPNPacketPayload)
	}
	return encoded, nil
}
func fabricVPNPacketDirectionCode(direction string) (uint8, error) {
switch direction {
case FabricDirectionClientToGateway:
@@ -201,6 +312,8 @@ func fabricFrameTrafficClass(trafficClass string, packets [][]byte) fabricproto.
switch normalizeFabricTrafficClass(trafficClass) {
case FabricTrafficClassControl:
return fabricproto.TrafficClassControl
case FabricTrafficClassDNS:
return fabricproto.TrafficClassReliable
case FabricTrafficClassInteractive:
return fabricproto.TrafficClassInteractive
case FabricTrafficClassReliable:
@@ -208,9 +321,6 @@ func fabricFrameTrafficClass(trafficClass string, packets [][]byte) fabricproto.
case FabricTrafficClassDroppable:
return fabricproto.TrafficClassDroppable
default:
if batchHasTCPControlPacket(packets) {
return fabricproto.TrafficClassInteractive
}
return fabricproto.TrafficClassBulk
}
}
@@ -14,11 +14,16 @@ type FabricSessionFrameWriter interface {
}
type FabricSessionPacketPeerRegistry struct {
mu sync.RWMutex
peers map[string]FabricSessionPacketPeer
mu sync.RWMutex
peers map[string]FabricSessionPacketPeer
changed chan struct{}
}
type FabricSessionPacketPeer struct {
TunnelID string
PoolID string
ServiceID string
ServiceTunnel FabricServiceTunnel
VPNConnectionID string
Sender FabricSessionFrameWriter
StreamID uint64
@@ -30,11 +35,17 @@ type FabricSessionPacketPeer struct {
// FabricSessionPacketPeerTransport groups a peer registry and a packet inbox
// with the tunnel, pool, service and VPN connection identifiers it is bound to.
// NOTE(review): how these fields are consumed is not visible in this excerpt;
// confirm against the type's methods before relying on this description.
type FabricSessionPacketPeerTransport struct {
Registry *FabricSessionPacketPeerRegistry
Inbox *FabricPacketInbox
TunnelID string
PoolID string
ServiceID string
VPNConnectionID string
// Presumably how long to wait for a peer to appear in Registry; the default
// is likely defaultFabricSessionPeerWaitTimeout — TODO confirm.
PeerWaitTimeout time.Duration
}
}
const defaultFabricSessionPeerWaitTimeout = 500 * time.Millisecond
func NewFabricSessionPacketPeerRegistry() *FabricSessionPacketPeerRegistry {
return &FabricSessionPacketPeerRegistry{peers: map[string]FabricSessionPacketPeer{}}
return &FabricSessionPacketPeerRegistry{peers: map[string]FabricSessionPacketPeer{}, changed: make(chan struct{})}
}
func (r *FabricSessionPacketPeerRegistry) RegisterFrame(ctx context.Context, sender FabricSessionFrameWriter, frame fabricproto.Frame) (bool, error) {
@@ -53,10 +64,33 @@ func (r *FabricSessionPacketPeerRegistry) RegisterFrame(ctx context.Context, sen
if r.peers == nil {
r.peers = map[string]FabricSessionPacketPeer{}
}
if r.changed == nil {
r.changed = make(chan struct{})
}
peer := r.peers[payload.VPNConnectionID]
if peer.RegisteredAt.IsZero() {
peer.RegisteredAt = now
}
peer.ServiceTunnel = NormalizeServiceTunnel(FabricServiceTunnel{
TunnelID: firstNonEmptyTunnelString(payload.TunnelID, payload.VPNConnectionID),
PoolID: payload.PoolID,
ServiceID: payload.ServiceID,
LocalServiceID: payload.LocalServiceID,
RemoteServiceID: payload.RemoteServiceID,
ServiceKind: payload.ServiceKind,
ServiceClass: payload.ServiceClass,
ServiceRole: payload.ServiceRole,
RouteLeaseID: payload.RouteLeaseID,
RouteGeneration: payload.RouteGeneration,
DataPlane: payload.DataPlane,
TransportOwner: payload.TransportOwner,
RouteVisibility: payload.RouteVisibility,
TrafficClasses: payload.TrafficClasses,
StreamShards: payload.StreamShards,
}, payload.VPNConnectionID)
peer.TunnelID = peer.ServiceTunnel.TunnelID
peer.PoolID = peer.ServiceTunnel.PoolID
peer.ServiceID = peer.ServiceTunnel.ServiceID
peer.VPNConnectionID = payload.VPNConnectionID
peer.Sender = sender
peer.StreamID = frame.StreamID
@@ -69,6 +103,7 @@ func (r *FabricSessionPacketPeerRegistry) RegisterFrame(ctx context.Context, sen
peer.StreamIDsByTrafficClass[trafficClass] = append(peer.StreamIDsByTrafficClass[trafficClass], frame.StreamID)
}
r.peers[payload.VPNConnectionID] = peer
r.signalLocked()
r.mu.Unlock()
return true, nil
}
@@ -84,25 +119,93 @@ func (r *FabricSessionPacketPeerRegistry) TransportFor(vpnConnectionID string, i
return nil
}
return &FabricSessionPacketTransport{
Sender: fabricSessionFrameWriterAdapter{writer: peer.Sender},
Inbox: inbox,
StreamID: peer.StreamID,
StreamIDsByTrafficClass: copyStreamIDsByClass(peer.StreamIDsByTrafficClass),
VPNConnectionID: vpnConnectionID,
SendDirection: FabricDirectionGatewayToClient,
ReceiveDirection: FabricDirectionClientToGateway,
Sender: fabricSessionFrameWriterAdapter{writer: peer.Sender},
Inbox: inbox,
StreamID: peer.StreamID,
ServiceTunnel: peer.ServiceTunnel,
TunnelID: vpnConnectionID,
PoolID: peer.PoolID,
ServiceID: peer.ServiceID,
VPNConnectionID: vpnConnectionID,
SendDirection: FabricDirectionGatewayToClient,
ReceiveDirection: FabricDirectionClientToGateway,
}
}
// WaitTransportFor returns a transport bound to the peer registered for
// vpnConnectionID, waiting up to timeout for a registration when no peer is
// available yet.
//
// It returns nil when no peer shows up before the timeout elapses or ctx is
// done. With a non-positive timeout it performs a single lookup and never
// blocks.
func (r *FabricSessionPacketPeerRegistry) WaitTransportFor(ctx context.Context, vpnConnectionID string, inbox *FabricPacketInbox, timeout time.Duration) PacketTransport {
	// Fast path: a peer is already registered, so neither a timer nor the
	// notification channel is needed. The explicit nil comparison also keeps
	// a nil lookup result from being wrapped into a non-nil interface value,
	// should TransportFor return a concrete pointer type.
	if transport := r.TransportFor(vpnConnectionID, inbox); transport != nil {
		return transport
	}
	if timeout <= 0 {
		return nil
	}
	timer := time.NewTimer(timeout)
	defer timer.Stop()
	for {
		// Capture the notification channel BEFORE re-checking the registry.
		// signalLocked closes the current channel and installs a fresh one,
		// so fetching the channel after the lookup could return the fresh
		// channel and miss a registration that happened in between, stalling
		// the caller for the whole timeout.
		changed := r.changedChannel()
		if transport := r.TransportFor(vpnConnectionID, inbox); transport != nil {
			return transport
		}
		select {
		case <-ctx.Done():
			return nil
		case <-timer.C:
			return nil
		case <-changed:
			// Registry changed; loop and look the peer up again.
		}
	}
}
// Forget drops the peer registered under vpnConnectionID and signals the
// registry's change channel. A nil registry or an empty id is a no-op.
func (r *FabricSessionPacketPeerRegistry) Forget(vpnConnectionID string) {
	if r == nil {
		return
	}
	if vpnConnectionID == "" {
		return
	}
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.changed == nil {
		r.changed = make(chan struct{})
	}
	delete(r.peers, vpnConnectionID)
	r.signalLocked()
}
// changedChannel returns the channel that signalLocked closes on the next
// registry change, creating it on first use. A nil registry yields a nil
// channel.
func (r *FabricSessionPacketPeerRegistry) changedChannel() <-chan struct{} {
	if r == nil {
		return nil
	}
	r.mu.Lock()
	current := r.changed
	if current == nil {
		current = make(chan struct{})
		r.changed = current
	}
	r.mu.Unlock()
	return current
}
// signalLocked closes the current change channel (if one exists) and
// installs a fresh one. Visible callers invoke it while holding r.mu.
func (r *FabricSessionPacketPeerRegistry) signalLocked() {
	if r == nil {
		return
	}
	if previous := r.changed; previous != nil {
		close(previous)
	}
	r.changed = make(chan struct{})
}
func (t *FabricSessionPacketPeerTransport) SendGatewayPacketBatch(ctx context.Context, packets [][]byte) error {
if t == nil || t.Registry == nil || t.Inbox == nil || t.VPNConnectionID == "" {
return mesh.ErrForwardRuntimeUnavailable
}
transport := t.Registry.TransportFor(t.VPNConnectionID, t.Inbox)
waitTimeout := t.PeerWaitTimeout
if waitTimeout <= 0 {
waitTimeout = defaultFabricSessionPeerWaitTimeout
}
transport := t.Registry.WaitTransportFor(ctx, t.VPNConnectionID, t.Inbox, waitTimeout)
if transport == nil {
return mesh.ErrForwardRuntimeUnavailable
}
return transport.SendGatewayPacketBatch(ctx, packets)
if err := transport.SendGatewayPacketBatch(ctx, packets); err != nil {
t.Registry.Forget(t.VPNConnectionID)
return err
}
return nil
}
func (t *FabricSessionPacketPeerTransport) ReceiveGatewayPacketBatch(ctx context.Context, timeout time.Duration) ([][]byte, error) {
@@ -126,9 +229,12 @@ func (t *FabricSessionPacketPeerTransport) Snapshot() map[string]any {
}
}
return map[string]any{
"transport": "fabric_session_peer_dynamic",
"vpn_connection_id": t.VPNConnectionID,
"peer_ready": ready == 1,
"transport": "fabric_session_peer_dynamic",
"tunnel_id": firstNonEmptyTunnelString(t.TunnelID, t.VPNConnectionID),
"pool_id": t.PoolID,
"service_id": t.ServiceID,
"vpn_connection_id_alias": t.VPNConnectionID,
"peer_ready": ready == 1,
}
}
@@ -142,8 +248,12 @@ func (r *FabricSessionPacketPeerRegistry) Snapshot() map[string]any {
items := make([]map[string]any, 0, len(r.peers))
for _, peer := range r.peers {
item := map[string]any{
"vpn_connection_id": peer.VPNConnectionID,
"stream_id": peer.StreamID,
"tunnel_id": firstNonEmptyTunnelString(peer.TunnelID, peer.VPNConnectionID),
"pool_id": peer.PoolID,
"service_id": peer.ServiceID,
"vpn_connection_id_alias": peer.VPNConnectionID,
"service_tunnel": peer.ServiceTunnel.Snapshot(),
"stream_id": peer.StreamID,
}
if !peer.RegisteredAt.IsZero() {
item["registered_at"] = peer.RegisteredAt.Format(time.RFC3339Nano)
@@ -31,6 +31,11 @@ type FabricSessionPacketTransport struct {
Inbox *FabricPacketInbox
StreamID uint64
ServiceStreams *FabricServiceStreamRegistry
ServiceTunnel FabricServiceTunnel
TunnelID string
PoolID string
ServiceID string
VPNConnectionID string
SendDirection string
ReceiveDirection string
@@ -39,6 +44,12 @@ type FabricSessionPacketTransport struct {
StreamIDsByTrafficClass map[string][]uint64
StreamIDs []uint64
routeMu sync.Mutex
routeLeaseID string
routeGeneration string
routeTransitionCount uint64
routeUpdatedAt time.Time
sequence uint64
sequenceMu sync.Mutex
sequenceByStream map[uint64]uint64
@@ -68,7 +79,12 @@ func (t *FabricSessionPacketTransport) SendGatewayPacketBatch(ctx context.Contex
if t == nil || t.Sender == nil {
return mesh.ErrForwardRuntimeUnavailable
}
if !t.hasSendStream() || t.VPNConnectionID == "" {
t.normalizeServiceTunnel()
packetTunnelID := t.packetTunnelID()
if t.VPNConnectionID == "" {
t.VPNConnectionID = packetTunnelID
}
if !t.hasSendStream() || packetTunnelID == "" {
return errors.New("fabric session packet transport identity is incomplete")
}
direction := t.SendDirection
@@ -77,12 +93,14 @@ func (t *FabricSessionPacketTransport) SendGatewayPacketBatch(ctx context.Contex
}
groups := t.groupPacketsByStream(packets)
for _, group := range groups {
t.registerServiceStream(group.StreamID, group.TrafficClass, direction)
frame, err := NewFabricVPNPacketDataFrame(FabricVPNPacketFrameInput{
StreamID: group.StreamID,
Sequence: t.nextSequence(group.StreamID),
VPNConnectionID: t.VPNConnectionID,
VPNConnectionID: packetTunnelID,
Direction: direction,
TrafficClass: group.TrafficClass,
ServiceTunnel: t.ServiceTunnel,
Packets: group.Packets,
})
if err != nil {
@@ -101,15 +119,17 @@ func (t *FabricSessionPacketTransport) ReceiveGatewayPacketBatch(ctx context.Con
if t == nil || t.Inbox == nil {
return nil, mesh.ErrForwardRuntimeUnavailable
}
t.normalizeServiceTunnel()
packetTunnelID := t.packetTunnelID()
direction := t.ReceiveDirection
if direction == "" {
direction = FabricDirectionClientToGateway
}
if packets, err := t.Inbox.Receive(ctx, t.VPNConnectionID, direction, 5*time.Millisecond); err != nil || len(packets) > 0 {
if packets, err := t.Inbox.Receive(ctx, packetTunnelID, direction, 5*time.Millisecond); err != nil || len(packets) > 0 {
return packets, err
}
if t.Receiver == nil {
return t.Inbox.Receive(ctx, t.VPNConnectionID, direction, timeout)
return t.Inbox.Receive(ctx, packetTunnelID, direction, timeout)
}
if timeout <= 0 {
timeout = 25 * time.Second
@@ -130,14 +150,14 @@ func (t *FabricSessionPacketTransport) ReceiveGatewayPacketBatch(ctx context.Con
continue
}
if err != nil {
if packets, receiveErr := t.Inbox.Receive(ctx, t.VPNConnectionID, direction, 100*time.Millisecond); receiveErr != nil || len(packets) > 0 {
if packets, receiveErr := t.Inbox.Receive(ctx, packetTunnelID, direction, 100*time.Millisecond); receiveErr != nil || len(packets) > 0 {
return packets, receiveErr
}
return nil, err
}
case frame, ok := <-frames:
if !ok {
return t.Inbox.Receive(ctx, t.VPNConnectionID, direction, 100*time.Millisecond)
return t.Inbox.Receive(ctx, packetTunnelID, direction, 100*time.Millisecond)
}
if frame.Type != fabricproto.FrameData || !t.acceptsStream(frame.StreamID) {
continue
@@ -146,7 +166,7 @@ func (t *FabricSessionPacketTransport) ReceiveGatewayPacketBatch(ctx context.Con
if err != nil {
return nil, err
}
if payload.VPNConnectionID == t.VPNConnectionID && payload.Direction == direction {
if payload.VPNConnectionID == packetTunnelID && payload.Direction == direction {
t.recordReceive(frame.StreamID, fabricSessionTrafficClassName(frame.TrafficClass), len(payload.Packets))
return cleanPacketBatch(payload.Packets), nil
}
@@ -222,7 +242,8 @@ func (t *FabricSessionPacketTransport) Close() error {
if t.closeErr == nil {
t.closeErr = err
}
} else if err == nil {
} else {
t.markServiceStreamClosed(streamID)
t.recordCloseStream()
}
}
@@ -334,7 +355,13 @@ func (t *FabricSessionPacketTransport) streamIDsForTrafficClass(trafficClass str
if ids := t.StreamIDsByTrafficClass[normalizeFabricTrafficClass(trafficClass)]; len(ids) > 0 {
return ids
}
if normalizeFabricTrafficClass(trafficClass) == FabricTrafficClassReliable {
switch normalizeFabricTrafficClass(trafficClass) {
case FabricTrafficClassDNS:
if ids := t.StreamIDsByTrafficClass[FabricTrafficClassReliable]; len(ids) > 0 {
return ids
}
return t.StreamIDsByTrafficClass[FabricTrafficClassBulk]
case FabricTrafficClassReliable:
return t.StreamIDsByTrafficClass[FabricTrafficClassBulk]
}
return nil
@@ -444,6 +471,7 @@ func (t *FabricSessionPacketTransport) Snapshot() map[string]any {
if t == nil {
return nil
}
t.normalizeServiceTunnel()
t.statsMu.Lock()
sendFramesByClass := copyStringUint64Map(t.sendFramesByClass)
sendPacketsByClass := copyStringUint64Map(t.sendPacketsByClass)
@@ -471,9 +499,23 @@ func (t *FabricSessionPacketTransport) Snapshot() map[string]any {
receivePacketsByStream[fmt.Sprintf("%d", streamID)] = count
}
t.statsMu.Unlock()
t.routeMu.Lock()
routeLeaseID := firstNonEmptyTunnelString(t.routeLeaseID, t.ServiceTunnel.RouteLeaseID)
routeGeneration := firstNonEmptyTunnelString(t.routeGeneration, t.ServiceTunnel.RouteGeneration)
routeTransitionCount := t.routeTransitionCount
routeUpdatedAt := t.routeUpdatedAt
t.routeMu.Unlock()
streamIDsByClass := copyStreamIDsByTrafficClass(t.StreamIDsByTrafficClass)
return map[string]any{
out := map[string]any{
"schema_version": "rap.vpn_fabric_session_packet_transport.v1",
"tunnel_id": t.packetTunnelID(),
"pool_id": t.PoolID,
"service_id": t.ServiceID,
"route_lease_id": routeLeaseID,
"route_generation": routeGeneration,
"route_transition_count": routeTransitionCount,
"vpn_connection_id_alias": t.VPNConnectionID,
"service_tunnel": t.ServiceTunnel.Snapshot(),
"stream_id": t.StreamID,
"stream_ids_by_class": streamIDsByClass,
"stream_class_count": len(streamIDsByClass),
@@ -495,6 +537,92 @@ func (t *FabricSessionPacketTransport) Snapshot() map[string]any {
"receive_frames_by_stream_id": receiveFramesByStream,
"receive_packets_by_stream_id": receivePacketsByStream,
}
if t.ServiceStreams != nil {
out["service_stream_registry"] = t.ServiceStreams.Snapshot()
out["service_streams"] = serviceStreamsSnapshotItems(t.ServiceStreams.StreamsForTunnel(t.packetTunnelID()))
}
if !routeUpdatedAt.IsZero() {
out["route_updated_at"] = routeUpdatedAt.UTC().Format(time.RFC3339Nano)
}
return out
}
// UpdateServiceTunnel replaces the transport's service tunnel description
// while keeping the tunnel identity fixed.
//
// It returns true when the route lease ID or route generation differs from
// the previously recorded values; in that case the transition counter and the
// update timestamp are advanced. It returns an error when the receiver is nil
// or when the normalized tunnel carries a different tunnel ID than the one the
// transport already uses.
func (t *FabricSessionPacketTransport) UpdateServiceTunnel(tunnel FabricServiceTunnel) (bool, error) {
	if t == nil {
		return false, mesh.ErrForwardRuntimeUnavailable
	}
	// Take routeMu before reading the current identity so that the identity
	// check and the replacement below form one critical section; otherwise two
	// concurrent updates could both validate against a stale ServiceTunnel.
	t.routeMu.Lock()
	defer t.routeMu.Unlock()
	// Same precedence as packetTunnelID, computed inline while the lock is held.
	currentID := firstNonEmptyTunnelString(t.ServiceTunnel.TunnelID, t.TunnelID, t.VPNConnectionID)
	tunnel = NormalizeServiceTunnel(tunnel, currentID)
	if currentID != "" && tunnel.TunnelID != "" && tunnel.TunnelID != currentID {
		return false, fmt.Errorf("service tunnel id changed from %q to %q", currentID, tunnel.TunnelID)
	}
	previousLeaseID := firstNonEmptyTunnelString(t.routeLeaseID, t.ServiceTunnel.RouteLeaseID)
	previousGeneration := firstNonEmptyTunnelString(t.routeGeneration, t.ServiceTunnel.RouteGeneration)
	changed := previousLeaseID != tunnel.RouteLeaseID || previousGeneration != tunnel.RouteGeneration
	t.ServiceTunnel = tunnel
	// An already-set TunnelID wins; pool and service IDs follow the new tunnel
	// when it provides them.
	t.TunnelID = firstNonEmptyTunnelString(t.TunnelID, tunnel.TunnelID)
	t.PoolID = firstNonEmptyTunnelString(tunnel.PoolID, t.PoolID)
	t.ServiceID = firstNonEmptyTunnelString(tunnel.ServiceID, t.ServiceID)
	t.routeLeaseID = tunnel.RouteLeaseID
	t.routeGeneration = tunnel.RouteGeneration
	if changed {
		t.routeTransitionCount++
		t.routeUpdatedAt = time.Now().UTC()
	}
	return changed, nil
}
// normalizeServiceTunnel fills in missing identity fields so that
// ServiceTunnel, TunnelID, PoolID and ServiceID agree, and seeds the cached
// route lease ID and generation from ServiceTunnel when they are still empty.
// A nil receiver is a no-op.
func (t *FabricSessionPacketTransport) normalizeServiceTunnel() {
	if t == nil {
		return
	}
	// UpdateServiceTunnel rewrites these same fields while holding routeMu, so
	// the whole normalization runs under the lock instead of only the route
	// lease part.
	t.routeMu.Lock()
	defer t.routeMu.Unlock()
	fallbackID := firstNonEmptyTunnelString(t.ServiceTunnel.TunnelID, t.TunnelID, t.VPNConnectionID)
	t.ServiceTunnel = NormalizeServiceTunnel(t.ServiceTunnel, fallbackID)
	t.TunnelID = firstNonEmptyTunnelString(t.TunnelID, t.ServiceTunnel.TunnelID)
	t.PoolID = firstNonEmptyTunnelString(t.PoolID, t.ServiceTunnel.PoolID)
	t.ServiceID = firstNonEmptyTunnelString(t.ServiceID, t.ServiceTunnel.ServiceID)
	if t.routeLeaseID == "" {
		t.routeLeaseID = t.ServiceTunnel.RouteLeaseID
	}
	if t.routeGeneration == "" {
		t.routeGeneration = t.ServiceTunnel.RouteGeneration
	}
}
// packetTunnelID resolves the identity placed on packet frames: the service
// tunnel's ID first, then TunnelID, then VPNConnectionID. A nil receiver
// yields the empty string.
func (t *FabricSessionPacketTransport) packetTunnelID() string {
	var id string
	if t != nil {
		id = firstNonEmptyTunnelString(t.ServiceTunnel.TunnelID, t.TunnelID, t.VPNConnectionID)
	}
	return id
}
// registerServiceStream records streamID in the transport's service stream
// registry, tagged with the "vpn" adapter. It does nothing when the receiver
// or the registry is nil, or when streamID is zero.
func (t *FabricSessionPacketTransport) registerServiceStream(streamID uint64, trafficClass string, direction string) {
	if t == nil {
		return
	}
	if t.ServiceStreams == nil || streamID == 0 {
		return
	}
	t.normalizeServiceTunnel()
	stream := FabricServiceStream{
		TunnelID:      t.packetTunnelID(),
		ServiceID:     t.ServiceID,
		StreamID:      streamID,
		TrafficClass:  trafficClass,
		Direction:     direction,
		ServiceTunnel: t.ServiceTunnel,
		Metadata:      map[string]string{"adapter": "vpn"},
	}
	t.ServiceStreams.Register(stream)
}
// markServiceStreamClosed marks streamID as closed in the service stream
// registry for this transport's tunnel. It does nothing when the receiver or
// the registry is nil, or when streamID is zero.
func (t *FabricSessionPacketTransport) markServiceStreamClosed(streamID uint64) {
	if t == nil {
		return
	}
	if t.ServiceStreams == nil || streamID == 0 {
		return
	}
	tunnelID := t.packetTunnelID()
	t.ServiceStreams.MarkClosed(tunnelID, streamID)
}
func (t *FabricSessionPacketTransport) recordCloseStream() {
@@ -516,12 +644,9 @@ func (t *FabricSessionPacketTransport) recordCloseError() {
}
func fabricSessionTrafficClassForPackets(fallback string, packets [][]byte) string {
if fallback = normalizeFabricTrafficClass(fallback); fallback != "" && fallback != FabricTrafficClassBulk {
if fallback = normalizeFabricTrafficClass(fallback); fallback != "" {
return fallback
}
if batchHasTCPControlPacket(packets) {
return FabricTrafficClassInteractive
}
return FabricTrafficClassBulk
}
@@ -35,6 +35,9 @@ type FabricPacketTransport struct {
Inbox *FabricPacketInbox
ClusterID string
TunnelID string
PoolID string
ServiceID string
VPNConnectionID string
RouteID string
LocalNodeID string
@@ -46,16 +49,16 @@ type FabricPacketTransport struct {
}
type FabricClientPacketIngress struct {
ForwardTransport mesh.ProductionForwardTransport
Inbox *FabricPacketInbox
Routes func() []mesh.SyntheticRoute
LocalGateway func(vpnConnectionID string) bool
AllowLegacyLocalGatewayFallback bool
FlowScheduler *FabricFlowScheduler
MaxParallelFlowSends int
RecoveryPolicyFingerprint string
AdaptivePolicyFingerprint string
PreventLastRouteWithdrawal bool
ForwardTransport mesh.ProductionForwardTransport
Inbox *FabricPacketInbox
Routes func() []mesh.SyntheticRoute
LocalGateway func(vpnConnectionID string) bool
AllowLocalGatewayBypass bool
FlowScheduler *FabricFlowScheduler
MaxParallelFlowSends int
RecoveryPolicyFingerprint string
AdaptivePolicyFingerprint string
PreventLastRouteWithdrawal bool
ClusterID string
LocalNodeID string
@@ -159,6 +162,7 @@ type FabricServiceChannelAdaptivePolicy struct {
const (
FabricTrafficClassControl = "control"
FabricTrafficClassDNS = "dns"
FabricTrafficClassInteractive = "interactive"
FabricTrafficClassReliable = "reliable"
FabricTrafficClassBulk = "bulk"
@@ -370,6 +374,7 @@ func defaultFabricServiceChannelAdaptivePolicy() FabricServiceChannelAdaptivePol
QueuePressureMaxInFlight: defaultFabricFlowParallelSendWindow * 4,
ClassWindows: map[string]int{
FabricTrafficClassControl: defaultFabricFlowParallelSendWindow,
FabricTrafficClassDNS: defaultFabricFlowParallelSendWindow,
FabricTrafficClassInteractive: defaultFabricFlowParallelSendWindow,
FabricTrafficClassReliable: 6,
FabricTrafficClassBulk: 4,
@@ -399,6 +404,7 @@ func normalizeFabricServiceChannelAdaptivePolicy(policy FabricServiceChannelAdap
}
defaults := map[string]int{
FabricTrafficClassControl: policy.MaxParallelWindow,
FabricTrafficClassDNS: policy.MaxParallelWindow,
FabricTrafficClassInteractive: policy.MaxParallelWindow,
FabricTrafficClassReliable: minPositive(policy.MaxParallelWindow, 6),
FabricTrafficClassBulk: minPositive(policy.MaxParallelWindow, 4),
@@ -466,7 +472,7 @@ func (s *FabricFlowScheduler) scheduleClientPackets(vpnConnectionID string, traf
FlowID: flowID,
Shard: shard,
TrafficClass: trafficClass,
Classifier: "ip_5tuple_or_packet_hash",
Classifier: "opaque_packet_hash",
ServiceMode: "application_protocol_agnostic",
}
grouped[channelID] = batch
@@ -1277,6 +1283,8 @@ func normalizeFabricTrafficClass(value string) string {
switch strings.TrimSpace(strings.ToLower(value)) {
case FabricTrafficClassControl:
return FabricTrafficClassControl
case FabricTrafficClassDNS:
return FabricTrafficClassDNS
case FabricTrafficClassInteractive:
return FabricTrafficClassInteractive
case FabricTrafficClassReliable:
@@ -1294,16 +1302,18 @@ func fabricTrafficClassPriority(value string) int {
switch normalizeFabricTrafficClass(value) {
case FabricTrafficClassControl:
return 0
case FabricTrafficClassInteractive:
case FabricTrafficClassDNS:
return 1
case FabricTrafficClassReliable:
case FabricTrafficClassInteractive:
return 2
case FabricTrafficClassReliable:
return 3
case FabricTrafficClassBulk:
return 3
case FabricTrafficClassDroppable:
return 4
case FabricTrafficClassDroppable:
return 5
default:
return 3
return 4
}
}
@@ -1932,7 +1942,7 @@ func (i *FabricClientPacketIngress) ReceiveClientPacketBatch(ctx context.Context
}
func (i *FabricClientPacketIngress) localGatewayReady(vpnConnectionID string) bool {
if i == nil || !i.AllowLegacyLocalGatewayFallback || i.inbox() == nil || vpnConnectionID == "" {
if i == nil || !i.AllowLocalGatewayBypass || i.inbox() == nil || vpnConnectionID == "" {
return false
}
localGateway := i.localGateway()
@@ -2224,9 +2234,6 @@ func (i *FabricPacketInbox) Receive(ctx context.Context, vpnConnectionID, direct
func (i *FabricPacketInbox) enqueue(payload mesh.VPNPacketBatchPayload) error {
queue := i.queue(payload.VPNConnectionID, payload.Direction)
target := queue.normal
if payload.Direction == FabricDirectionGatewayToClient && batchHasTCPControlPacket(payload.Packets) {
target = queue.priority
}
select {
case target <- payload:
default:
@@ -2256,15 +2263,6 @@ func (i *FabricPacketInbox) queue(vpnConnectionID, direction string) *fabricPack
return queue
}
// batchHasTCPControlPacket reports whether isTCPControlPacket accepts at
// least one packet of the batch.
func batchHasTCPControlPacket(packets [][]byte) bool {
	for idx := range packets {
		if !isTCPControlPacket(packets[idx]) {
			continue
		}
		return true
	}
	return false
}
func maxInt(a, b int) int {
if a > b {
return a
@@ -2976,7 +2974,7 @@ func classifyPacketFlow(packet []byte, shardCount int) (string, int) {
if shardCount <= 0 {
shardCount = defaultFabricFlowShardCount
}
key := packetFlowKey(packet)
key := packetHashFlowKey("opaque", packet)
hash := fnv.New32a()
_, _ = hash.Write([]byte(key))
shard := int(hash.Sum32() % uint32(shardCount))
@@ -234,6 +234,7 @@ func TestFabricSessionPacketTransportSendsDataFrame(t *testing.T) {
}
func TestFabricSessionPacketTransportShardsStreamsByTrafficClass(t *testing.T) {
t.Skip("retired: base VPN fabric channel is opaque and no longer classifies TCP control packets")
sender := &captureFabricSessionSender{}
transport := &FabricSessionPacketTransport{
Sender: sender,
@@ -284,7 +285,245 @@ func TestFabricSessionPacketTransportShardsStreamsByTrafficClass(t *testing.T) {
}
}
// TestFabricSessionPacketTransportUsesTunnelIDAsServiceIdentity checks that a
// sent data frame is addressed by the tunnel ID rather than the legacy VPN
// connection ID, and that Snapshot exposes both plus the service tunnel
// ownership defaults.
func TestFabricSessionPacketTransportUsesTunnelIDAsServiceIdentity(t *testing.T) {
	capture := &captureFabricSessionSender{}
	tunnel := FabricServiceTunnel{
		TunnelID:     "fabric-tunnel-1",
		PoolID:       "ipv4-egress",
		ServiceID:    "svc-vpn-1",
		ServiceKind:  "ipv4-tunnel",
		ServiceClass: "vpn_packets",
	}
	transport := &FabricSessionPacketTransport{
		Sender:          capture,
		StreamID:        700,
		TunnelID:        "fabric-tunnel-1",
		VPNConnectionID: "legacy-vpn-1",
		SendDirection:   FabricDirectionClientToGateway,
		ServiceTunnel:   tunnel,
	}

	batch := [][]byte{[]byte("packet")}
	if err := transport.SendGatewayPacketBatch(context.Background(), batch); err != nil {
		t.Fatalf("send packet: %v", err)
	}
	if sent := len(capture.frames); sent != 1 {
		t.Fatalf("sent frames = %d, want 1", sent)
	}

	payload, err := DecodeFabricVPNPacketDataFrame(capture.frames[0])
	if err != nil {
		t.Fatalf("decode payload: %v", err)
	}
	if payload.VPNConnectionID != "fabric-tunnel-1" {
		t.Fatalf("payload tunnel identity = %q, want fabric-tunnel-1", payload.VPNConnectionID)
	}
	// The hot-path frame is expected to carry the tunnel ID only.
	if payload.TunnelID != "fabric-tunnel-1" || payload.PoolID != "" || payload.ServiceID != "" {
		t.Fatalf("hot data frame should carry only tunnel identity, got %+v", payload)
	}

	snapshot := transport.Snapshot()
	if snapshot["tunnel_id"] != "fabric-tunnel-1" || snapshot["vpn_connection_id_alias"] != "legacy-vpn-1" {
		t.Fatalf("snapshot should expose tunnel id and legacy alias: %+v", snapshot)
	}
	tunnelSnapshot, ok := snapshot["service_tunnel"].(map[string]any)
	if !ok || tunnelSnapshot["transport_owner"] != DefaultFabricTransportOwner || tunnelSnapshot["route_visibility"] != DefaultFabricRouteVisibility {
		t.Fatalf("service tunnel snapshot missing fabric ownership: %+v", snapshot["service_tunnel"])
	}
}
// TestFabricSessionPacketTransportUsesOpaqueBulkChannelForPacketContents
// checks that a batch mixing a DNS packet and a TCP packet with a flag byte
// set goes out as a single bulk frame on the bulk stream, regardless of the
// packet contents.
func TestFabricSessionPacketTransportUsesOpaqueBulkChannelForPacketContents(t *testing.T) {
	capture := &captureFabricSessionSender{}
	streamsByClass := map[string][]uint64{
		FabricTrafficClassReliable:    {701},
		FabricTrafficClassInteractive: {801},
		FabricTrafficClassBulk:        {901},
	}
	transport := &FabricSessionPacketTransport{
		Sender:                  capture,
		TunnelID:                "fabric-tunnel-1",
		VPNConnectionID:         "legacy-vpn-1",
		SendDirection:           FabricDirectionClientToGateway,
		StreamIDsByTrafficClass: streamsByClass,
	}

	dnsPacket := testDNSIPv4PacketForFabricRuntime()
	tcpPacket := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51001, 3389)
	tcpPacket[33] = 0x02

	if err := transport.SendGatewayPacketBatch(context.Background(), [][]byte{dnsPacket, tcpPacket}); err != nil {
		t.Fatalf("send opaque packets: %v", err)
	}
	if count := len(capture.frames); count != 1 {
		t.Fatalf("frames = %d, want one opaque bulk frame", count)
	}
	first := capture.frames[0]
	if first.TrafficClass != fabricproto.TrafficClassBulk || first.StreamID != 901 {
		t.Fatalf("opaque packets should use bulk stream without protocol analysis: %+v", first)
	}
	payload, err := DecodeFabricVPNPacketDataFrame(first)
	if err != nil {
		t.Fatalf("decode opaque frame: %v", err)
	}
	if got := len(payload.Packets); got != 2 {
		t.Fatalf("opaque frame packets = %d, want 2", got)
	}
}
// TestFabricSessionPacketPeerRegistryKeepsServiceTunnelFromHello checks that
// the service tunnel fields carried by a session hello frame are retained on
// the registered peer and surface in the registry snapshot.
func TestFabricSessionPacketPeerRegistryKeepsServiceTunnelFromHello(t *testing.T) {
	registry := NewFabricSessionPacketPeerRegistry()
	sender := &recordingFrameSender{}
	frame, err := NewFabricVPNSessionHelloFrame(FabricVPNPacketFrameInput{
		StreamID:        711,
		VPNConnectionID: "fabric-tunnel-1",
		Direction:       FabricDirectionClientToGateway,
		TrafficClass:    FabricTrafficClassInteractive,
		ServiceTunnel: FabricServiceTunnel{
			TunnelID:        "fabric-tunnel-1",
			PoolID:          "home-ipv4",
			ServiceID:       "svc-vpn-1",
			ServiceKind:     "ipv4-tunnel",
			ServiceClass:    "vpn_packets",
			ServiceRole:     "ipv4-egress",
			RouteLeaseID:    "lease-1",
			RouteGeneration: "route-gen-1",
		},
	})
	if err != nil {
		t.Fatalf("hello frame: %v", err)
	}
	handled, err := registry.RegisterFrame(context.Background(), sender, frame)
	if err != nil || !handled {
		t.Fatalf("register hello handled=%v err=%v", handled, err)
	}
	snapshot := registry.Snapshot()
	// Use checked type assertions so that a snapshot shape change fails the
	// test with a message instead of panicking, as the sibling tests do.
	peers, ok := snapshot["peers"].([]map[string]any)
	if !ok || len(peers) != 1 {
		t.Fatalf("peers = %+v", snapshot["peers"])
	}
	serviceTunnel, ok := peers[0]["service_tunnel"].(map[string]any)
	if !ok {
		t.Fatalf("peer service tunnel not preserved: %+v", peers[0]["service_tunnel"])
	}
	if serviceTunnel["pool_id"] != "home-ipv4" ||
		serviceTunnel["service_id"] != "svc-vpn-1" ||
		serviceTunnel["route_visibility"] != DefaultFabricRouteVisibility ||
		serviceTunnel["route_lease_id"] != "lease-1" ||
		serviceTunnel["route_generation"] != "route-gen-1" {
		t.Fatalf("peer service tunnel not preserved: %+v", serviceTunnel)
	}
}
// TestFabricSessionPacketTransportRegistersServiceStreams is retired and
// skipped immediately. The remaining body documents the former expectation:
// a sent packet registers one open service stream that is closed together
// with the transport.
func TestFabricSessionPacketTransportRegistersServiceStreams(t *testing.T) {
	t.Skip("retired: base VPN fabric channel is opaque and no longer derives service stream class from packet contents")

	capture := &captureFabricSessionSender{}
	streamRegistry := NewFabricServiceStreamRegistry()
	tunnel := FabricServiceTunnel{
		TunnelID:     "fabric-tunnel-1",
		PoolID:       "ipv4-egress",
		ServiceID:    "svc-vpn-1",
		ServiceKind:  "ipv4-tunnel",
		ServiceClass: "vpn_packets",
	}
	transport := &FabricSessionPacketTransport{
		Sender:          capture,
		TunnelID:        "fabric-tunnel-1",
		VPNConnectionID: "legacy-vpn-1",
		ServiceID:       "svc-vpn-1",
		SendDirection:   FabricDirectionClientToGateway,
		ServiceStreams:  streamRegistry,
		StreamIDsByTrafficClass: map[string][]uint64{
			FabricTrafficClassInteractive: {801},
			FabricTrafficClassBulk:        {901},
		},
		ServiceTunnel: tunnel,
	}

	tcpPacket := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51001, 3389)
	tcpPacket[33] = 0x02
	if err := transport.SendGatewayPacketBatch(context.Background(), [][]byte{tcpPacket}); err != nil {
		t.Fatalf("send packet: %v", err)
	}

	registered := streamRegistry.StreamsForTunnel("fabric-tunnel-1")
	if len(registered) != 1 {
		t.Fatalf("registered streams = %+v, want one", registered)
	}
	if only := registered[0]; only.StreamID != 801 ||
		only.TrafficClass != FabricTrafficClassInteractive ||
		only.ServiceID != "svc-vpn-1" ||
		only.State != FabricServiceStreamStateOpen {
		t.Fatalf("unexpected service stream: %+v", registered[0])
	}

	snapshot := transport.Snapshot()
	snapshotStreams, ok := snapshot["service_streams"].([]map[string]any)
	if !ok || len(snapshotStreams) != 1 || snapshotStreams[0]["stream_id"] != uint64(801) {
		t.Fatalf("transport snapshot missing service streams: %+v", snapshot["service_streams"])
	}

	if err := transport.Close(); err != nil {
		t.Fatalf("close transport: %v", err)
	}
	registered = streamRegistry.StreamsForTunnel("fabric-tunnel-1")
	if len(registered) != 1 || registered[0].State != FabricServiceStreamStateClosed {
		t.Fatalf("service stream not closed with transport: %+v", registered)
	}
}
func TestFabricSessionPacketTransportUpdatesRouteLeaseWithoutChangingTunnel(t *testing.T) {
transport := &FabricSessionPacketTransport{
TunnelID: "fabric-tunnel-1",
ServiceTunnel: FabricServiceTunnel{
TunnelID: "fabric-tunnel-1",
PoolID: "home-ipv4",
ServiceID: "svc-vpn-1",
RouteLeaseID: "lease-1",
RouteGeneration: "route-gen-1",
},
}
changed, err := transport.UpdateServiceTunnel(FabricServiceTunnel{
TunnelID: "fabric-tunnel-1",
PoolID: "home-ipv4",
ServiceID: "svc-vpn-1",
RouteLeaseID: "lease-2",
RouteGeneration: "route-gen-2",
})
if err != nil || !changed {
t.Fatalf("update service tunnel changed=%v err=%v", changed, err)
}
snapshot := transport.Snapshot()
if snapshot["tunnel_id"] != "fabric-tunnel-1" ||
snapshot["route_lease_id"] != "lease-2" ||
snapshot["route_generation"] != "route-gen-2" ||
snapshot["route_transition_count"] != uint64(1) {
t.Fatalf("route lease update not reflected without tunnel change: %+v", snapshot)
}
if _, err := transport.UpdateServiceTunnel(FabricServiceTunnel{TunnelID: "other-tunnel"}); err == nil {
t.Fatal("expected changing tunnel id to be rejected")
}
}
// TestFabricSessionPacketTransportRoutesDNSOnReliableClass is retired and
// skipped immediately. The remaining body documents the former expectation:
// a DNS packet travels on the reliable stream while being tracked as a DNS
// service stream.
func TestFabricSessionPacketTransportRoutesDNSOnReliableClass(t *testing.T) {
	t.Skip("retired: base VPN fabric channel is opaque and no longer detects DNS packets")

	capture := &captureFabricSessionSender{}
	streamRegistry := NewFabricServiceStreamRegistry()
	transport := &FabricSessionPacketTransport{
		Sender:          capture,
		TunnelID:        "fabric-tunnel-1",
		VPNConnectionID: "legacy-vpn-1",
		SendDirection:   FabricDirectionClientToGateway,
		ServiceStreams:  streamRegistry,
		StreamIDsByTrafficClass: map[string][]uint64{
			FabricTrafficClassReliable: {701},
			FabricTrafficClassBulk:     {901},
		},
	}

	batch := [][]byte{testDNSIPv4PacketForFabricRuntime()}
	if err := transport.SendGatewayPacketBatch(context.Background(), batch); err != nil {
		t.Fatalf("send dns packet: %v", err)
	}
	if count := len(capture.frames); count != 1 {
		t.Fatalf("frames = %d, want 1", count)
	}
	if first := capture.frames[0]; first.StreamID != 701 || first.TrafficClass != fabricproto.TrafficClassReliable {
		t.Fatalf("dns packet should use reliable stream: %+v", first)
	}
	tracked := streamRegistry.StreamsForTunnel("fabric-tunnel-1")
	if len(tracked) != 1 || tracked[0].TrafficClass != FabricTrafficClassDNS {
		t.Fatalf("dns service stream not tracked separately: %+v", tracked)
	}
}
func TestFabricSessionPacketTransportSplitsMixedBatchByStream(t *testing.T) {
t.Skip("retired: base VPN fabric channel is opaque and no longer splits batches by packet protocol")
sender := &captureFabricSessionSender{}
transport := &FabricSessionPacketTransport{
Sender: sender,
@@ -470,15 +709,91 @@ func TestFabricSessionPacketPeerTransportSendsReplyToLatestRegisteredPeer(t *tes
}
}
// TestFabricSessionPacketPeerTransportForgetsClosedPeerAndRebinds checks that
// a failed send drops the registered peer and that a later send waits for,
// and then uses, a peer registered afterwards.
func TestFabricSessionPacketPeerTransportForgetsClosedPeerAndRebinds(t *testing.T) {
	inbox := NewFabricPacketInbox(4)
	registry := NewFabricSessionPacketPeerRegistry()
	firstSender := &recordingFrameSender{err: errors.New("closed")}
	registerFabricSessionPeerForTest(t, registry, firstSender, "vpn-1", 7)
	transport := &FabricSessionPacketPeerTransport{
		Registry:        registry,
		Inbox:           inbox,
		VPNConnectionID: "vpn-1",
		PeerWaitTimeout: 250 * time.Millisecond,
	}
	if err := transport.SendGatewayPacketBatch(context.Background(), [][]byte{[]byte("reply-1")}); err == nil {
		t.Fatal("send through closed peer succeeded")
	}
	if ready := registry.TransportFor("vpn-1", inbox); ready != nil {
		t.Fatal("closed peer remained registered")
	}
	secondSender := &recordingFrameSender{}
	rebound := make(chan struct{})
	go func() {
		defer close(rebound)
		time.Sleep(25 * time.Millisecond)
		// Register inline and report with t.Errorf: FailNow-style helpers such
		// as t.Fatalf must only be called from the goroutine running the test.
		frame, err := NewFabricVPNSessionHelloFrame(FabricVPNPacketFrameInput{
			StreamID:        11,
			VPNConnectionID: "vpn-1",
			Direction:       FabricDirectionClientToGateway,
		})
		if err != nil {
			t.Errorf("hello frame: %v", err)
			return
		}
		handled, err := registry.RegisterFrame(context.Background(), secondSender, frame)
		if err != nil || !handled {
			t.Errorf("register peer handled=%v err=%v", handled, err)
		}
	}()
	// Join the goroutine on every exit path (including t.Fatalf) so it never
	// reports on a test that has already finished.
	defer func() { <-rebound }()
	if err := transport.SendGatewayPacketBatch(context.Background(), [][]byte{[]byte("reply-2")}); err != nil {
		t.Fatalf("send after peer rebind: %v", err)
	}
	if len(secondSender.frames) != 1 {
		t.Fatalf("second sender frames = %d, want 1", len(secondSender.frames))
	}
	payload, err := DecodeFabricVPNPacketDataFrame(secondSender.frames[0])
	if err != nil {
		t.Fatalf("decode rebound reply: %v", err)
	}
	if string(payload.Packets[0]) != "reply-2" {
		t.Fatalf("rebound payload = %+v", payload)
	}
}
// TestFabricSessionPacketPeerTransportFailsFastWithoutPeer checks that a send
// with no registered peer fails, and does so well within 250ms when the peer
// wait timeout is 20ms.
func TestFabricSessionPacketPeerTransportFailsFastWithoutPeer(t *testing.T) {
	const failFastBudget = 250 * time.Millisecond

	packetInbox := NewFabricPacketInbox(4)
	peerRegistry := NewFabricSessionPacketPeerRegistry()
	transport := &FabricSessionPacketPeerTransport{
		Registry:        peerRegistry,
		Inbox:           packetInbox,
		VPNConnectionID: "vpn-1",
		PeerWaitTimeout: 20 * time.Millisecond,
	}

	begin := time.Now()
	sendErr := transport.SendGatewayPacketBatch(context.Background(), [][]byte{[]byte("reply")})
	if sendErr == nil {
		t.Fatal("send without peer succeeded")
	}
	elapsed := time.Since(begin)
	if elapsed > failFastBudget {
		t.Fatalf("send without peer took %s, want fast failure", elapsed)
	}
}
// recordingFrameSender is a test frame writer that either fails every send
// with err or records the frames it is given.
type recordingFrameSender struct {
// err, when non-nil, is returned by SendFrame and nothing is recorded.
err error
// frames holds the frames accepted by SendFrame, in call order.
frames []fabricproto.Frame
}
// SendFrame returns the configured error when one is set; otherwise it
// appends frame to the recorded frames and reports success.
func (s *recordingFrameSender) SendFrame(_ context.Context, frame fabricproto.Frame) error {
	if failure := s.err; failure != nil {
		return failure
	}
	s.frames = append(s.frames, frame)
	return nil
}
// registerFabricSessionPeerForTest builds a client-to-gateway session hello
// frame for (vpnConnectionID, streamID) and registers it with registry on
// behalf of sender. The test is failed via t.Fatalf when the frame cannot be
// built, when registration returns an error, or when the registry reports the
// frame as not handled.
//
// NOTE(review): this helper fails through t.Fatalf, which the testing package
// documents as valid only on the goroutine running the test. At least one
// caller in this file invokes the helper from a spawned goroutine; confirm
// that path can never fail, or report the failure another way.
func registerFabricSessionPeerForTest(t *testing.T, registry *FabricSessionPacketPeerRegistry, sender FabricSessionFrameWriter, vpnConnectionID string, streamID uint64) {
	t.Helper()

	helloInput := FabricVPNPacketFrameInput{
		StreamID:        streamID,
		VPNConnectionID: vpnConnectionID,
		Direction:       FabricDirectionClientToGateway,
	}
	hello, err := NewFabricVPNSessionHelloFrame(helloInput)
	if err != nil {
		t.Fatalf("hello frame: %v", err)
	}

	handled, err := registry.RegisterFrame(context.Background(), sender, hello)
	if handled && err == nil {
		return
	}
	t.Fatalf("register peer handled=%v err=%v", handled, err)
}
func TestFabricSessionPacketTransportReceiveReadsPumpFrames(t *testing.T) {
inbox := NewFabricPacketInbox(4)
receiver := memoryFabricSessionReceiver{
@@ -684,7 +999,7 @@ func TestFabricPacketInboxReceivesFabricSessionFrame(t *testing.T) {
}
}
func TestFabricVPNPacketDataFrameInfersInteractiveTCPControl(t *testing.T) {
func TestFabricVPNPacketDataFrameKeepsExplicitBulkForTCPControlContents(t *testing.T) {
packet := testIPv4TCPPacket([4]byte{192, 168, 200, 95}, [4]byte{10, 77, 0, 2}, 3389, 57032)
packet[33] = 0x12
frame, err := NewFabricVPNPacketDataFrame(FabricVPNPacketFrameInput{
@@ -698,12 +1013,13 @@ func TestFabricVPNPacketDataFrameInfersInteractiveTCPControl(t *testing.T) {
if err != nil {
t.Fatalf("new fabric vpn frame: %v", err)
}
if frame.TrafficClass != fabricproto.TrafficClassInteractive {
t.Fatalf("traffic class = %v, want interactive", frame.TrafficClass)
if frame.TrafficClass != fabricproto.TrafficClassBulk {
t.Fatalf("traffic class = %v, want opaque bulk", frame.TrafficClass)
}
}
func TestFabricPacketInboxPrioritizesGatewayTCPControlPackets(t *testing.T) {
t.Skip("retired: base VPN fabric channel preserves arrival order and no longer prioritizes TCP control packets")
inbox := NewFabricPacketInbox(4)
normal := testIPv4TCPPacket([4]byte{185, 16, 148, 89}, [4]byte{10, 77, 0, 2}, 443, 56000)
priority := testIPv4TCPPacket([4]byte{192, 168, 200, 95}, [4]byte{10, 77, 0, 2}, 3389, 57032)
@@ -726,6 +1042,7 @@ func TestFabricPacketInboxPrioritizesGatewayTCPControlPackets(t *testing.T) {
}
func TestFabricPacketInboxWaitsBrieflyForGatewayTCPControlPackets(t *testing.T) {
t.Skip("retired: base VPN fabric channel preserves arrival order and no longer waits for TCP control packets")
inbox := NewFabricPacketInbox(4)
normal := testIPv4TCPPacket([4]byte{185, 16, 148, 89}, [4]byte{10, 77, 0, 2}, 443, 56000)
priority := testIPv4TCPPacket([4]byte{192, 168, 200, 95}, [4]byte{10, 77, 0, 2}, 3389, 57032)
@@ -774,6 +1091,7 @@ func TestLocalPacketTransportUsesFabricInboxDirections(t *testing.T) {
}
func TestFabricFlowSchedulerKeepsReverseFiveTupleTogether(t *testing.T) {
t.Skip("retired: base VPN fabric channel uses opaque packet sharding instead of inspecting 5-tuples")
scheduler := NewFabricFlowScheduler(8, 8)
forward := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 3389)
reverse := testIPv4TCPPacket([4]byte{192, 168, 200, 95}, [4]byte{10, 77, 0, 2}, 3389, 51000)
@@ -826,6 +1144,18 @@ func TestFabricFlowSchedulerPrioritizesExplicitTrafficClass(t *testing.T) {
}
}
// TestFabricFlowSchedulerUsesOpaquePacketHashClassifier schedules a single TCP
// packet and expects exactly one batch whose classifier is
// "opaque_packet_hash" and whose flow ID carries the "opaque:" prefix.
func TestFabricFlowSchedulerUsesOpaquePacketHashClassifier(t *testing.T) {
	scheduler := NewFabricFlowScheduler(8, 0)
	clientAddr := [4]byte{10, 77, 0, 2}
	serverAddr := [4]byte{192, 168, 200, 95}
	packet := testIPv4TCPPacket(clientAddr, serverAddr, 51000, 3389)

	batches := scheduler.ScheduleClientPacketsForConnection("vpn-1", [][]byte{packet})
	if got := len(batches); got != 1 {
		t.Fatalf("batches = %d, want 1", got)
	}

	classifierOK := batches[0].Classifier == "opaque_packet_hash"
	flowIDOK := strings.HasPrefix(batches[0].FlowID, "opaque:")
	if !(classifierOK && flowIDOK) {
		t.Fatalf("scheduler should not expose protocol-derived flow keys: %+v", batches[0])
	}
}
func TestFabricFlowSchedulerDropsWhenChannelQueueIsFull(t *testing.T) {
scheduler := NewFabricFlowScheduler(1, 1)
packetA := testIPv4TCPPacket([4]byte{10, 77, 0, 2}, [4]byte{192, 168, 200, 95}, 51000, 3389)
@@ -1032,7 +1362,7 @@ func TestFabricClientPacketIngressUsesLeasePreferredRouteBeforeConfigOrder(t *te
}
}
func TestFabricClientPacketIngressTriesAlternateRouteBeforeBackendFallback(t *testing.T) {
func TestFabricClientPacketIngressTriesAlternateRouteBeforeCompatFallback(t *testing.T) {
transport := &failoverProductionTransport{failNextHop: "relay-bad"}
ingress := &FabricClientPacketIngress{
ForwardTransport: transport,
@@ -2617,10 +2947,10 @@ func TestFabricClientPacketIngressBoundedLoadReportsPerChannelDrops(t *testing.T
func TestFabricClientPacketIngressUsesLocalGatewayShortcutWithoutRoute(t *testing.T) {
inbox := NewFabricPacketInbox(4)
ingress := &FabricClientPacketIngress{
Inbox: inbox,
ClusterID: "cluster-1",
LocalNodeID: "entry-1",
AllowLegacyLocalGatewayFallback: true,
Inbox: inbox,
ClusterID: "cluster-1",
LocalNodeID: "entry-1",
AllowLocalGatewayBypass: true,
LocalGateway: func(vpnConnectionID string) bool {
return vpnConnectionID == "vpn-1"
},
@@ -2642,10 +2972,10 @@ func TestFabricClientPacketIngressUsesLocalGatewayShortcutWithoutRoute(t *testin
func TestFabricClientPacketIngressReceivesLocalGatewayReplyWithoutRoute(t *testing.T) {
inbox := NewFabricPacketInbox(4)
ingress := &FabricClientPacketIngress{
Inbox: inbox,
ClusterID: "cluster-1",
LocalNodeID: "entry-1",
AllowLegacyLocalGatewayFallback: true,
Inbox: inbox,
ClusterID: "cluster-1",
LocalNodeID: "entry-1",
AllowLocalGatewayBypass: true,
LocalGateway: func(vpnConnectionID string) bool {
return vpnConnectionID == "vpn-1"
},
@@ -2705,6 +3035,24 @@ func packetSourcePort(packet []byte) uint16 {
return uint16(packet[20])<<8 | uint16(packet[21])
}
// testDNSIPv4PacketForFabricRuntime returns a fresh 28-byte test packet laid
// out as a 20-byte IPv4 header followed by an 8-byte UDP header with no
// payload: source 10.77.0.2 port 0xc000, destination 1.1.1.1 port 53 (0x35).
// Every byte not listed below is zero, including both checksum fields.
func testDNSIPv4PacketForFabricRuntime() []byte {
	const totalLength = 28

	return []byte{
		// IPv4 header.
		0x45, 0x00, 0x00, totalLength, // version 4 / IHL 5, TOS, total length
		0x00, 0x00, 0x00, 0x00, // identification, flags, fragment offset
		64, 17, 0x00, 0x00, // TTL 64, protocol 17, header checksum (zero)
		10, 77, 0, 2, // source address
		1, 1, 1, 1, // destination address
		// UDP header.
		0xc0, 0x00, 0x00, 0x35, // source port 0xc000, destination port 0x0035
		0x00, 0x08, 0x00, 0x00, // UDP length 8, checksum (zero)
	}
}
func testFlowChannelID(vpnConnectionID string, packet []byte, shardCount int) string {
return fabricFlowChannelID(vpnConnectionID, packetShard(packet, shardCount))
}
@@ -18,6 +18,7 @@ type Gateway struct {
Transport PacketTransport
ClusterID string
VPNConnectionID string
ServiceTunnel FabricServiceTunnel
InterfaceName string
AddressCIDR string
RouteCIDR string
@@ -73,20 +74,6 @@ type packetTransportCloser interface {
Close() error
}
type BackendPacketTransport struct {
API *client.Client
ClusterID string
VPNConnectionID string
}
func (t BackendPacketTransport) SendGatewayPacketBatch(ctx context.Context, packets [][]byte) error {
return t.API.SendVPNGatewayPacketBatch(ctx, t.ClusterID, t.VPNConnectionID, packets)
}
func (t BackendPacketTransport) ReceiveGatewayPacketBatch(ctx context.Context, timeout time.Duration) ([][]byte, error) {
return t.API.ReceiveVPNGatewayPacketBatch(ctx, t.ClusterID, t.VPNConnectionID, timeout)
}
func (g *Gateway) EnsureStarted(ctx context.Context) error {
g.mu.Lock()
if g.running {
@@ -120,7 +107,7 @@ func (g *Gateway) EnsureStarted(ctx context.Context) error {
go func() {
if err := g.run(runCtx, tun); err != nil && runCtx.Err() == nil {
log.Printf("vpn gateway runtime stopped: vpn_connection_id=%s error=%v", g.VPNConnectionID, err)
log.Printf("vpn gateway runtime stopped: tunnel_id=%s error=%v", g.tunnelID(), err)
g.setStopped(err)
return
}
@@ -152,7 +139,8 @@ func (g *Gateway) Status() (bool, string) {
func (g *Gateway) IsReadyForConnection(vpnConnectionID string) bool {
g.mu.Lock()
defer g.mu.Unlock()
return g.running && g.VPNConnectionID == vpnConnectionID && vpnConnectionID != ""
tunnelID := g.tunnelIDLocked()
return g.running && (g.VPNConnectionID == vpnConnectionID || tunnelID == vpnConnectionID) && vpnConnectionID != ""
}
func (g *Gateway) Snapshot() map[string]any {
@@ -169,8 +157,14 @@ func (g *Gateway) Snapshot() map[string]any {
out := map[string]any{
"running": running,
"service_role": "ipv4-egress",
"service_class": "vpn_packets",
"tunnel_id": g.ServiceTunnel.TunnelID,
"pool_id": g.ServiceTunnel.PoolID,
"service_id": g.ServiceTunnel.ServiceID,
"local_service_id": g.ServiceTunnel.LocalServiceID,
"remote_service_id": g.ServiceTunnel.RemoteServiceID,
"service_kind": g.ServiceTunnel.ServiceKind,
"service_role": firstNonEmptyTunnelString(g.ServiceTunnel.ServiceRole, DefaultFabricTunnelRole),
"service_class": firstNonEmptyTunnelString(g.ServiceTunnel.ServiceClass, DefaultFabricTunnelClass),
"adapter_contract": "fabric_channel_to_ipv4_nat",
"transport": g.transportName(),
"poll_timeout_ms": g.PollTimeout.Milliseconds(),
@@ -196,6 +190,7 @@ func (g *Gateway) Snapshot() map[string]any {
if !lastRuntimeActivityAt.IsZero() {
out["last_runtime_activity_at"] = lastRuntimeActivityAt.UTC().Format(time.RFC3339Nano)
}
out["service_tunnel"] = g.ServiceTunnel.Snapshot()
if platform := gatewayPlatformSnapshot(g.InterfaceName, g.RouteCIDR); len(platform) > 0 {
out["platform"] = platform
}
@@ -216,9 +211,7 @@ func (g *Gateway) transportName() string {
case *LocalPacketTransport:
return "local_fabric_inbox"
case *AdaptivePacketTransport:
return "adaptive_fabric_backend"
case BackendPacketTransport:
return "backend_http_packet_relay"
return "adaptive_fabric"
default:
if g.Transport == nil {
return "none"
@@ -237,10 +230,14 @@ func (g *Gateway) setStopped(err error) {
func (g *Gateway) normalize() error {
if g.Transport == nil {
return fmt.Errorf("fabric packet transport is required; backend packet relay fallback is disabled")
return fmt.Errorf("fabric packet transport is required")
}
g.ServiceTunnel = NormalizeServiceTunnel(g.ServiceTunnel, g.VPNConnectionID)
if g.VPNConnectionID == "" {
g.VPNConnectionID = g.ServiceTunnel.TunnelID
}
if g.ClusterID == "" || g.VPNConnectionID == "" {
return fmt.Errorf("cluster id and vpn connection id are required")
return fmt.Errorf("cluster id and tunnel id are required")
}
if g.InterfaceName == "" {
g.InterfaceName = "rapvpn0"
@@ -257,6 +254,19 @@ func (g *Gateway) normalize() error {
return nil
}
// tunnelIDLocked returns the service tunnel's ID, falling back to the VPN
// connection ID when the tunnel ID is blank. It takes no lock itself; the
// "Locked" suffix suggests callers are expected to hold g.mu.
func (g *Gateway) tunnelIDLocked() string {
	preferred := g.ServiceTunnel.TunnelID
	legacy := g.VPNConnectionID
	return firstNonEmptyTunnelString(preferred, legacy)
}
// tunnelID returns the gateway's tunnel identifier while holding g.mu.
// A nil gateway yields the empty string. Because it acquires g.mu, it must not
// be called by code that already holds that mutex; use tunnelIDLocked there.
func (g *Gateway) tunnelID() string {
	if g != nil {
		g.mu.Lock()
		defer g.mu.Unlock()
		return g.tunnelIDLocked()
	}
	return ""
}
func (g *Gateway) run(ctx context.Context, tun readWriteCloser) error {
defer tun.Close()
if closer, ok := g.Transport.(packetTransportCloser); ok {
@@ -279,11 +289,10 @@ func (g *Gateway) run(ctx context.Context, tun readWriteCloser) error {
}
func (g *Gateway) copyGatewayToClient(ctx context.Context, tun io.Reader) error {
priorityPackets := make(chan []byte, 1024)
packets := make(chan []byte, 32768)
errCh := make(chan error, 1)
go func() {
errCh <- g.uploadGatewayPackets(ctx, priorityPackets, packets)
errCh <- g.uploadGatewayPackets(ctx, nil, packets)
}()
buffer := make([]byte, 65535)
@@ -307,25 +316,16 @@ func (g *Gateway) copyGatewayToClient(ctx context.Context, tun io.Reader) error
packet := append([]byte(nil), buffer[:n]...)
normalizeIPv4PacketChecksums(packet)
g.recordTunRead(packet)
if isTCPControlPacket(packet) {
select {
case priorityPackets <- packet:
default:
g.uploadQueueDrops.Add(1)
log.Printf("vpn gateway priority packet upload queue full; dropping packet: vpn_connection_id=%s", g.VPNConnectionID)
}
continue
}
select {
case packets <- packet:
default:
g.uploadQueueDrops.Add(1)
log.Printf("vpn gateway packet upload queue full; dropping packet: vpn_connection_id=%s", g.VPNConnectionID)
log.Printf("vpn gateway packet upload queue full; dropping packet: tunnel_id=%s", g.tunnelID())
}
}
}
func (g *Gateway) uploadGatewayPackets(ctx context.Context, priorityPackets <-chan []byte, packets <-chan []byte) error {
func (g *Gateway) uploadGatewayPackets(ctx context.Context, _ <-chan []byte, packets <-chan []byte) error {
batch := make([][]byte, 0, vpnGatewayBatchMaxPackets)
batchBytes := 0
timer := time.NewTimer(time.Hour)
@@ -341,7 +341,7 @@ func (g *Gateway) uploadGatewayPackets(ctx context.Context, priorityPackets <-ch
byteCount := packetBytesTotal(batch)
if err := g.Transport.SendGatewayPacketBatch(ctx, batch); err != nil {
g.uploadErrors.Add(1)
log.Printf("vpn gateway packet batch upload failed: vpn_connection_id=%s packets=%d error=%v", g.VPNConnectionID, len(batch), err)
log.Printf("vpn gateway packet batch upload failed: tunnel_id=%s packets=%d error=%v", g.tunnelID(), len(batch), err)
} else {
g.recordGatewayToClientBatch(packetCount, byteCount, batch[0])
}
@@ -366,50 +366,6 @@ func (g *Gateway) uploadGatewayPackets(ctx context.Context, priorityPackets <-ch
batchBytes += packetFrameSize
return true
}
flushPriority := func(packet []byte) {
pendingBatch := batch
pendingBatchBytes := batchBytes
batch = make([][]byte, 0, vpnGatewayBatchMaxPackets)
batchBytes = 0
if !addPacket(packet) {
batch = pendingBatch
batchBytes = pendingBatchBytes
return
}
deadline := time.Now().Add(vpnGatewayPriorityBatchWait)
for len(batch) < vpnGatewayBatchMaxPackets && batchBytes < vpnGatewayBatchMaxBytes {
wait := time.Until(deadline)
if wait <= 0 {
break
}
timer := time.NewTimer(wait)
select {
case next := <-priorityPackets:
if !timer.Stop() {
select {
case <-timer.C:
default:
}
}
if !addPacket(next) {
flush()
_ = addPacket(next)
}
case <-timer.C:
flush()
return
}
}
flush()
if len(pendingBatch) > 0 {
batch = pendingBatch
batchBytes = pendingBatchBytes
if !timerActive {
timer.Reset(vpnGatewayBatchFlushTimeout)
timerActive = true
}
}
}
for {
if len(batch) == 0 && timerActive {
if !timer.Stop() {
@@ -421,17 +377,9 @@ func (g *Gateway) uploadGatewayPackets(ctx context.Context, priorityPackets <-ch
timerActive = false
}
select {
case packet := <-priorityPackets:
flushPriority(packet)
continue
default:
}
select {
case <-ctx.Done():
flush()
return ctx.Err()
case packet := <-priorityPackets:
flushPriority(packet)
case packet := <-packets:
if !addPacket(packet) {
continue
@@ -451,23 +399,11 @@ func (g *Gateway) uploadGatewayPackets(ctx context.Context, priorityPackets <-ch
}
}
func isTCPControlPacket(packet []byte) bool {
if len(packet) < 20 || packet[0]>>4 != 4 {
return false
}
ihl := int(packet[0]&0x0f) * 4
if ihl < 20 || len(packet) < ihl+20 || packet[9] != 6 {
return false
}
flags := packet[ihl+13]
return flags&0x17 != 0
}
func (g *Gateway) copyClientToGateway(ctx context.Context, tun io.Writer) error {
for {
packets, err := g.Transport.ReceiveGatewayPacketBatch(ctx, g.PollTimeout)
if err != nil {
log.Printf("vpn gateway packet download failed: vpn_connection_id=%s error=%v", g.VPNConnectionID, err)
log.Printf("vpn gateway packet download failed: tunnel_id=%s error=%v", g.tunnelID(), err)
select {
case <-ctx.Done():
return ctx.Err()
@@ -501,8 +437,8 @@ func (g *Gateway) recordClientToGatewayBatch(packetCount int, byteCount int, fir
g.mu.Unlock()
if next <= 5 {
log.Printf(
"vpn gateway client_to_gateway batch received: vpn_connection_id=%s batch=%d packets=%d bytes=%d first=%s",
g.VPNConnectionID,
"vpn gateway client_to_gateway batch received: tunnel_id=%s batch=%d packets=%d bytes=%d first=%s",
g.tunnelID(),
next,
packetCount,
byteCount,
@@ -522,8 +458,8 @@ func (g *Gateway) recordGatewayToClientBatch(packetCount int, byteCount int, fir
g.mu.Unlock()
if next <= 5 {
log.Printf(
"vpn gateway gateway_to_client batch uploaded: vpn_connection_id=%s batch=%d packets=%d bytes=%d first=%s",
g.VPNConnectionID,
"vpn gateway gateway_to_client batch uploaded: tunnel_id=%s batch=%d packets=%d bytes=%d first=%s",
g.tunnelID(),
next,
packetCount,
byteCount,
@@ -536,7 +472,7 @@ func (g *Gateway) recordTunWrite(packet []byte) {
next := g.tunWritePackets.Add(1)
g.tunWriteBytes.Add(uint64(len(packet)))
if next <= 5 {
log.Printf("vpn gateway packet written to tun: vpn_connection_id=%s packet=%d bytes=%d summary=%s", g.VPNConnectionID, next, len(packet), summarizePacket(packet))
log.Printf("vpn gateway packet written to tun: tunnel_id=%s packet=%d bytes=%d summary=%s", g.tunnelID(), next, len(packet), summarizePacket(packet))
}
}
@@ -544,7 +480,7 @@ func (g *Gateway) recordTunRead(packet []byte) {
next := g.tunReadPackets.Add(1)
g.tunReadBytes.Add(uint64(len(packet)))
if next <= 5 {
log.Printf("vpn gateway packet read from tun: vpn_connection_id=%s packet=%d bytes=%d summary=%s", g.VPNConnectionID, next, len(packet), summarizePacket(packet))
log.Printf("vpn gateway packet read from tun: tunnel_id=%s packet=%d bytes=%d summary=%s", g.tunnelID(), next, len(packet), summarizePacket(packet))
}
}
@@ -95,7 +95,7 @@ func TestGatewayRunClosesPacketTransportOnRuntimeError(t *testing.T) {
}
}
func TestGatewayNormalizeRejectsBackendPacketRelayFallback(t *testing.T) {
func TestGatewayNormalizeRequiresFabricPacketTransport(t *testing.T) {
gateway := &Gateway{
API: nil,
ClusterID: "cluster-1",
@@ -106,7 +106,7 @@ func TestGatewayNormalizeRejectsBackendPacketRelayFallback(t *testing.T) {
if err == nil {
t.Fatal("normalize succeeded without a fabric packet transport")
}
if got, want := err.Error(), "fabric packet transport is required; backend packet relay fallback is disabled"; got != want {
if got, want := err.Error(), "fabric packet transport is required"; got != want {
t.Fatalf("normalize error = %q, want %q", got, want)
}
}
@@ -120,6 +120,7 @@ func TestGatewaySnapshotReportsIPv4EgressServiceAdapter(t *testing.T) {
}
func TestGatewayUploadPrioritizesTCPControlPackets(t *testing.T) {
t.Skip("retired: base VPN gateway uploads opaque packet batches without TCP control prioritization")
transport := &recordingGatewayTransport{}
gateway := &Gateway{Transport: transport, VPNConnectionID: "vpn-1"}
priorityPackets := make(chan []byte, 1)
@@ -160,6 +161,7 @@ func TestGatewayUploadPrioritizesTCPControlPackets(t *testing.T) {
}
func TestGatewayUploadPreemptsPendingNormalBatchForTCPControlPackets(t *testing.T) {
t.Skip("retired: base VPN gateway preserves packet batch order instead of preempting by TCP flags")
transport := &recordingGatewayTransport{}
gateway := &Gateway{Transport: transport, VPNConnectionID: "vpn-1"}
priorityPackets := make(chan []byte, 1)
@@ -201,6 +203,7 @@ func TestGatewayUploadPreemptsPendingNormalBatchForTCPControlPackets(t *testing.
}
func TestGatewayUploadMicroBatchesTCPControlPackets(t *testing.T) {
t.Skip("retired: base VPN gateway no longer creates protocol-specific TCP control microbatches")
transport := &recordingGatewayTransport{}
gateway := &Gateway{Transport: transport, VPNConnectionID: "vpn-1"}
priorityPackets := make(chan []byte, 2)
@@ -239,18 +242,3 @@ func TestGatewayUploadMicroBatchesTCPControlPackets(t *testing.T) {
}
}
}
func TestIsTCPControlPacket(t *testing.T) {
packet := testIPv4TCPPacket([4]byte{192, 168, 200, 95}, [4]byte{10, 77, 0, 2}, 3389, 51000)
if isTCPControlPacket(packet) {
t.Fatal("packet without control flags was classified as control")
}
packet[33] = 0x12
if !isTCPControlPacket(packet) {
t.Fatal("tcp syn-ack was not classified as control")
}
packet[9] = 17
if isTCPControlPacket(packet) {
t.Fatal("udp packet was classified as tcp control")
}
}
@@ -0,0 +1,208 @@
package vpnruntime
import (
"fmt"
"sort"
"sync"
"time"
)
// Constants for the fabric service stream registry.
const (
// FabricServiceStreamRegistrySchemaVersion is reported as "schema_version"
// in registry snapshots.
FabricServiceStreamRegistrySchemaVersion = "rap.fabric_service_stream_registry.v1"
// FabricServiceStreamStateOpen is the state given to a registered stream
// that does not specify one; snapshots count streams in this state.
FabricServiceStreamStateOpen = "open"
// FabricServiceStreamStateClosed is the state applied by MarkClosed.
FabricServiceStreamStateClosed = "closed"
// FabricServiceStreamStateReset is the state applied by MarkReset.
FabricServiceStreamStateReset = "reset"
)
// FabricServiceStream describes one stream tracked by
// FabricServiceStreamRegistry. Streams are keyed by (TunnelID, StreamID).
type FabricServiceStream struct {
// TunnelID identifies the owning tunnel; filled from ServiceTunnel when blank.
TunnelID string `json:"tunnel_id"`
// ServiceID identifies the service; filled from ServiceTunnel when blank.
ServiceID string `json:"service_id"`
// StreamID is the stream number within the tunnel.
StreamID uint64 `json:"stream_id"`
// TrafficClass is normalized on registration.
TrafficClass string `json:"traffic_class"`
// Direction is stored as supplied by the caller.
Direction string `json:"direction,omitempty"`
// State defaults to FabricServiceStreamStateOpen on registration.
State string `json:"state"`
// ServiceTunnel is the normalized tunnel description for this stream.
ServiceTunnel FabricServiceTunnel `json:"service_tunnel"`
// OpenedAt is set on first registration and kept on re-registration.
OpenedAt time.Time `json:"opened_at"`
// UpdatedAt is refreshed on every registration and state change.
UpdatedAt time.Time `json:"updated_at"`
// Metadata carries optional caller-defined key/value pairs.
Metadata map[string]string `json:"metadata,omitempty"`
}
// FabricServiceStreamRegistry is an in-memory table of service streams.
// Its methods guard the table with mu and tolerate a nil receiver.
type FabricServiceStreamRegistry struct {
// mu guards streams.
mu sync.RWMutex
// streams maps serviceStreamKey(tunnelID, streamID) to the stored stream.
streams map[string]FabricServiceStream
}
// NewFabricServiceStreamRegistry returns an empty registry whose stream table
// is already allocated.
func NewFabricServiceStreamRegistry() *FabricServiceStreamRegistry {
	registry := new(FabricServiceStreamRegistry)
	registry.streams = make(map[string]FabricServiceStream)
	return registry
}
// Register normalizes stream, stores it under (TunnelID, StreamID) and returns
// the stored value. A nil registry returns the zero FabricServiceStream.
//
// Normalization: the service tunnel is normalized with the stream's tunnel ID
// as fallback; blank TunnelID/ServiceID are filled from the tunnel; the
// traffic class is normalized; a blank State becomes
// FabricServiceStreamStateOpen. UpdatedAt is always set to the current UTC
// time. OpenedAt defaults to the current time, and when the key is already
// registered with a non-zero OpenedAt, that original value is kept.
//
// The Metadata map is copied before it is stored and again before it is
// returned, so neither the caller's input map nor the returned stream aliases
// state guarded by the registry mutex.
func (r *FabricServiceStreamRegistry) Register(stream FabricServiceStream) FabricServiceStream {
	if r == nil {
		return FabricServiceStream{}
	}
	now := time.Now().UTC()
	stream.ServiceTunnel = NormalizeServiceTunnel(stream.ServiceTunnel, stream.TunnelID)
	stream.TunnelID = firstNonEmptyTunnelString(stream.TunnelID, stream.ServiceTunnel.TunnelID)
	stream.ServiceID = firstNonEmptyTunnelString(stream.ServiceID, stream.ServiceTunnel.ServiceID)
	stream.TrafficClass = normalizeFabricTrafficClass(stream.TrafficClass)
	if stream.State == "" {
		stream.State = FabricServiceStreamStateOpen
	}
	if stream.OpenedAt.IsZero() {
		stream.OpenedAt = now
	}
	stream.UpdatedAt = now
	// Detach from the caller-owned map: the stored copy is later read under
	// r.mu, which the caller does not hold when mutating its own map.
	stream.Metadata = cloneStringMap(stream.Metadata)
	key := serviceStreamKey(stream.TunnelID, stream.StreamID)

	r.mu.Lock()
	defer r.mu.Unlock()
	if r.streams == nil {
		r.streams = map[string]FabricServiceStream{}
	}
	// Re-registration keeps the time the stream was first opened.
	if existing, ok := r.streams[key]; ok && !existing.OpenedAt.IsZero() {
		stream.OpenedAt = existing.OpenedAt
	}
	r.streams[key] = stream
	// Hand back a copy so the caller cannot mutate the stored metadata.
	return cloneFabricServiceStream(stream)
}
// MarkClosed sets the state of the stream identified by (tunnelID, streamID)
// to FabricServiceStreamStateClosed. Unknown streams are left untouched.
func (r *FabricServiceStreamRegistry) MarkClosed(tunnelID string, streamID uint64) {
	const closed = FabricServiceStreamStateClosed
	r.markState(tunnelID, streamID, closed)
}
// MarkReset sets the state of the stream identified by (tunnelID, streamID)
// to FabricServiceStreamStateReset. Unknown streams are left untouched.
func (r *FabricServiceStreamRegistry) MarkReset(tunnelID string, streamID uint64) {
	const reset = FabricServiceStreamStateReset
	r.markState(tunnelID, streamID, reset)
}
// StreamsForTunnel returns copies of every stream registered for tunnelID,
// ordered by ascending StreamID. It returns nil for a nil registry or an empty
// tunnelID, and an empty non-nil slice when the tunnel has no streams.
func (r *FabricServiceStreamRegistry) StreamsForTunnel(tunnelID string) []FabricServiceStream {
	if r == nil || tunnelID == "" {
		return nil
	}

	r.mu.RLock()
	defer r.mu.RUnlock()

	matches := []FabricServiceStream{}
	for _, candidate := range r.streams {
		if candidate.TunnelID != tunnelID {
			continue
		}
		matches = append(matches, cloneFabricServiceStream(candidate))
	}
	byStreamID := func(a, b int) bool {
		return matches[a].StreamID < matches[b].StreamID
	}
	sort.Slice(matches, byStreamID)
	return matches
}
// Snapshot returns a description of the registry with these keys:
//
//	"schema_version": FabricServiceStreamRegistrySchemaVersion
//	"stream_count":   number of registered streams (int)
//	"open_count":     number of streams in the open state (int)
//	"streams":        one map per stream
//
// Streams are ordered by ascending stream ID and then by tunnel ID. The table
// is keyed by (tunnel ID, stream ID), so this is a total order and the output
// is deterministic even when several tunnels reuse a stream ID. A nil registry
// reports the same keys with zero counts and an empty stream list.
//
// Each stream map always carries tunnel_id, service_id, stream_id,
// traffic_class, direction, state and service_tunnel; opened_at and updated_at
// (RFC 3339 with nanoseconds) and metadata appear only when set.
func (r *FabricServiceStreamRegistry) Snapshot() map[string]any {
	if r == nil {
		return map[string]any{
			"schema_version": FabricServiceStreamRegistrySchemaVersion,
			"stream_count":   0,
			"open_count":     0,
			"streams":        []map[string]any{},
		}
	}
	r.mu.RLock()
	defer r.mu.RUnlock()

	// Sort typed values first; map iteration order is random.
	ordered := make([]FabricServiceStream, 0, len(r.streams))
	for _, stream := range r.streams {
		ordered = append(ordered, stream)
	}
	sort.Slice(ordered, func(i, j int) bool {
		if ordered[i].StreamID != ordered[j].StreamID {
			return ordered[i].StreamID < ordered[j].StreamID
		}
		// Tie-break so streams sharing an ID across tunnels keep a fixed order.
		return ordered[i].TunnelID < ordered[j].TunnelID
	})

	items := make([]map[string]any, 0, len(ordered))
	openCount := 0
	for _, stream := range ordered {
		if stream.State == FabricServiceStreamStateOpen {
			openCount++
		}
		item := map[string]any{
			"tunnel_id":      stream.TunnelID,
			"service_id":     stream.ServiceID,
			"stream_id":      stream.StreamID,
			"traffic_class":  stream.TrafficClass,
			"direction":      stream.Direction,
			"state":          stream.State,
			"service_tunnel": stream.ServiceTunnel.Snapshot(),
		}
		if !stream.OpenedAt.IsZero() {
			item["opened_at"] = stream.OpenedAt.Format(time.RFC3339Nano)
		}
		if !stream.UpdatedAt.IsZero() {
			item["updated_at"] = stream.UpdatedAt.Format(time.RFC3339Nano)
		}
		if len(stream.Metadata) > 0 {
			// Copy so the snapshot does not alias registry state.
			item["metadata"] = cloneStringMap(stream.Metadata)
		}
		items = append(items, item)
	}
	return map[string]any{
		"schema_version": FabricServiceStreamRegistrySchemaVersion,
		"stream_count":   len(items),
		"open_count":     openCount,
		"streams":        items,
	}
}
// markState overwrites the state of the stream stored under
// (tunnelID, streamID) and refreshes its UpdatedAt to the current UTC time.
// It is a no-op for a nil registry, an empty tunnelID, a zero streamID, or a
// stream that is not registered.
func (r *FabricServiceStreamRegistry) markState(tunnelID string, streamID uint64, state string) {
	switch {
	case r == nil, tunnelID == "", streamID == 0:
		return
	}

	r.mu.Lock()
	defer r.mu.Unlock()

	key := serviceStreamKey(tunnelID, streamID)
	if current, found := r.streams[key]; found {
		current.State = state
		current.UpdatedAt = time.Now().UTC()
		r.streams[key] = current
	}
}
// serviceStreamKey builds the registry map key for a stream: the tunnel ID, a
// NUL byte separator, and the stream ID in decimal.
func serviceStreamKey(tunnelID string, streamID uint64) string {
	const separator = "\x00"
	return tunnelID + separator + fmt.Sprint(streamID)
}
// cloneFabricServiceStream returns a copy of stream that shares no mutable
// storage with the argument: both the Metadata map and the
// ServiceTunnel.TrafficClasses slice are duplicated. Without the slice copy, a
// caller writing into the returned traffic classes would also modify the
// value held by the registry.
func cloneFabricServiceStream(stream FabricServiceStream) FabricServiceStream {
	stream.Metadata = cloneStringMap(stream.Metadata)
	// append onto a nil slice allocates fresh storage (nil when empty).
	stream.ServiceTunnel.TrafficClasses = append([]string(nil), stream.ServiceTunnel.TrafficClasses...)
	return stream
}
// serviceStreamsSnapshotItems converts streams into snapshot maps sorted by
// ascending stream ID. It returns nil for an empty input.
//
// Each map always carries tunnel_id, service_id, stream_id, traffic_class,
// direction, state and service_tunnel; opened_at and updated_at (RFC 3339 with
// nanoseconds) and a copy of metadata are added only when set.
func serviceStreamsSnapshotItems(streams []FabricServiceStream) []map[string]any {
	count := len(streams)
	if count == 0 {
		return nil
	}

	items := make([]map[string]any, 0, count)
	for idx := range streams {
		current := streams[idx]
		entry := map[string]any{
			"tunnel_id":      current.TunnelID,
			"service_id":     current.ServiceID,
			"stream_id":      current.StreamID,
			"traffic_class":  current.TrafficClass,
			"direction":      current.Direction,
			"state":          current.State,
			"service_tunnel": current.ServiceTunnel.Snapshot(),
		}
		if opened := current.OpenedAt; !opened.IsZero() {
			entry["opened_at"] = opened.Format(time.RFC3339Nano)
		}
		if updated := current.UpdatedAt; !updated.IsZero() {
			entry["updated_at"] = updated.Format(time.RFC3339Nano)
		}
		if len(current.Metadata) > 0 {
			entry["metadata"] = cloneStringMap(current.Metadata)
		}
		items = append(items, entry)
	}

	// stream_id was stored above as a uint64, so the assertion always succeeds.
	streamIDOf := func(entry map[string]any) uint64 {
		id, _ := entry["stream_id"].(uint64)
		return id
	}
	sort.Slice(items, func(i, j int) bool {
		return streamIDOf(items[i]) < streamIDOf(items[j])
	})
	return items
}
// cloneStringMap returns an independent copy of values. A nil or empty map
// yields nil.
func cloneStringMap(values map[string]string) map[string]string {
	size := len(values)
	if size == 0 {
		return nil
	}
	duplicate := make(map[string]string, size)
	for k := range values {
		duplicate[k] = values[k]
	}
	return duplicate
}
@@ -0,0 +1,44 @@
package vpnruntime
import "testing"
// TestFabricServiceStreamRegistryTracksTunnelScopedStreams registers one
// stream, then checks the defaults applied on registration, lookup by tunnel,
// the effect of MarkClosed, and the counters reported by Snapshot.
func TestFabricServiceStreamRegistryTracksTunnelScopedStreams(t *testing.T) {
	const (
		tunnelID = "tunnel-1"
		streamID = 42
	)
	registry := NewFabricServiceStreamRegistry()

	registered := registry.Register(FabricServiceStream{
		TunnelID:     tunnelID,
		ServiceID:    "svc-1",
		StreamID:     streamID,
		TrafficClass: FabricServiceTrafficInteractive,
		Direction:    FabricDirectionClientToGateway,
		ServiceTunnel: FabricServiceTunnel{
			TunnelID:    tunnelID,
			PoolID:      "pool-vpn",
			ServiceID:   "svc-1",
			ServiceKind: "ipv4-tunnel",
		},
		Metadata: map[string]string{"adapter": "vpn"},
	})
	if registered.State != FabricServiceStreamStateOpen {
		t.Fatalf("stream state = %q, want open", registered.State)
	}
	if registered.ServiceTunnel.TransportOwner != DefaultFabricTransportOwner {
		t.Fatalf("service tunnel should remain fabric-owned: %+v", registered.ServiceTunnel)
	}

	// Lookup by tunnel returns exactly the registered stream.
	streams := registry.StreamsForTunnel(tunnelID)
	if len(streams) != 1 || streams[0].StreamID != 42 || streams[0].ServiceID != "svc-1" {
		t.Fatalf("streams for tunnel = %+v", streams)
	}

	// Closing keeps the stream in the registry but changes its state.
	registry.MarkClosed(tunnelID, streamID)
	streams = registry.StreamsForTunnel(tunnelID)
	if len(streams) != 1 || streams[0].State != FabricServiceStreamStateClosed {
		t.Fatalf("closed stream not tracked: %+v", streams)
	}

	snapshot := registry.Snapshot()
	if snapshot["schema_version"] != FabricServiceStreamRegistrySchemaVersion ||
		snapshot["stream_count"] != 1 ||
		snapshot["open_count"] != 0 {
		t.Fatalf("unexpected registry snapshot: %+v", snapshot)
	}
}
@@ -0,0 +1,179 @@
package vpnruntime
import "strings"
// Defaults and vocabulary for fabric service tunnels. The Default* values are
// what DefaultVPNServiceTunnelDefaults returns and therefore what
// NormalizeServiceTunnel fills into blank fields.
const (
// Default pool, service kind, class and role for a VPN tunnel.
DefaultFabricTunnelPoolID = "ipv4-egress"
DefaultFabricTunnelServiceKind = "ipv4-tunnel"
DefaultFabricTunnelClass = "vpn_packets"
DefaultFabricTunnelRole = "ipv4-egress"
// Default data plane, transport owner and route visibility labels.
DefaultFabricTunnelDataPlane = "fabric_quic_streams"
DefaultFabricTransportOwner = "fabric_farm"
DefaultFabricRouteVisibility = "opaque_to_service"
// FabricServiceTunnelSchemaVersion is reported as "schema_version" in
// FabricServiceTunnel snapshots.
FabricServiceTunnelSchemaVersion = "rap.fabric_service_tunnel.v1"
// Traffic class names; all six form the default traffic class list.
FabricServiceTrafficControl = "control"
FabricServiceTrafficDNS = "dns"
FabricServiceTrafficInteractive = "interactive"
FabricServiceTrafficReliable = "reliable"
FabricServiceTrafficBulk = "bulk"
FabricServiceTrafficDroppable = "droppable"
// DefaultFabricServiceStreamShards is used when StreamShards is not positive.
DefaultFabricServiceStreamShards = 8
)
// FabricServiceTunnel describes a service tunnel carried over the fabric.
// Blank fields are filled in by NormalizeServiceTunnelWithDefaults.
type FabricServiceTunnel struct {
// TunnelID identifies the tunnel; normalization falls back to a caller-supplied ID.
TunnelID string `json:"tunnel_id"`
PoolID string `json:"pool_id"`
// ServiceID, LocalServiceID and RemoteServiceID default to values derived
// from TunnelID ("svc-", "svc-local-", "svc-remote-" prefixes).
ServiceID string `json:"service_id"`
LocalServiceID string `json:"local_service_id"`
RemoteServiceID string `json:"remote_service_id"`
ServiceKind string `json:"service_kind"`
ServiceClass string `json:"service_class"`
ServiceRole string `json:"service_role"`
// RouteLeaseID and RouteGeneration are passed through unchanged by normalization.
RouteLeaseID string `json:"route_lease_id,omitempty"`
RouteGeneration string `json:"route_generation,omitempty"`
DataPlane string `json:"data_plane,omitempty"`
TransportOwner string `json:"transport_owner,omitempty"`
RouteVisibility string `json:"route_visibility,omitempty"`
// TrafficClasses is trimmed and de-duplicated by normalization.
TrafficClasses []string `json:"traffic_classes,omitempty"`
// StreamShards is replaced by the default when it is not positive.
StreamShards int `json:"stream_shards,omitempty"`
}
// FabricServiceTunnelDefaults holds the values that
// NormalizeServiceTunnelWithDefaults substitutes for blank tunnel fields.
// Blank entries here fall back to DefaultVPNServiceTunnelDefaults.
type FabricServiceTunnelDefaults struct {
PoolID string
ServiceKind string
ServiceClass string
ServiceRole string
DataPlane string
TransportOwner string
RouteVisibility string
// TrafficClasses is used when the tunnel lists no usable traffic classes.
TrafficClasses []string
// StreamShards is used when the tunnel's StreamShards is not positive.
StreamShards int
}
// NormalizeServiceTunnel normalizes tunnel using the VPN profile returned by
// DefaultVPNServiceTunnelDefaults. fallbackID is used as the tunnel ID when
// the tunnel does not carry one.
func NormalizeServiceTunnel(tunnel FabricServiceTunnel, fallbackID string) FabricServiceTunnel {
	vpnDefaults := DefaultVPNServiceTunnelDefaults()
	return NormalizeServiceTunnelWithDefaults(tunnel, fallbackID, vpnDefaults)
}
// NormalizeServiceTunnelWithDefaults returns a copy of tunnel with every
// string field whitespace-trimmed and every blank field filled in:
//
//   - TunnelID falls back to fallbackID.
//   - ServiceID, LocalServiceID and RemoteServiceID fall back to "svc-",
//     "svc-local-" and "svc-remote-" followed by the tunnel ID. When the
//     tunnel ID itself is empty they are left empty, instead of becoming a
//     bare prefix that would pass a non-empty check.
//   - The remaining string fields fall back to defaults, which is itself
//     completed from DefaultVPNServiceTunnelDefaults first.
//   - TrafficClasses is trimmed and de-duplicated, or replaced by a copy of
//     the default list when nothing usable remains.
//   - StreamShards is replaced by the default when it is not positive.
//
// RouteLeaseID and RouteGeneration are returned unchanged.
func NormalizeServiceTunnelWithDefaults(tunnel FabricServiceTunnel, fallbackID string, defaults FabricServiceTunnelDefaults) FabricServiceTunnel {
	defaults = normalizeServiceTunnelDefaults(defaults)

	// firstNonEmptyTunnelString already trims its result, so no further
	// trimming is needed on any of the assignments below.
	tunnel.TunnelID = firstNonEmptyTunnelString(tunnel.TunnelID, fallbackID)

	// derivedID builds a tunnel-scoped identifier, or "" without a tunnel ID.
	derivedID := func(prefix string) string {
		if tunnel.TunnelID == "" {
			return ""
		}
		return prefix + tunnel.TunnelID
	}

	tunnel.PoolID = firstNonEmptyTunnelString(tunnel.PoolID, defaults.PoolID)
	tunnel.ServiceID = firstNonEmptyTunnelString(tunnel.ServiceID, derivedID("svc-"))
	tunnel.LocalServiceID = firstNonEmptyTunnelString(tunnel.LocalServiceID, derivedID("svc-local-"))
	tunnel.RemoteServiceID = firstNonEmptyTunnelString(tunnel.RemoteServiceID, derivedID("svc-remote-"))
	tunnel.ServiceKind = firstNonEmptyTunnelString(tunnel.ServiceKind, defaults.ServiceKind)
	tunnel.ServiceClass = firstNonEmptyTunnelString(tunnel.ServiceClass, defaults.ServiceClass)
	tunnel.ServiceRole = firstNonEmptyTunnelString(tunnel.ServiceRole, defaults.ServiceRole)
	tunnel.DataPlane = firstNonEmptyTunnelString(tunnel.DataPlane, defaults.DataPlane)
	tunnel.TransportOwner = firstNonEmptyTunnelString(tunnel.TransportOwner, defaults.TransportOwner)
	tunnel.RouteVisibility = firstNonEmptyTunnelString(tunnel.RouteVisibility, defaults.RouteVisibility)
	tunnel.TrafficClasses = normalizeTunnelTrafficClasses(tunnel.TrafficClasses, defaults.TrafficClasses)
	if tunnel.StreamShards <= 0 {
		tunnel.StreamShards = defaults.StreamShards
	}
	return tunnel
}
// Snapshot returns the tunnel as a map keyed by its JSON field names, plus
// "schema_version" and "selected_node_known" (always false). The tunnel is
// normalized first, using its own values as the preferred defaults, so blank
// fields are reported with their effective values. The traffic class list in
// the result is a copy.
func (t FabricServiceTunnel) Snapshot() map[string]any {
	ownValues := FabricServiceTunnelDefaults{
		PoolID:          t.PoolID,
		ServiceKind:     t.ServiceKind,
		ServiceClass:    t.ServiceClass,
		ServiceRole:     t.ServiceRole,
		DataPlane:       t.DataPlane,
		TransportOwner:  t.TransportOwner,
		RouteVisibility: t.RouteVisibility,
		TrafficClasses:  t.TrafficClasses,
		StreamShards:    t.StreamShards,
	}
	effective := NormalizeServiceTunnelWithDefaults(t, t.TunnelID, ownValues)
	trafficClasses := append([]string(nil), effective.TrafficClasses...)

	snapshot := make(map[string]any, 17)
	snapshot["schema_version"] = FabricServiceTunnelSchemaVersion
	snapshot["tunnel_id"] = effective.TunnelID
	snapshot["pool_id"] = effective.PoolID
	snapshot["service_id"] = effective.ServiceID
	snapshot["local_service_id"] = effective.LocalServiceID
	snapshot["remote_service_id"] = effective.RemoteServiceID
	snapshot["service_kind"] = effective.ServiceKind
	snapshot["service_class"] = effective.ServiceClass
	snapshot["service_role"] = effective.ServiceRole
	snapshot["route_lease_id"] = effective.RouteLeaseID
	snapshot["route_generation"] = effective.RouteGeneration
	snapshot["data_plane"] = effective.DataPlane
	snapshot["transport_owner"] = effective.TransportOwner
	snapshot["route_visibility"] = effective.RouteVisibility
	snapshot["traffic_classes"] = trafficClasses
	snapshot["stream_shards"] = effective.StreamShards
	snapshot["selected_node_known"] = false
	return snapshot
}
// DefaultVPNServiceTunnelDefaults returns the default profile for a VPN
// tunnel, built from the Default* constants. Every call returns a freshly
// allocated traffic class list containing all six traffic classes.
func DefaultVPNServiceTunnelDefaults() FabricServiceTunnelDefaults {
	var profile FabricServiceTunnelDefaults
	profile.PoolID = DefaultFabricTunnelPoolID
	profile.ServiceKind = DefaultFabricTunnelServiceKind
	profile.ServiceClass = DefaultFabricTunnelClass
	profile.ServiceRole = DefaultFabricTunnelRole
	profile.DataPlane = DefaultFabricTunnelDataPlane
	profile.TransportOwner = DefaultFabricTransportOwner
	profile.RouteVisibility = DefaultFabricRouteVisibility
	profile.TrafficClasses = []string{
		FabricServiceTrafficControl,
		FabricServiceTrafficDNS,
		FabricServiceTrafficInteractive,
		FabricServiceTrafficReliable,
		FabricServiceTrafficBulk,
		FabricServiceTrafficDroppable,
	}
	profile.StreamShards = DefaultFabricServiceStreamShards
	return profile
}
// normalizeServiceTunnelDefaults completes defaults from the VPN profile:
// blank string fields take the VPN value (non-blank ones are trimmed), the
// traffic class list is normalized against the VPN list, and a non-positive
// StreamShards takes the VPN shard count.
func normalizeServiceTunnelDefaults(defaults FabricServiceTunnelDefaults) FabricServiceTunnelDefaults {
	vpn := DefaultVPNServiceTunnelDefaults()

	stringFields := []struct {
		target   *string
		fallback string
	}{
		{&defaults.PoolID, vpn.PoolID},
		{&defaults.ServiceKind, vpn.ServiceKind},
		{&defaults.ServiceClass, vpn.ServiceClass},
		{&defaults.ServiceRole, vpn.ServiceRole},
		{&defaults.DataPlane, vpn.DataPlane},
		{&defaults.TransportOwner, vpn.TransportOwner},
		{&defaults.RouteVisibility, vpn.RouteVisibility},
	}
	for _, field := range stringFields {
		*field.target = firstNonEmptyTunnelString(*field.target, field.fallback)
	}

	defaults.TrafficClasses = normalizeTunnelTrafficClasses(defaults.TrafficClasses, vpn.TrafficClasses)
	if defaults.StreamShards <= 0 {
		defaults.StreamShards = vpn.StreamShards
	}
	return defaults
}
// normalizeTunnelTrafficClasses trims each entry of values, drops blanks and
// duplicates (keeping first-seen order), and returns the result. When nothing
// survives — including when values is empty — it returns a copy of fallback.
func normalizeTunnelTrafficClasses(values []string, fallback []string) []string {
	cleaned := make([]string, 0, len(values))
	known := map[string]bool{}
	for _, raw := range values {
		class := strings.TrimSpace(raw)
		if class == "" || known[class] {
			continue
		}
		known[class] = true
		cleaned = append(cleaned, class)
	}
	if len(cleaned) > 0 {
		return cleaned
	}
	// Copy so the caller cannot alias the fallback slice.
	return append([]string(nil), fallback...)
}
// firstNonEmptyTunnelString returns the first argument that is non-empty after
// whitespace trimming, in trimmed form, or "" when every argument is blank.
func firstNonEmptyTunnelString(values ...string) string {
	for i := 0; i < len(values); i++ {
		candidate := strings.TrimSpace(values[i])
		if candidate == "" {
			continue
		}
		return candidate
	}
	return ""
}
@@ -0,0 +1,46 @@
package vpnruntime
import "testing"
// TestNormalizeServiceTunnelKeepsVPNAsProfileNotTransportRule normalizes an
// empty tunnel with only an ID and checks that the package defaults are
// filled in for identity, transport ownership, data plane and traffic classes.
func TestNormalizeServiceTunnelKeepsVPNAsProfileNotTransportRule(t *testing.T) {
	const tunnelID = "vpn-tunnel-1"
	got := NormalizeServiceTunnel(FabricServiceTunnel{}, tunnelID)

	if got.TunnelID != tunnelID {
		t.Fatalf("tunnel id = %q", got.TunnelID)
	}

	identityOK := got.ServiceKind == DefaultFabricTunnelServiceKind &&
		got.ServiceClass == DefaultFabricTunnelClass
	if !identityOK {
		t.Fatalf("vpn defaults not applied: %+v", got)
	}

	ownershipOK := got.TransportOwner == DefaultFabricTransportOwner &&
		got.RouteVisibility == DefaultFabricRouteVisibility
	if !ownershipOK {
		t.Fatalf("transport ownership defaults not applied: %+v", got)
	}

	dataPlaneOK := got.DataPlane == DefaultFabricTunnelDataPlane &&
		got.StreamShards == DefaultFabricServiceStreamShards
	if !dataPlaneOK {
		t.Fatalf("data plane defaults not applied: %+v", got)
	}

	if classCount := len(got.TrafficClasses); classCount < 5 {
		t.Fatalf("traffic classes too small: %+v", got.TrafficClasses)
	}
}
func TestNormalizeServiceTunnelSupportsNonVPNService(t *testing.T) {
tunnel := NormalizeServiceTunnelWithDefaults(FabricServiceTunnel{}, "rdp-tunnel-1", FabricServiceTunnelDefaults{
PoolID: "desktop-exit",
ServiceKind: "rdp-client",
ServiceClass: "remote_desktop",
ServiceRole: "desktop-egress",
TrafficClasses: []string{
FabricServiceTrafficControl,
FabricServiceTrafficInteractive,
FabricServiceTrafficBulk,
},
StreamShards: 8,
})
if tunnel.TunnelID != "rdp-tunnel-1" || tunnel.PoolID != "desktop-exit" || tunnel.ServiceKind != "rdp-client" {
t.Fatalf("non-vpn tunnel defaults not applied: %+v", tunnel)
}
if tunnel.ServiceClass != "remote_desktop" || tunnel.ServiceRole != "desktop-egress" {
t.Fatalf("non-vpn service identity not applied: %+v", tunnel)
}
if tunnel.StreamShards != 8 || len(tunnel.TrafficClasses) != 3 {
t.Fatalf("non-vpn stream policy not applied: %+v", tunnel)
}
}
@@ -19,8 +19,8 @@ const (
iffNoPI = 0x1000
tunSetIFF = 0x400454ca
ifNameSize = 16
gatewayTunMTU = "1000"
gatewayTCPMSS = "900"
gatewayTunMTU = "1280"
gatewayTCPMSS = "1240"
)
type tunDevice struct {
@@ -77,14 +77,10 @@ func (d AdminRuntimeDispatcher) HandleFabricRequest(ctx context.Context, request
func allowedAdminRuntimeScope(scope string, serviceClass string) bool {
switch serviceClass {
case "platform_admin":
return scope == "platform"
case "cluster_admin":
return scope == "cluster"
case "organization_portal":
return scope == "organization"
case "user_portal":
return scope == "user" || scope == "organization"
case "admin-ingress":
return scope == "platform" || scope == "cluster"
case "public-ingress":
return scope == "organization" || scope == "user"
default:
return false
}
@@ -143,18 +139,22 @@ func (d AdminRuntimeDispatcher) manifest(request FabricRequest) map[string]any {
sections := []string{}
actions := []string{}
switch serviceClass {
case "platform_admin":
case "admin-ingress":
sections = []string{"clusters", "nodes", "roles", "fabric", "workloads", "audit"}
actions = []string{"read_platform_summary", "read_cluster_summaries", "read_node_status"}
case "cluster_admin":
sections = []string{"cluster", "nodes", "fabric", "workloads", "audit"}
actions = []string{"read_cluster_summary", "read_node_status"}
case "organization_portal":
if request.Scope == "cluster" {
sections = []string{"cluster", "nodes", "fabric", "workloads", "audit"}
actions = []string{"read_cluster_summary", "read_node_status"}
} else {
actions = []string{"read_platform_summary", "read_cluster_summaries", "read_node_status"}
}
case "public-ingress":
sections = []string{"organization", "sessions", "resources", "audit"}
actions = []string{"read_organization_summary", "read_sessions"}
case "user_portal":
sections = []string{"profile", "sessions", "resources"}
actions = []string{"read_profile", "read_sessions"}
if request.Scope == "user" {
sections = []string{"profile", "sessions", "resources"}
actions = []string{"read_profile", "read_sessions"}
} else {
actions = []string{"read_organization_summary", "read_sessions"}
}
default:
sections = []string{"status"}
actions = []string{"read_status"}
@@ -14,7 +14,7 @@ func TestAdminRuntimeDispatcherReturnsHealthAndManifest(t *testing.T) {
Method: http.MethodGet,
Path: "/readyz",
Scope: "platform",
ServiceClass: "platform_admin",
ServiceClass: "admin-ingress",
})
if err != nil {
t.Fatalf("health: %v", err)
@@ -25,9 +25,9 @@ func TestAdminRuntimeDispatcherReturnsHealthAndManifest(t *testing.T) {
manifest, err := dispatcher.HandleFabricRequest(context.Background(), FabricRequest{
Method: http.MethodGet,
Path: "/platform-admin/ui-manifest",
Path: "/admin/ui-manifest",
Scope: "platform",
ServiceClass: "platform_admin",
ServiceClass: "admin-ingress",
})
if err != nil {
t.Fatalf("manifest: %v", err)
@@ -51,9 +51,9 @@ func TestAdminRuntimeDispatcherBlocksMutationsAndUnknownProjection(t *testing.T)
mutation, err := dispatcher.HandleFabricRequest(context.Background(), FabricRequest{
Method: http.MethodPost,
Path: "/platform-admin/nodes",
Path: "/admin/nodes",
Scope: "platform",
ServiceClass: "platform_admin",
ServiceClass: "admin-ingress",
})
if err != nil {
t.Fatalf("mutation: %v", err)
@@ -68,9 +68,9 @@ func TestAdminRuntimeDispatcherBlocksMutationsAndUnknownProjection(t *testing.T)
projection, err := dispatcher.HandleFabricRequest(context.Background(), FabricRequest{
Method: http.MethodGet,
Path: "/platform-admin/nodes",
Path: "/admin/nodes",
Scope: "platform",
ServiceClass: "platform_admin",
ServiceClass: "admin-ingress",
})
if err != nil {
t.Fatalf("projection: %v", err)
@@ -88,9 +88,9 @@ func TestAdminRuntimeDispatcherRejectsInvalidScopeClassPair(t *testing.T) {
dispatcher := AdminRuntimeDispatcher{ProjectionClient: &recordingProjectionClient{}, Now: fixedEnvelopeNow}
response, err := dispatcher.HandleFabricRequest(context.Background(), FabricRequest{
Method: http.MethodGet,
Path: "/platform-admin/ui-manifest",
Path: "/admin/ui-manifest",
Scope: "organization",
ServiceClass: "platform_admin",
ServiceClass: "admin-ingress",
})
if err != nil {
t.Fatalf("projection: %v", err)
@@ -118,11 +118,11 @@ func TestAdminRuntimeDispatcherUsesControlAPIProjectionClientForReadRequests(t *
response, err := dispatcher.HandleFabricRequest(context.Background(), FabricRequest{
Method: http.MethodGet,
Path: "/platform-admin/nodes",
Path: "/admin/nodes",
Query: "limit=10",
Host: "admin.example.test",
Scope: "platform",
ServiceClass: "platform_admin",
ServiceClass: "admin-ingress",
})
if err != nil {
t.Fatalf("projection: %v", err)
@@ -133,10 +133,10 @@ func TestAdminRuntimeDispatcherUsesControlAPIProjectionClientForReadRequests(t *
string(response.Body) != `{"schema_version":"control.projection.v1","ok":true}` {
t.Fatalf("response = %+v body=%s", response, string(response.Body))
}
if client.request.Path != "/platform-admin/nodes" ||
if client.request.Path != "/admin/nodes" ||
client.request.Query != "limit=10" ||
client.request.Scope != "platform" ||
client.request.ServiceClass != "platform_admin" {
client.request.ServiceClass != "admin-ingress" {
t.Fatalf("request = %+v", client.request)
}
}
@@ -145,9 +145,9 @@ func TestAdminRuntimeDispatcherReportsProjectionClientFailure(t *testing.T) {
dispatcher := AdminRuntimeDispatcher{ProjectionClient: failingProjectionClient{}, Now: fixedEnvelopeNow}
response, err := dispatcher.HandleFabricRequest(context.Background(), FabricRequest{
Method: http.MethodGet,
Path: "/platform-admin/nodes",
Path: "/admin/nodes",
Scope: "platform",
ServiceClass: "platform_admin",
ServiceClass: "admin-ingress",
})
if err != nil {
t.Fatalf("projection: %v", err)
@@ -175,9 +175,9 @@ func TestAdminRuntimeDispatcherRejectsInvalidProjectionResponseSchema(t *testing
}
response, err := dispatcher.HandleFabricRequest(context.Background(), FabricRequest{
Method: http.MethodGet,
Path: "/platform-admin/nodes",
Path: "/admin/nodes",
Scope: "platform",
ServiceClass: "platform_admin",
ServiceClass: "admin-ingress",
})
if err != nil {
t.Fatalf("projection: %v", err)
@@ -13,7 +13,6 @@ import (
type ListenerConfig struct {
RuntimeConfig
HTTPAddr string
HTTPSAddr string
TLSCertFile string
TLSKeyFile string
@@ -23,9 +22,7 @@ type ListenerConfig struct {
type ListenerStatus struct {
SchemaVersion string `json:"schema_version"`
Running bool `json:"running"`
HTTPRunning bool `json:"http_running"`
HTTPSRunning bool `json:"https_running"`
HTTPAddr string `json:"http_addr,omitempty"`
HTTPSAddr string `json:"https_addr,omitempty"`
Reason string `json:"reason,omitempty"`
Errors []string `json:"errors,omitempty"`
@@ -34,7 +31,6 @@ type ListenerStatus struct {
type Manager struct {
mu sync.Mutex
http *http.Server
https *http.Server
status ListenerStatus
now func() time.Time
@@ -56,19 +52,9 @@ func (m *Manager) Apply(ctx context.Context, cfg ListenerConfig) ListenerStatus
ObservedAt: m.observedAt(),
}
errorsOut := []string{}
if strings.TrimSpace(cfg.HTTPAddr) == "" {
cfg.HTTPAddr = ":80"
}
if strings.TrimSpace(cfg.HTTPSAddr) == "" {
cfg.HTTPSAddr = ":443"
}
if server, addr, err := startHTTPServer(ctx, cfg.HTTPAddr, runtime.HTTPHandler()); err == nil {
m.http = server
status.HTTPRunning = true
status.HTTPAddr = addr
} else {
errorsOut = append(errorsOut, "http:"+err.Error())
}
if cfg.TLSCertFile == "" || cfg.TLSKeyFile == "" {
errorsOut = append(errorsOut, "https:tls_cert_file_and_key_file_required")
} else if server, addr, err := startHTTPSServer(ctx, cfg.HTTPSAddr, cfg.TLSCertFile, cfg.TLSKeyFile, runtime.HTTPSHandler()); err == nil {
@@ -78,7 +64,7 @@ func (m *Manager) Apply(ctx context.Context, cfg ListenerConfig) ListenerStatus
} else {
errorsOut = append(errorsOut, "https:"+err.Error())
}
status.Running = status.HTTPRunning || status.HTTPSRunning
status.Running = status.HTTPSRunning
if len(errorsOut) > 0 {
status.Errors = errorsOut
if status.Running {
@@ -118,10 +104,6 @@ func (m *Manager) Status() ListenerStatus {
func (m *Manager) stopLocked(ctx context.Context) error {
var out error
if m.http != nil {
out = errors.Join(out, m.http.Shutdown(ctx))
m.http = nil
}
if m.https != nil {
out = errors.Join(out, m.https.Shutdown(ctx))
m.https = nil
@@ -137,24 +119,6 @@ func (m *Manager) observedAt() string {
return now.Format(time.RFC3339Nano)
}
// startHTTPServer binds a TCP listener on addr and serves handler on it in the
// background.
//
// It returns the running server and the address actually bound (useful when
// addr requests port 0), or the listen error. The server is shut down when ctx
// is cancelled.
//
// The context watcher also exits once Serve has returned, so stopping the
// server by other means (for example an explicit Shutdown by the owner) does
// not leave a goroutine parked on a context that is never cancelled.
func startHTTPServer(ctx context.Context, addr string, handler http.Handler) (*http.Server, string, error) {
	listener, err := net.Listen("tcp", addr)
	if err != nil {
		return nil, "", err
	}
	server := &http.Server{Handler: handler, ReadHeaderTimeout: 5 * time.Second}

	// served is closed when the Serve loop ends for any reason.
	served := make(chan struct{})
	go func() {
		defer close(served)
		if err := server.Serve(listener); err != nil && !errors.Is(err, http.ErrServerClosed) {
			// Unexpected serve failure: release the listener and connections.
			_ = server.Close()
		}
	}()
	go func() {
		select {
		case <-ctx.Done():
			_ = server.Shutdown(context.Background())
		case <-served:
			// Server already stopped; nothing left to watch.
		}
	}()
	return server, listener.Addr().String(), nil
}
func startHTTPSServer(ctx context.Context, addr, certFile, keyFile string, handler http.Handler) (*http.Server, string, error) {
cert, err := tls.LoadX509KeyPair(certFile, keyFile)
if err != nil {
@@ -8,7 +8,6 @@ import (
"crypto/x509/pkix"
"encoding/pem"
"math/big"
"net/http"
"os"
"path/filepath"
"strings"
@@ -16,37 +15,6 @@ import (
"time"
)
// TestManagerStartsHTTPRedirectAndStops applies a config without TLS material
// and expects only the HTTP listener to run (status "partial"), a 308 from the
// HTTP listener, and a clean "stopped" status after Stop.
func TestManagerStartsHTTPRedirectAndStops(t *testing.T) {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	manager := NewManager()
	cfg := ListenerConfig{
		RuntimeConfig: RuntimeConfig{ServiceType: "admin-ingress", Scope: "platform", ServiceClasses: []string{"platform_admin"}},
		HTTPAddr:      "127.0.0.1:0",
		HTTPSAddr:     "127.0.0.1:0",
	}
	status := manager.Apply(ctx, cfg)

	httpOnly := status.HTTPRunning && !status.HTTPSRunning && status.Running && status.HTTPAddr != ""
	if !httpOnly {
		t.Fatalf("status = %+v", status)
	}
	partial := status.Reason == "partial" && containsError(status.Errors, "https:tls_cert_file_and_key_file_required")
	if !partial {
		t.Fatalf("status = %+v", status)
	}

	// Do not follow the redirect; the test inspects the 308 itself.
	noFollow := func(*http.Request, []*http.Request) error { return http.ErrUseLastResponse }
	client := &http.Client{CheckRedirect: noFollow}
	resp, err := client.Get("http://" + status.HTTPAddr + "/cluster-admin")
	if err != nil {
		t.Fatalf("http get: %v", err)
	}
	_ = resp.Body.Close()
	if resp.StatusCode != http.StatusPermanentRedirect {
		t.Fatalf("status = %d", resp.StatusCode)
	}

	if stopped := manager.Stop(context.Background()); stopped.Running || stopped.Reason != "stopped" {
		t.Fatalf("stopped = %+v", stopped)
	}
}
func TestManagerStartsHTTPSWhenCertificateProvided(t *testing.T) {
dir := t.TempDir()
certFile, keyFile := writeSelfSignedCert(t, dir)
@@ -56,12 +24,29 @@ func TestManagerStartsHTTPSWhenCertificateProvided(t *testing.T) {
status := manager.Apply(ctx, ListenerConfig{
RuntimeConfig: RuntimeConfig{ServiceType: "admin-ingress", Scope: "platform", ServiceClasses: []string{"platform_admin"}},
HTTPAddr: "127.0.0.1:0",
HTTPSAddr: "127.0.0.1:0",
TLSCertFile: certFile,
TLSKeyFile: keyFile,
})
if !status.HTTPRunning || !status.HTTPSRunning || status.HTTPAddr == "" || status.HTTPSAddr == "" || len(status.Errors) != 0 {
if !status.HTTPSRunning || !status.Running || status.HTTPSAddr == "" || len(status.Errors) != 0 {
t.Fatalf("status = %+v", status)
}
}
// TestManagerDoesNotStartHTTPWithoutExplicitAddress applies a config that has
// TLS material and an HTTPS address only, and expects the HTTPS listener to be
// running with no errors reported.
func TestManagerDoesNotStartHTTPWithoutExplicitAddress(t *testing.T) {
	certFile, keyFile := writeSelfSignedCert(t, t.TempDir())

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	cfg := ListenerConfig{
		RuntimeConfig: RuntimeConfig{ServiceType: "admin-ingress", Scope: "platform", ServiceClasses: []string{"platform_admin"}},
		HTTPSAddr:     "127.0.0.1:0",
		TLSCertFile:   certFile,
		TLSKeyFile:    keyFile,
	}
	status := NewManager().Apply(ctx, cfg)

	healthy := status.HTTPSRunning && status.Running && status.HTTPSAddr != "" && len(status.Errors) == 0
	if !healthy {
		t.Fatalf("status = %+v", status)
	}
}
@@ -14,7 +14,6 @@ type RuntimeConfig struct {
Scope string
ServiceClasses []string
TLSMode string
HTTPPort int
HTTPSPort int
}
@@ -59,23 +58,6 @@ type Response struct {
ObservedAt string `json:"observed_at"`
}
// HTTPHandler returns the plain-HTTP handler. ACME challenge paths are
// answered with 404, /healthz and /readyz with 200, and every other request is
// redirected (308, no-store) to the same host and request URI under https.
func (r Runtime) HTTPHandler() http.Handler {
	serve := func(w http.ResponseWriter, req *http.Request) {
		path := req.URL.Path
		switch {
		case strings.HasPrefix(path, "/.well-known/acme-challenge/"):
			writeJSON(w, http.StatusNotFound, r.response("not_found", "acme_challenge_backend_not_configured", ""))
		case path == "/healthz" || path == "/readyz":
			writeJSON(w, http.StatusOK, r.response("ready", "http_redirect_runtime_ready", ""))
		default:
			headers := w.Header()
			headers.Set("Location", "https://"+req.Host+req.URL.RequestURI())
			headers.Set("Cache-Control", "no-store")
			w.WriteHeader(http.StatusPermanentRedirect)
		}
	}
	return http.HandlerFunc(serve)
}
func (r Runtime) HTTPSHandler() http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
if req.URL.Path == "/healthz" || req.URL.Path == "/readyz" {
@@ -98,7 +80,7 @@ func (r Runtime) HTTPSHandler() http.Handler {
writeJSON(w, http.StatusNotImplemented, r.response("blocked", "fabric_service_channel_binding_not_implemented", serviceClass))
return
}
scope := scopeForServiceClass(serviceClass, r.Config.Scope)
scope := scopeForServiceClass(serviceClass, req.URL.Path, r.Config.Scope)
body, err := io.ReadAll(http.MaxBytesReader(w, req.Body, 1<<20))
if err != nil {
writeJSON(w, http.StatusRequestEntityTooLarge, r.response("blocked", "request_body_too_large", serviceClass))
@@ -146,32 +128,38 @@ func (r Runtime) response(status, reason, serviceClass string) Response {
}
}
func scopeForServiceClass(serviceClass string, fallback string) string {
func scopeForServiceClass(serviceClass string, path string, fallback string) string {
path = strings.Trim(strings.ToLower(path), "/")
switch strings.TrimSpace(serviceClass) {
case "platform_admin":
return "platform"
case "cluster_admin":
return "cluster"
case "organization_portal":
return "organization"
case "user_portal":
return "user"
case "admin-ingress":
if strings.HasPrefix(path, "clusters/") {
return "cluster"
}
return firstNonEmpty(strings.TrimSpace(fallback), "platform")
case "public-ingress":
if strings.HasPrefix(path, "users/") {
return "user"
}
return firstNonEmpty(strings.TrimSpace(fallback), "organization")
default:
return strings.TrimSpace(fallback)
}
}
// firstNonEmpty returns value with surrounding whitespace removed, or the
// trimmed fallback when value is blank.
func firstNonEmpty(value string, fallback string) string {
	if trimmed := strings.TrimSpace(value); trimmed != "" {
		return trimmed
	}
	return strings.TrimSpace(fallback)
}
func serviceClassFromPath(path string) string {
path = strings.Trim(strings.ToLower(path), "/")
switch {
case strings.HasPrefix(path, "platform-admin"):
return "platform_admin"
case strings.HasPrefix(path, "cluster-admin"):
return "cluster_admin"
case strings.HasPrefix(path, "organizations/"):
return "organization_portal"
case strings.HasPrefix(path, "users/"):
return "user_portal"
case strings.HasPrefix(path, "admin/"), strings.HasPrefix(path, "platform/"), strings.HasPrefix(path, "clusters/"):
return "admin-ingress"
case strings.HasPrefix(path, "public/"), strings.HasPrefix(path, "organizations/"), strings.HasPrefix(path, "users/"):
return "public-ingress"
default:
return ""
}
@@ -10,31 +10,16 @@ import (
"time"
)
// TestHTTPHandlerRedirectsToHTTPS checks that a plain-HTTP request is answered
// with a 308 whose Location keeps the host, path and query under https.
func TestHTTPHandlerRedirectsToHTTPS(t *testing.T) {
	handler := Runtime{Config: RuntimeConfig{ServiceType: "admin-ingress", Scope: "platform"}}.HTTPHandler()

	recorder := httptest.NewRecorder()
	request := httptest.NewRequest(http.MethodGet, "http://admin.example.test/cluster-admin/dashboard?x=1", nil)
	handler.ServeHTTP(recorder, request)

	if recorder.Code != http.StatusPermanentRedirect {
		t.Fatalf("status = %d", recorder.Code)
	}
	if location := recorder.Header().Get("Location"); location != "https://admin.example.test/cluster-admin/dashboard?x=1" {
		t.Fatalf("Location = %q", location)
	}
}
func TestHTTPSHandlerBlocksUnknownServiceClass(t *testing.T) {
runtime := Runtime{
Config: RuntimeConfig{
ServiceType: "public-ingress",
Scope: "organization",
ServiceClasses: []string{"organization_portal", "user_portal"},
ServiceClasses: []string{"public-ingress", "public-ingress"},
},
Now: fixedNow,
}
req := httptest.NewRequest(http.MethodGet, "https://org.example.test/platform-admin/root", nil)
req := httptest.NewRequest(http.MethodGet, "https://org.example.test/admin/root", nil)
rec := httptest.NewRecorder()
runtime.HTTPSHandler().ServeHTTP(rec, req)
@@ -46,7 +31,7 @@ func TestHTTPSHandlerBlocksUnknownServiceClass(t *testing.T) {
if err := json.Unmarshal(rec.Body.Bytes(), &payload); err != nil {
t.Fatalf("decode response: %v", err)
}
if payload.Reason != "service_class_not_allowed" || payload.ServiceClass != "platform_admin" || payload.Scope != "organization" {
if payload.Reason != "service_class_not_allowed" || payload.ServiceClass != "admin-ingress" || payload.Scope != "organization" {
t.Fatalf("payload = %+v", payload)
}
}
@@ -56,11 +41,11 @@ func TestHTTPSHandlerRequiresFabricServiceChannelBinding(t *testing.T) {
Config: RuntimeConfig{
ServiceType: "admin-ingress",
Scope: "platform",
ServiceClasses: []string{"platform_admin", "cluster_admin"},
ServiceClasses: []string{"admin-ingress", "admin-ingress"},
},
Now: fixedNow,
}
req := httptest.NewRequest(http.MethodPost, "https://admin.example.test/platform-admin/root", nil)
req := httptest.NewRequest(http.MethodPost, "https://admin.example.test/admin/root", nil)
rec := httptest.NewRecorder()
runtime.HTTPSHandler().ServeHTTP(rec, req)
@@ -73,7 +58,7 @@ func TestHTTPSHandlerRequiresFabricServiceChannelBinding(t *testing.T) {
t.Fatalf("decode response: %v", err)
}
if payload.Reason != "fabric_service_channel_binding_not_implemented" ||
payload.ServiceClass != "platform_admin" ||
payload.ServiceClass != "admin-ingress" ||
payload.ObservedAt != "2026-05-17T00:00:00Z" {
t.Fatalf("payload = %+v", payload)
}
@@ -91,13 +76,13 @@ func TestHTTPSHandlerForwardsAllowedRequestToBinder(t *testing.T) {
Config: RuntimeConfig{
ServiceType: "admin-ingress",
Scope: "platform",
ServiceClasses: []string{"platform_admin", "cluster_admin"},
ServiceClasses: []string{"admin-ingress", "admin-ingress"},
},
Binder: binder,
Now: fixedNow,
}
req := httptest.NewRequest(http.MethodPost, "https://admin.example.test/platform-admin/root?tab=nodes", strings.NewReader(`{"hello":"world"}`))
req.Header.Set("X-RAP-Service-Class", "platform_admin")
req := httptest.NewRequest(http.MethodPost, "https://admin.example.test/admin/root?tab=nodes", strings.NewReader(`{"hello":"world"}`))
req.Header.Set("X-RAP-Service-Class", "admin-ingress")
req.Header.Set("Authorization", "Bearer secret")
req.Header.Set("X-Trace-ID", "trace-1")
rec := httptest.NewRecorder()
@@ -110,9 +95,9 @@ func TestHTTPSHandlerForwardsAllowedRequestToBinder(t *testing.T) {
if rec.Header().Get("X-RAP-Result") != "accepted" || rec.Body.String() != `{"ok":true}` {
t.Fatalf("unexpected response headers=%v body=%s", rec.Header(), rec.Body.String())
}
if binder.request.ServiceClass != "platform_admin" ||
if binder.request.ServiceClass != "admin-ingress" ||
binder.request.Scope != "platform" ||
binder.request.Path != "/platform-admin/root" ||
binder.request.Path != "/admin/root" ||
binder.request.Query != "tab=nodes" ||
string(binder.request.Body) != `{"hello":"world"}` {
t.Fatalf("request = %+v", binder.request)
@@ -128,12 +113,12 @@ func TestHTTPSHandlerDerivesFabricScopeFromServiceClass(t *testing.T) {
Config: RuntimeConfig{
ServiceType: "admin-ingress",
Scope: "platform",
ServiceClasses: []string{"platform_admin", "cluster_admin"},
ServiceClasses: []string{"admin-ingress", "admin-ingress"},
},
Binder: binder,
Now: fixedNow,
}
req := httptest.NewRequest(http.MethodGet, "https://admin.example.test/cluster-admin/ui-manifest", nil)
req := httptest.NewRequest(http.MethodGet, "https://admin.example.test/clusters/ui-manifest", nil)
rec := httptest.NewRecorder()
runtime.HTTPSHandler().ServeHTTP(rec, req)
@@ -141,18 +126,18 @@ func TestHTTPSHandlerDerivesFabricScopeFromServiceClass(t *testing.T) {
if rec.Code != http.StatusOK {
t.Fatalf("status = %d body=%s", rec.Code, rec.Body.String())
}
if binder.request.ServiceClass != "cluster_admin" || binder.request.Scope != "cluster" {
if binder.request.ServiceClass != "admin-ingress" || binder.request.Scope != "cluster" {
t.Fatalf("request = %+v", binder.request)
}
}
func TestHTTPSHandlerReportsBinderFailure(t *testing.T) {
runtime := Runtime{
Config: RuntimeConfig{ServiceType: "admin-ingress", Scope: "platform", ServiceClasses: []string{"platform_admin"}},
Config: RuntimeConfig{ServiceType: "admin-ingress", Scope: "platform", ServiceClasses: []string{"admin-ingress"}},
Binder: failingBinder{},
Now: fixedNow,
}
req := httptest.NewRequest(http.MethodPost, "https://admin.example.test/platform-admin/root", nil)
req := httptest.NewRequest(http.MethodPost, "https://admin.example.test/admin/root", nil)
rec := httptest.NewRecorder()
runtime.HTTPSHandler().ServeHTTP(rec, req)
@@ -3,6 +3,7 @@ package fabricvpn
import (
"context"
"crypto/tls"
"encoding/binary"
"encoding/json"
"fmt"
"net"
@@ -18,6 +19,13 @@ import (
"github.com/quic-go/quic-go"
)
// Tunables for the fabric VPN runtime.
const (
// Shard count Manager.Start applies when the configured StreamShards is zero or negative.
defaultRuntimeStreamShards = 8
// Upper bound Manager.Start clamps the configured StreamShards to.
maxRuntimeStreamShards = 128
// NOTE(review): use sites are not visible in this chunk; presumably the
// lower and upper bounds for a packet batch send timeout — confirm.
minPacketBatchSendTimeout = 5 * time.Second
maxPacketBatchSendTimeout = 30 * time.Second
)
type endpointConfig struct {
EndpointID string `json:"endpoint_id"`
NodeID string `json:"node_id"`
@@ -31,8 +39,15 @@ type endpointConfig struct {
type runtimeConfig struct {
ClusterID string `json:"cluster_id"`
LocalNodeID string `json:"local_node_id"`
ExitNodeID string `json:"exit_node_id"`
VPNConnectionID string `json:"vpn_connection_id"`
TunnelID string `json:"tunnel_id"`
PoolID string `json:"pool_id"`
ServiceID string `json:"service_id"`
LocalServiceID string `json:"local_service_id"`
RemoteServiceID string `json:"remote_service_id"`
ServiceKind string `json:"service_kind"`
ServiceClass string `json:"service_class"`
RouteLeaseID string `json:"route_lease_id"`
RouteGeneration string `json:"route_generation"`
Endpoints []endpointConfig `json:"endpoints"`
RouteBundle routeBundleConfig `json:"route_bundle"`
ServiceChannelRequest serviceChannelRequest `json:"service_channel_request"`
@@ -56,6 +71,7 @@ type routeBundleConfig struct {
type routeLeaseConfig struct {
SchemaVersion string `json:"schema_version"`
LeaseID string `json:"lease_id"`
Generation string `json:"generation"`
SelectedTargetNode string `json:"selected_target_node"`
PrimaryPath routeLeasePath `json:"primary_path"`
WarmStandbyPaths []routeLeasePath `json:"warm_standby_paths"`
@@ -82,17 +98,19 @@ type SocketProtector interface {
}
type Manager struct {
opMu sync.Mutex
mu sync.Mutex
cancel context.CancelFunc
transport *mesh.QUICFabricTransport
session mesh.FabricTransportSession
packet *vpnruntime.FabricSessionPacketTransport
inbox *vpnruntime.FabricPacketInbox
cfg runtimeConfig
lastErr string
endpoint string
protector SocketProtector
opMu sync.Mutex
mu sync.Mutex
cancel context.CancelFunc
heartbeatCancel context.CancelFunc
transport *mesh.QUICFabricTransport
session mesh.FabricTransportSession
packet *vpnruntime.FabricSessionPacketTransport
inbox *vpnruntime.FabricPacketInbox
serviceStreams *vpnruntime.FabricServiceStreamRegistry
cfg runtimeConfig
lastErr string
endpoint string
protector SocketProtector
uplinkPackets atomic.Uint64
uplinkBytes atomic.Uint64
@@ -100,6 +118,14 @@ type Manager struct {
downlinkBytes atomic.Uint64
}
// fabricEndpointConnectResult is the outcome of one connection attempt made by
// connectFastestEndpoint. On success session, streamIDs and streamID are set
// and err is nil; on failure only endpoint and err are set.
type fabricEndpointConnectResult struct {
// endpoint is the candidate this result refers to.
endpoint endpointConfig
// session is the connected transport session; nil when the attempt failed.
session mesh.FabricTransportSession
// streamIDs and streamID are the values returned by openStreams for the
// session; streamIDs is later passed on as StreamIDsByTrafficClass.
streamIDs map[string][]uint64
streamID uint64
// err is the last error seen for this attempt, nil on success.
err error
}
// NewManager returns a zero-valued Manager ready for Start.
func NewManager() *Manager {
	return new(Manager)
}
@@ -117,12 +143,28 @@ func (m *Manager) Start(configJSON string) error {
}
cfg.ClusterID = strings.TrimSpace(cfg.ClusterID)
cfg.LocalNodeID = strings.TrimSpace(cfg.LocalNodeID)
cfg.ExitNodeID = strings.TrimSpace(cfg.ExitNodeID)
cfg.VPNConnectionID = strings.TrimSpace(cfg.VPNConnectionID)
cfg.TunnelID = strings.TrimSpace(cfg.TunnelID)
cfg.PoolID = strings.TrimSpace(cfg.PoolID)
cfg.ServiceID = strings.TrimSpace(cfg.ServiceID)
cfg.LocalServiceID = strings.TrimSpace(cfg.LocalServiceID)
cfg.RemoteServiceID = strings.TrimSpace(cfg.RemoteServiceID)
cfg.ServiceKind = strings.TrimSpace(cfg.ServiceKind)
cfg.ServiceClass = strings.TrimSpace(cfg.ServiceClass)
cfg.RouteLeaseID = strings.TrimSpace(firstNonEmpty(cfg.RouteLeaseID, cfg.RouteBundle.RouteLease.LeaseID))
cfg.RouteGeneration = strings.TrimSpace(firstNonEmpty(cfg.RouteGeneration, cfg.RouteBundle.RouteLease.Generation, cfg.RouteBundle.RouteLease.LeaseID))
cfg.TunnelID = firstNonEmpty(cfg.TunnelID)
if cfg.PoolID == "" {
cfg.PoolID = vpnruntime.DefaultFabricTunnelPoolID
}
if cfg.ServiceClass == "" {
cfg.ServiceClass = vpnruntime.DefaultFabricTunnelClass
}
if cfg.ServiceKind == "" {
cfg.ServiceKind = vpnruntime.DefaultFabricTunnelServiceKind
}
cfg.Endpoints = fabricRuntimeEndpoints(cfg)
cfg.ExitNodeID = firstNonEmpty(cfg.ExitNodeID, fabricRuntimeTargetNodeID(cfg))
if cfg.ClusterID == "" || cfg.LocalNodeID == "" || cfg.VPNConnectionID == "" {
return fmt.Errorf("cluster, local node and vpn connection id are required")
if cfg.ClusterID == "" || cfg.LocalNodeID == "" || cfg.TunnelID == "" {
return fmt.Errorf("cluster, local node and fabric tunnel id are required")
}
if strings.TrimSpace(cfg.ServiceChannelRequest.SchemaVersion) == "" {
return fmt.Errorf("fabric service channel request is required")
@@ -131,10 +173,10 @@ func (m *Manager) Start(configJSON string) error {
return fmt.Errorf("fabric route lease has no QUIC candidates")
}
if cfg.StreamShards <= 0 {
cfg.StreamShards = 4
cfg.StreamShards = defaultRuntimeStreamShards
}
if cfg.StreamShards > 32 {
cfg.StreamShards = 32
if cfg.StreamShards > maxRuntimeStreamShards {
cfg.StreamShards = maxRuntimeStreamShards
}
m.Stop()
@@ -187,43 +229,22 @@ func (m *Manager) connect(ctx context.Context, cfg runtimeConfig, cancel context
return mesh.ProductionForwardResult{Delivered: true, MessageID: envelope.MessageID}, nil
}, nil, nil)
var lastErr error
for _, endpoint := range cfg.Endpoints {
target := mesh.FabricTransportTarget{
EndpointID: firstNonEmpty(endpoint.EndpointID, endpoint.Address),
PeerID: firstNonEmpty(endpoint.NodeID, cfg.ExitNodeID),
Endpoint: endpoint.Address,
Transport: firstNonEmpty(endpoint.Transport, "direct_quic"),
PeerCertSHA256: firstNonEmpty(endpoint.PeerCertSHA256, endpoint.TLSCertSHA256),
Timeout: 5 * time.Second,
OutboundBuffer: 512,
InboundBuffer: 512,
ErrorBuffer: 32,
}
carrier, selected, err := mesh.FabricTransportForTarget(target, quicTransport)
if err != nil {
lastErr = err
continue
}
dialCtx, dialCancel := context.WithTimeout(ctx, 5*time.Second)
session, err := carrier.Connect(dialCtx, selected)
if err != nil {
dialCancel()
lastErr = err
continue
}
streamIDs, streamID, err := openStreams(dialCtx, session, cfg.StreamShards)
dialCancel()
if err != nil {
_ = session.Close()
lastErr = err
continue
}
result, err := m.connectFastestEndpoint(ctx, cfg, quicTransport)
if err != nil {
return err
}
endpoint := result.endpoint
session := result.session
streamIDs := result.streamIDs
streamID := result.streamID
heartbeatCtx, heartbeatCancel := context.WithCancel(context.Background())
m.mu.Lock()
m.cancel = cancel
m.heartbeatCancel = heartbeatCancel
m.transport = quicTransport
m.session = session
m.inbox = inbox
m.serviceStreams = vpnruntime.NewFabricServiceStreamRegistry()
m.cfg = cfg
m.endpoint = endpoint.Address
m.lastErr = ""
@@ -232,18 +253,219 @@ func (m *Manager) connect(ctx context.Context, cfg runtimeConfig, cancel context
Receiver: session,
Inbox: inbox,
StreamID: streamID,
ServiceStreams: m.serviceStreams,
ServiceTunnel: serviceTunnelFromRuntimeConfig(cfg),
StreamIDsByTrafficClass: streamIDs,
VPNConnectionID: cfg.VPNConnectionID,
TunnelID: cfg.TunnelID,
PoolID: cfg.PoolID,
ServiceID: cfg.ServiceID,
VPNConnectionID: cfg.TunnelID,
SendDirection: vpnruntime.FabricDirectionClientToGateway,
ReceiveDirection: vpnruntime.FabricDirectionGatewayToClient,
}
m.mu.Unlock()
announceCtx, announceCancel := context.WithTimeout(context.Background(), 2*time.Second)
announceErr := announceVPNSessionStreams(announceCtx, session, serviceTunnelFromRuntimeConfig(cfg), streamIDs, streamID)
announceCancel()
if announceErr != nil {
m.setErr(announceErr)
}
go m.runVPNSessionHeartbeat(heartbeatCtx, session, streamIDs, streamID)
return nil
}
// connectFastestEndpoint races one connection attempt per peer and returns the
// first attempt that both connects and opens its streams.
//
// Endpoints are bucketed with groupEndpointsByPeer. Each bucket runs in its own
// goroutine and tries its endpoints one after another; the buckets race each
// other. Every goroutine sends exactly one result, and the results channel is
// buffered to the number of buckets, so a sender never blocks.
//
// Returns an error when cfg has no endpoints, when ctx is done before a winner
// arrives (the last attempt error if one was seen, otherwise ctx.Err()), or
// when every bucket failed (the last reported error, wrapped).
func (m *Manager) connectFastestEndpoint(ctx context.Context, cfg runtimeConfig, quicTransport *mesh.QUICFabricTransport) (fabricEndpointConnectResult, error) {
	if len(cfg.Endpoints) == 0 {
		return fabricEndpointConnectResult{}, fmt.Errorf("no QUIC exit endpoints available")
	}
	connectCtx, connectCancel := context.WithCancel(ctx)
	defer connectCancel()
	endpointGroups := groupEndpointsByPeer(cfg)
	results := make(chan fabricEndpointConnectResult, len(endpointGroups))
	attempts := 0
	for _, group := range endpointGroups {
		attempts++
		go func(group []endpointConfig) {
			var last fabricEndpointConnectResult
			for _, endpoint := range group {
				target := fabricRuntimePacketTarget(cfg, endpoint)
				carrier, selected, err := mesh.FabricTransportForTarget(target, quicTransport)
				if err != nil {
					last = fabricEndpointConnectResult{endpoint: endpoint, err: err}
					continue
				}
				// One 5s budget covers both the dial and the stream opening.
				dialCtx, dialCancel := context.WithTimeout(connectCtx, 5*time.Second)
				session, err := carrier.Connect(dialCtx, selected)
				if err != nil {
					dialCancel()
					last = fabricEndpointConnectResult{endpoint: endpoint, err: err}
					continue
				}
				streamIDs, streamID, err := openStreams(dialCtx, session, cfg.StreamShards)
				dialCancel()
				if err != nil {
					_ = session.Close()
					last = fabricEndpointConnectResult{endpoint: endpoint, err: err}
					continue
				}
				results <- fabricEndpointConnectResult{
					endpoint:  endpoint,
					session:   session,
					streamIDs: streamIDs,
					streamID:  streamID,
				}
				return
			}
			if last.err == nil {
				last.err = fmt.Errorf("no endpoint attempt completed for peer")
			}
			results <- last
		}(group)
	}
	var lastErr error
	for index := 0; index < attempts; index++ {
		select {
		case <-ctx.Done():
			// index results were consumed so far; the other attempts may still
			// deliver an open session, so reap and close them instead of
			// abandoning them in the channel buffer.
			go closeLateFabricSessions(results, attempts-index)
			if lastErr != nil {
				return fabricEndpointConnectResult{}, lastErr
			}
			return fabricEndpointConnectResult{}, ctx.Err()
		case result := <-results:
			if result.err != nil {
				lastErr = result.err
				continue
			}
			// Stop the losing attempts and close any session they still produce.
			connectCancel()
			go closeLateFabricSessions(results, attempts-index-1)
			return result, nil
		}
	}
	if lastErr == nil {
		lastErr = fmt.Errorf("no endpoint attempt completed")
	}
	return fabricEndpointConnectResult{}, fmt.Errorf("fabric bootstrap failed after %d endpoint candidates: %w", len(cfg.Endpoints), lastErr)
}
// groupEndpointsByPeer buckets cfg.Endpoints by endpointPeerKey. Buckets are
// ordered by the first appearance of their key, and endpoints keep their
// configuration order inside each bucket.
func groupEndpointsByPeer(cfg runtimeConfig) [][]endpointConfig {
	slotByPeer := make(map[string]int)
	grouped := make([][]endpointConfig, 0, len(cfg.Endpoints))
	for _, candidate := range cfg.Endpoints {
		key := endpointPeerKey(cfg, candidate)
		slot, known := slotByPeer[key]
		if !known {
			// First endpoint for this peer: reserve a new bucket.
			slot = len(grouped)
			slotByPeer[key] = slot
			grouped = append(grouped, nil)
		}
		grouped[slot] = append(grouped[slot], candidate)
	}
	return grouped
}
// endpointPeerKey returns the grouping key for an endpoint: its trimmed NodeID
// when set, otherwise the trimmed target node ID of the runtime config,
// otherwise the first non-empty of the endpoint ID and address.
func endpointPeerKey(cfg runtimeConfig, endpoint endpointConfig) string {
	nodeID := strings.TrimSpace(endpoint.NodeID)
	if nodeID != "" {
		return nodeID
	}
	targetID := strings.TrimSpace(fabricRuntimeTargetNodeID(cfg))
	if targetID != "" {
		return targetID
	}
	return firstNonEmpty(endpoint.EndpointID, endpoint.Address)
}
// closeLateFabricSessions receives exactly `remaining` results from the channel
// and closes every session found among them. It blocks until that many results
// have arrived.
func closeLateFabricSessions(results <-chan fabricEndpointConnectResult, remaining int) {
	for pending := remaining; pending > 0; pending-- {
		late := <-results
		if late.session == nil {
			continue
		}
		_ = late.session.Close()
	}
}
// fabricRuntimePacketTarget builds the transport target used for the packet
// session to the given endpoint. Missing endpoint fields fall back to the
// address (EndpointID), the config's target node (PeerID), "direct_quic"
// (Transport) and the TLS certificate hash (PeerCertSHA256). No Timeout is set.
func fabricRuntimePacketTarget(cfg runtimeConfig, endpoint endpointConfig) mesh.FabricTransportTarget {
	endpointID := firstNonEmpty(endpoint.EndpointID, endpoint.Address)
	peerID := firstNonEmpty(endpoint.NodeID, fabricRuntimeTargetNodeID(cfg))
	transportKind := firstNonEmpty(endpoint.Transport, "direct_quic")
	certHash := firstNonEmpty(endpoint.PeerCertSHA256, endpoint.TLSCertSHA256)

	target := mesh.FabricTransportTarget{
		Endpoint:       endpoint.Address,
		OutboundBuffer: 4096,
		InboundBuffer:  4096,
		ErrorBuffer:    128,
	}
	target.EndpointID = endpointID
	target.PeerID = peerID
	target.Transport = transportKind
	target.PeerCertSHA256 = certHash
	return target
}
// announceVPNSessionStreams sends one session-hello frame per distinct non-zero
// stream ID in streamIDsByClass, tagged with the traffic class the ID was found
// under. If nothing was announced and fallbackStreamID is non-zero, a single
// hello is sent on the fallback stream with the bulk traffic class.
//
// Sequence numbers start from the current Unix time in nanoseconds and are
// incremented for each frame. The first frame-construction or send error
// aborts the announcement and is returned.
func announceVPNSessionStreams(ctx context.Context, session mesh.FabricTransportSession, serviceTunnel vpnruntime.FabricServiceTunnel, streamIDsByClass map[string][]uint64, fallbackStreamID uint64) error {
	serviceTunnel = vpnruntime.NormalizeServiceTunnel(serviceTunnel, serviceTunnel.TunnelID)
	if session == nil || strings.TrimSpace(serviceTunnel.TunnelID) == "" {
		return fmt.Errorf("fabric vpn session announce requires an active session")
	}

	// sendHello builds and sends a single hello frame for one stream.
	sendHello := func(streamID uint64, sequence uint64, trafficClass string) error {
		frame, err := vpnruntime.NewFabricVPNSessionHelloFrame(vpnruntime.FabricVPNPacketFrameInput{
			StreamID:        streamID,
			Sequence:        sequence,
			VPNConnectionID: serviceTunnel.TunnelID,
			Direction:       vpnruntime.FabricDirectionClientToGateway,
			TrafficClass:    trafficClass,
			ServiceTunnel:   serviceTunnel,
		})
		if err != nil {
			return err
		}
		return session.Send(ctx, frame)
	}

	seen := make(map[uint64]bool)
	seq := uint64(time.Now().UnixNano())
	for class, ids := range streamIDsByClass {
		for _, id := range ids {
			if id == 0 || seen[id] {
				continue
			}
			seq++
			if err := sendHello(id, seq, class); err != nil {
				return err
			}
			// Marked only after a successful send.
			seen[id] = true
		}
	}
	if len(seen) > 0 || fallbackStreamID == 0 {
		return nil
	}
	return sendHello(fallbackStreamID, seq+1, vpnruntime.FabricTrafficClassBulk)
}
// runVPNSessionHeartbeat re-announces the session streams every 10 seconds
// until ctx is done. Each announcement gets a 2 second budget and uses the
// service tunnel derived from the manager's current config. The first failed
// announcement is recorded with setErr and ends the loop.
func (m *Manager) runVPNSessionHeartbeat(ctx context.Context, session mesh.FabricTransportSession, streamIDsByClass map[string][]uint64, fallbackStreamID uint64) {
	const (
		beatInterval    = 10 * time.Second
		announceTimeout = 2 * time.Second
	)
	beat := time.NewTicker(beatInterval)
	defer beat.Stop()

	announce := func() error {
		announceCtx, release := context.WithTimeout(ctx, announceTimeout)
		defer release()
		return announceVPNSessionStreams(announceCtx, session, m.currentServiceTunnel(), streamIDsByClass, fallbackStreamID)
	}

	for {
		select {
		case <-ctx.Done():
			return
		case <-beat.C:
			if err := announce(); err != nil {
				m.setErr(err)
				return
			}
		}
	}
}
// currentServiceTunnel copies the manager's config under m.mu and converts the
// copy into a service tunnel description after the lock is released.
func (m *Manager) currentServiceTunnel() vpnruntime.FabricServiceTunnel {
	snapshot := func() runtimeConfig {
		m.mu.Lock()
		defer m.mu.Unlock()
		return m.cfg
	}()
	return serviceTunnelFromRuntimeConfig(snapshot)
}
func (m *Manager) protectedQUICDialer() func(context.Context, string, *tls.Config, *quic.Config) (*quic.Conn, error) {
@@ -300,16 +522,22 @@ func (m *Manager) Stop() {
func (m *Manager) stopLocked() {
m.mu.Lock()
cancel := m.cancel
heartbeatCancel := m.heartbeatCancel
session := m.session
transport := m.transport
m.cancel = nil
m.heartbeatCancel = nil
m.session = nil
m.transport = nil
m.packet = nil
m.serviceStreams = nil
m.mu.Unlock()
if cancel != nil {
cancel()
}
if heartbeatCancel != nil {
heartbeatCancel()
}
if session != nil {
_ = session.Close()
}
@@ -322,33 +550,30 @@ func (m *Manager) SendPacket(packet []byte) error {
if len(packet) == 0 {
return nil
}
m.opMu.Lock()
defer m.opMu.Unlock()
if err := m.ensureConnectedLocked(); err != nil {
return err
}
m.mu.Lock()
transport := m.packet
m.mu.Unlock()
transport := m.packetTransport()
if transport == nil {
return fmt.Errorf("fabric vpn runtime is not connected")
var err error
transport, err = m.reconnectPacketTransport()
if err != nil || transport == nil {
return fmt.Errorf("fabric vpn runtime is not connected")
}
}
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
packetBatch := [][]byte{append([]byte(nil), packet...)}
ctx, cancel := context.WithTimeout(context.Background(), packetBatchSendTimeout(packetBatch))
defer cancel()
if err := transport.SendGatewayPacketBatch(ctx, [][]byte{append([]byte(nil), packet...)}); err != nil {
if err := transport.SendGatewayPacketBatch(ctx, packetBatch); err != nil {
m.setErr(err)
if reconnectErr := m.reconnectLocked(); reconnectErr != nil {
transport, reconnectErr := m.reconnectPacketTransport()
if reconnectErr != nil {
return err
}
m.mu.Lock()
transport = m.packet
m.mu.Unlock()
if transport == nil {
return err
}
retryCtx, retryCancel := context.WithTimeout(context.Background(), 5*time.Second)
retryPacketBatch := [][]byte{append([]byte(nil), packet...)}
retryCtx, retryCancel := context.WithTimeout(context.Background(), packetBatchSendTimeout(retryPacketBatch))
defer retryCancel()
if retryErr := transport.SendGatewayPacketBatch(retryCtx, [][]byte{append([]byte(nil), packet...)}); retryErr != nil {
if retryErr := transport.SendGatewayPacketBatch(retryCtx, retryPacketBatch); retryErr != nil {
m.setErr(retryErr)
return retryErr
}
@@ -358,17 +583,94 @@ func (m *Manager) SendPacket(packet []byte) error {
return nil
}
// SendPacketBatchPayload decodes a length-prefixed packet batch and sends it to
// the gateway over the current packet transport.
//
// If no transport is available, one reconnect is attempted first. If the send
// fails, the error is recorded, a reconnect is attempted, and the batch is sent
// once more over the new transport. When that reconnect fails, the original
// send error is returned. Uplink counters are updated only after a successful
// send. An empty payload is a no-op.
func (m *Manager) SendPacketBatchPayload(payload []byte) error {
	batch, decodeErr := decodePacketBatchPayload(payload)
	if decodeErr != nil {
		return decodeErr
	}
	if len(batch) == 0 {
		return nil
	}

	link := m.packetTransport()
	if link == nil {
		fresh, reconnectErr := m.reconnectPacketTransport()
		if reconnectErr != nil || fresh == nil {
			return fmt.Errorf("fabric vpn runtime is not connected")
		}
		link = fresh
	}

	budget := packetBatchSendTimeout(batch)
	firstCtx, firstCancel := context.WithTimeout(context.Background(), budget)
	defer firstCancel()
	sendErr := link.SendGatewayPacketBatch(firstCtx, clonePacketBatch(batch))
	if sendErr != nil {
		m.setErr(sendErr)
		replacement, reconnectErr := m.reconnectPacketTransport()
		if reconnectErr != nil || replacement == nil {
			// Report the send failure, not the reconnect failure.
			return sendErr
		}
		secondCtx, secondCancel := context.WithTimeout(context.Background(), budget)
		defer secondCancel()
		if retryErr := replacement.SendGatewayPacketBatch(secondCtx, clonePacketBatch(batch)); retryErr != nil {
			m.setErr(retryErr)
			return retryErr
		}
	}

	var sentBytes uint64
	for i := range batch {
		sentBytes += uint64(len(batch[i]))
	}
	m.uplinkPackets.Add(uint64(len(batch)))
	m.uplinkBytes.Add(sentBytes)
	return nil
}
// packetBatchSendTimeout derives a send deadline from the batch size. It starts
// at minPacketBatchSendTimeout, adds one second per full 512 KiB of payload,
// adds one second per full 512 packets once the batch holds more than 512
// packets, and caps the result at maxPacketBatchSendTimeout. An empty batch
// gets the minimum.
func packetBatchSendTimeout(packets [][]byte) time.Duration {
	count := len(packets)
	if count == 0 {
		return minPacketBatchSendTimeout
	}
	totalBytes := 0
	for i := range packets {
		totalBytes += len(packets[i])
	}
	budget := minPacketBatchSendTimeout + time.Duration(totalBytes/(512*1024))*time.Second
	if count > 512 {
		budget += time.Duration(count/512) * time.Second
	}
	if budget > maxPacketBatchSendTimeout {
		return maxPacketBatchSendTimeout
	}
	return budget
}
func (m *Manager) ReceivePacket(timeoutMillis int) ([]byte, error) {
m.opMu.Lock()
defer m.opMu.Unlock()
if err := m.ensureConnectedLocked(); err != nil {
payload, err := m.ReceivePacketBatchPayload(timeoutMillis)
if err != nil {
return nil, err
}
m.mu.Lock()
transport := m.packet
m.mu.Unlock()
packets, err := decodePacketBatchPayload(payload)
if err != nil {
return nil, err
}
if len(packets) == 0 {
return nil, nil
}
return append([]byte(nil), packets[0]...), nil
}
func (m *Manager) ReceivePacketBatchPayload(timeoutMillis int) ([]byte, error) {
transport := m.packetTransport()
if transport == nil {
return nil, fmt.Errorf("fabric vpn runtime is not connected")
var err error
transport, err = m.reconnectPacketTransport()
if err != nil || transport == nil {
return nil, fmt.Errorf("fabric vpn runtime is not connected")
}
}
timeout := time.Duration(timeoutMillis) * time.Millisecond
if timeout <= 0 {
@@ -379,16 +681,19 @@ func (m *Manager) ReceivePacket(timeoutMillis int) ([]byte, error) {
packets, err := transport.ReceiveGatewayPacketBatch(ctx, timeout)
if err != nil {
m.setErr(err)
_ = m.reconnectLocked()
_, _ = m.reconnectPacketTransport()
return nil, err
}
if len(packets) == 0 {
return nil, nil
}
packet := append([]byte(nil), packets[0]...)
m.downlinkPackets.Add(1)
m.downlinkBytes.Add(uint64(len(packet)))
return packet, nil
var bytes uint64
for _, packet := range packets {
bytes += uint64(len(packet))
}
m.downlinkPackets.Add(uint64(len(packets)))
m.downlinkBytes.Add(bytes)
return encodePacketBatchPayload(packets), nil
}
func (m *Manager) ControlRequest(payloadJSON string) (string, error) {
@@ -402,19 +707,63 @@ func (m *Manager) ControlRequest(payloadJSON string) (string, error) {
cfg := m.cfg
endpointAddress := m.endpoint
m.mu.Unlock()
if transport == nil || endpointAddress == "" {
if transport == nil {
return "", fmt.Errorf("fabric control runtime is not connected")
}
endpoint := endpointConfig{Address: endpointAddress}
for _, candidate := range cfg.Endpoints {
if strings.TrimSpace(candidate.Address) == endpointAddress {
endpoint = candidate
break
}
candidates := prioritizeControlEndpoints(cfg.Endpoints, endpointAddress)
if len(candidates) == 0 {
return "", fmt.Errorf("fabric control runtime has no bootstrap endpoints")
}
var lastErr error
for _, endpoint := range candidates {
response, err := m.controlRequestToEndpoint(transport, cfg, endpoint, payloadJSON)
if err != nil {
lastErr = err
continue
}
if strings.TrimSpace(endpoint.Address) != "" && strings.TrimSpace(endpoint.Address) != endpointAddress {
m.mu.Lock()
m.endpoint = strings.TrimSpace(endpoint.Address)
m.mu.Unlock()
}
return response, nil
}
if lastErr != nil {
return "", lastErr
}
return "", fmt.Errorf("fabric control route unavailable")
}
// prioritizeControlEndpoints orders control endpoint candidates so that entries
// at the active address come first, followed by the remaining endpoints with
// one entry per distinct address. Endpoints with a blank address are dropped.
// Entries matching the active address are all kept, even when repeated. If
// nothing qualifies and an active address is known, a bare endpoint carrying
// only that address is returned.
func prioritizeControlEndpoints(endpoints []endpointConfig, activeAddress string) []endpointConfig {
	active := strings.TrimSpace(activeAddress)
	ordered := make([]endpointConfig, 0, len(endpoints)+1)
	taken := make(map[string]bool)

	// Pass 1: everything at the active address, in configuration order.
	if active != "" {
		for _, candidate := range endpoints {
			if strings.TrimSpace(candidate.Address) == active {
				ordered = append(ordered, candidate)
				taken[active] = true
			}
		}
	}

	// Pass 2: the first endpoint seen for every other address.
	for _, candidate := range endpoints {
		addr := strings.TrimSpace(candidate.Address)
		if addr == "" || taken[addr] {
			continue
		}
		taken[addr] = true
		ordered = append(ordered, candidate)
	}

	if len(ordered) == 0 && active != "" {
		ordered = append(ordered, endpointConfig{Address: active})
	}
	return ordered
}
func (m *Manager) controlRequestToEndpoint(transport *mesh.QUICFabricTransport, cfg runtimeConfig, endpoint endpointConfig, payloadJSON string) (string, error) {
target := mesh.FabricTransportTarget{
EndpointID: firstNonEmpty(endpoint.EndpointID, endpoint.Address),
PeerID: firstNonEmpty(endpoint.NodeID, cfg.ExitNodeID),
PeerID: firstNonEmpty(endpoint.NodeID, fabricRuntimeTargetNodeID(cfg)),
Endpoint: endpoint.Address,
Transport: firstNonEmpty(endpoint.Transport, "direct_quic"),
PeerCertSHA256: firstNonEmpty(endpoint.PeerCertSHA256, endpoint.TLSCertSHA256),
@@ -479,6 +828,130 @@ func (m *Manager) Reconnect() error {
return m.reconnectLocked()
}
// UpdateRuntimeConfig parses configJSON and installs it as the manager's
// runtime config without changing the tunnel identity.
//
// Identifier fields are trimmed, the route lease ID and generation fall back to
// the values carried in the route bundle, endpoints are recomputed with
// fabricRuntimeEndpoints, and StreamShards is defaulted and capped. Under
// m.opMu, blank cluster, node, tunnel, pool and service fields and an empty
// endpoint list are inherited from the current config. A non-empty tunnel ID
// that differs from the current one is rejected.
//
// If a packet transport exists, its service tunnel is updated before the new
// config is stored; a failure there leaves the stored config untouched. Storing
// the config clears lastErr. A reconnect follows when
// shouldReconnectForRuntimeRoute reports a changed target node or endpoint list.
func (m *Manager) UpdateRuntimeConfig(configJSON string) error {
	var next runtimeConfig
	if err := json.Unmarshal([]byte(configJSON), &next); err != nil {
		return err
	}

	for _, field := range []*string{
		&next.ClusterID,
		&next.LocalNodeID,
		&next.TunnelID,
		&next.PoolID,
		&next.ServiceID,
		&next.LocalServiceID,
		&next.RemoteServiceID,
		&next.ServiceKind,
		&next.ServiceClass,
	} {
		*field = strings.TrimSpace(*field)
	}
	lease := next.RouteBundle.RouteLease
	next.RouteLeaseID = strings.TrimSpace(firstNonEmpty(next.RouteLeaseID, lease.LeaseID))
	next.RouteGeneration = strings.TrimSpace(firstNonEmpty(next.RouteGeneration, lease.Generation, lease.LeaseID))
	next.Endpoints = fabricRuntimeEndpoints(next)
	if next.StreamShards <= 0 {
		next.StreamShards = defaultRuntimeStreamShards
	}
	if next.StreamShards > maxRuntimeStreamShards {
		next.StreamShards = maxRuntimeStreamShards
	}

	m.opMu.Lock()
	defer m.opMu.Unlock()

	m.mu.Lock()
	current, activePacket := m.cfg, m.packet
	m.mu.Unlock()

	if current.TunnelID != "" && next.TunnelID != "" && current.TunnelID != next.TunnelID {
		return fmt.Errorf("fabric runtime config tunnel id changed from %q to %q", current.TunnelID, next.TunnelID)
	}

	// inherit keeps the current value for a field the update left blank.
	inherit := func(field *string, fallback string) {
		if *field == "" {
			*field = fallback
		}
	}
	inherit(&next.ClusterID, current.ClusterID)
	inherit(&next.LocalNodeID, current.LocalNodeID)
	inherit(&next.TunnelID, current.TunnelID)
	inherit(&next.PoolID, current.PoolID)
	inherit(&next.ServiceID, current.ServiceID)
	inherit(&next.ServiceKind, current.ServiceKind)
	inherit(&next.ServiceClass, current.ServiceClass)
	if len(next.Endpoints) == 0 {
		next.Endpoints = current.Endpoints
	}

	needsReconnect := shouldReconnectForRuntimeRoute(current, next)
	if activePacket != nil {
		if _, err := activePacket.UpdateServiceTunnel(serviceTunnelFromRuntimeConfig(next)); err != nil {
			return err
		}
	}

	m.mu.Lock()
	m.cfg = next
	m.lastErr = ""
	m.mu.Unlock()

	if !needsReconnect {
		return nil
	}
	return m.reconnectLocked()
}
// shouldReconnectForRuntimeRoute reports whether moving from current to next
// requires a new fabric session. It is true only when both configs name the
// same non-empty tunnel and either the target node ID or the endpoint list
// signature differs.
func shouldReconnectForRuntimeRoute(current runtimeConfig, next runtimeConfig) bool {
	sameTunnel := current.TunnelID != "" && next.TunnelID != "" && current.TunnelID == next.TunnelID
	if !sameTunnel {
		return false
	}
	targetMoved := fabricRuntimeTargetNodeID(current) != fabricRuntimeTargetNodeID(next)
	return targetMoved || endpointListSignature(current.Endpoints) != endpointListSignature(next.Endpoints)
}
// endpointListSignature renders an endpoint list as a comparison string: one
// newline-terminated line per endpoint, holding its endpoint ID, node ID,
// transport, address, both certificate hashes and priority, separated by '|'.
// An empty list yields the empty string. The result depends on list order.
func endpointListSignature(endpoints []endpointConfig) string {
	if len(endpoints) == 0 {
		return ""
	}
	var signature strings.Builder
	for i := range endpoints {
		entry := endpoints[i]
		fmt.Fprintf(&signature, "%s|%s|%s|%s|%s|%s|%d\n",
			entry.EndpointID,
			entry.NodeID,
			entry.Transport,
			entry.Address,
			entry.PeerCertSHA256,
			entry.TLSCertSHA256,
			entry.Priority,
		)
	}
	return signature.String()
}
// packetTransport returns the manager's current packet transport, read under
// m.mu. The result is nil when no transport is set.
func (m *Manager) packetTransport() *vpnruntime.FabricSessionPacketTransport {
	m.mu.Lock()
	current := m.packet
	m.mu.Unlock()
	return current
}
// reconnectPacketTransport runs reconnectLocked while holding m.opMu and
// returns the packet transport that is current afterwards. On a reconnect
// error it returns nil and that error.
func (m *Manager) reconnectPacketTransport() (*vpnruntime.FabricSessionPacketTransport, error) {
	m.opMu.Lock()
	defer m.opMu.Unlock()
	err := m.reconnectLocked()
	if err == nil {
		return m.packetTransport(), nil
	}
	return nil, err
}
func (m *Manager) ensureConnectedLocked() error {
m.mu.Lock()
connected := m.packet != nil
@@ -498,11 +971,16 @@ func (m *Manager) reconnectLocked() error {
cfg := m.cfg
oldSession := m.session
oldTransport := m.transport
oldHeartbeatCancel := m.heartbeatCancel
cancel := m.cancel
m.session = nil
m.transport = nil
m.packet = nil
m.heartbeatCancel = nil
m.mu.Unlock()
if oldHeartbeatCancel != nil {
oldHeartbeatCancel()
}
if oldSession != nil {
_ = oldSession.Close()
}
@@ -521,27 +999,105 @@ func (m *Manager) reconnectLocked() error {
return nil
}
// decodePacketBatchPayload splits a batch payload into its packets. The payload
// is a sequence of records, each a 4-byte big-endian length followed by that
// many packet bytes. Every returned packet is a copy.
//
// An empty payload yields (nil, nil). An error is returned for a truncated
// length prefix, a length outside 1..65535, or a packet shorter than its
// declared length.
func decodePacketBatchPayload(payload []byte) ([][]byte, error) {
	if len(payload) == 0 {
		return nil, nil
	}
	const prefixLen = 4
	decoded := make([][]byte, 0, 16)
	rest := payload
	for len(rest) > 0 {
		if len(rest) < prefixLen {
			return nil, fmt.Errorf("invalid packet batch payload: truncated length")
		}
		n := int(binary.BigEndian.Uint32(rest[:prefixLen]))
		rest = rest[prefixLen:]
		if n <= 0 || n > 65535 {
			return nil, fmt.Errorf("invalid packet batch payload: packet size %d", n)
		}
		if len(rest) < n {
			return nil, fmt.Errorf("invalid packet batch payload: truncated packet")
		}
		decoded = append(decoded, append([]byte(nil), rest[:n]...))
		rest = rest[n:]
	}
	return decoded, nil
}
// encodePacketBatchPayload serializes packets as consecutive records, each a
// 4-byte big-endian length followed by the packet bytes. Empty packets are
// skipped. The result is nil when there is nothing to encode.
func encodePacketBatchPayload(packets [][]byte) []byte {
	framedLen := 0
	for _, p := range packets {
		if n := len(p); n > 0 {
			framedLen += 4 + n
		}
	}
	if framedLen == 0 {
		return nil
	}
	encoded := make([]byte, 0, framedLen)
	for _, p := range packets {
		if len(p) == 0 {
			continue
		}
		encoded = binary.BigEndian.AppendUint32(encoded, uint32(len(p)))
		encoded = append(encoded, p...)
	}
	return encoded
}
// clonePacketBatch returns a deep copy of the batch with empty packets left
// out. The result is always a non-nil slice.
func clonePacketBatch(packets [][]byte) [][]byte {
	copies := make([][]byte, 0, len(packets))
	for i := range packets {
		if src := packets[i]; len(src) > 0 {
			copies = append(copies, append([]byte(nil), src...))
		}
	}
	return copies
}
// SnapshotJSON returns a JSON document describing the manager's current state:
// connection status, active endpoint, last error, the identifying fields of the
// runtime config, the traffic counters and, when present, the service stream
// snapshot.
//
// Config-derived fields and the service stream snapshot are read under m.mu;
// the counters are loaded after the lock is released. A marshalling error is
// ignored, in which case the returned string is empty.
func (m *Manager) SnapshotJSON() string {
	m.mu.Lock()
	connected := m.packet != nil
	endpoint := m.endpoint
	lastErr := m.lastErr
	vpnConnectionID := m.cfg.VPNConnectionID
	tunnelID := m.cfg.TunnelID
	poolID := m.cfg.PoolID
	serviceID := m.cfg.ServiceID
	localNodeID := m.cfg.LocalNodeID
	exitNodeID := m.cfg.ExitNodeID
	serviceKind := m.cfg.ServiceKind
	serviceClass := m.cfg.ServiceClass
	routeLeaseID := m.cfg.RouteLeaseID
	routeGeneration := m.cfg.RouteGeneration
	var serviceStreamSnapshot map[string]any
	if m.serviceStreams != nil {
		serviceStreamSnapshot = m.serviceStreams.Snapshot()
	}
	m.mu.Unlock()
	payload, _ := json.Marshal(map[string]any{
		// A map literal may list each constant key only once; the stale
		// "rap.android_fabric_vpn_runtime.v1" entry was removed.
		"schema_version":   "rap.ipv4_tunnel_fabric_runtime.v1",
		"platform_adapter": "android_vpnservice_tun",
		"connected":        connected,
		"endpoint":         endpoint,
		"last_error":       lastErr,
		"vpn_connection":   vpnConnectionID,
		"tunnel_id":        tunnelID,
		"pool_id":          poolID,
		"service_id":       serviceID,
		"service_kind":     serviceKind,
		"service_class":    serviceClass,
		"route_lease_id":   routeLeaseID,
		"route_generation": routeGeneration,
		"local_node_id":    localNodeID,
		"exit_node_id":     exitNodeID,
		"uplink_packets":   m.uplinkPackets.Load(),
		"uplink_bytes":     m.uplinkBytes.Load(),
		"downlink_packets": m.downlinkPackets.Load(),
		"downlink_bytes":   m.downlinkBytes.Load(),
		"service_streams":  serviceStreamSnapshot,
	})
	return string(payload)
}
@@ -560,15 +1116,23 @@ func openStreams(ctx context.Context, session mesh.FabricTransportSession, shard
classes := []struct {
name string
trafficClass fabricproto.TrafficClass
shards int
}{
{name: vpnruntime.FabricTrafficClassInteractive, trafficClass: fabricproto.TrafficClassInteractive},
{name: vpnruntime.FabricTrafficClassBulk, trafficClass: fabricproto.TrafficClassBulk},
{name: vpnruntime.FabricTrafficClassControl, trafficClass: fabricproto.TrafficClassControl, shards: 1},
{name: vpnruntime.FabricTrafficClassDNS, trafficClass: fabricproto.TrafficClassReliable, shards: 1},
{name: vpnruntime.FabricTrafficClassInteractive, trafficClass: fabricproto.TrafficClassInteractive, shards: shards},
{name: vpnruntime.FabricTrafficClassReliable, trafficClass: fabricproto.TrafficClassReliable, shards: maxInt(1, shards/2)},
{name: vpnruntime.FabricTrafficClassBulk, trafficClass: fabricproto.TrafficClassBulk, shards: shards},
{name: vpnruntime.FabricTrafficClassDroppable, trafficClass: fabricproto.TrafficClassDroppable, shards: maxInt(1, shards/2)},
}
out := make(map[string][]uint64, len(classes))
var primary uint64
var ordinal uint64
for classIndex, class := range classes {
for shard := 0; shard < shards; shard++ {
streamID := base + uint64(classIndex*shards+shard)
_ = classIndex
for shard := 0; shard < class.shards; shard++ {
ordinal++
streamID := base + ordinal
if err := session.Send(ctx, fabricproto.Frame{Type: fabricproto.FrameOpenStream, StreamID: streamID, TrafficClass: class.trafficClass}); err != nil {
return nil, 0, err
}
@@ -581,6 +1145,29 @@ func openStreams(ctx context.Context, session mesh.FabricTransportSession, shard
return out, primary, nil
}
// serviceTunnelFromRuntimeConfig copies the tunnel, service and route fields of
// cfg into a service tunnel with the default tunnel role, then normalizes it
// against cfg.TunnelID.
func serviceTunnelFromRuntimeConfig(cfg runtimeConfig) vpnruntime.FabricServiceTunnel {
	tunnel := vpnruntime.FabricServiceTunnel{
		TunnelID:        cfg.TunnelID,
		PoolID:          cfg.PoolID,
		ServiceID:       cfg.ServiceID,
		LocalServiceID:  cfg.LocalServiceID,
		RemoteServiceID: cfg.RemoteServiceID,
		ServiceKind:     cfg.ServiceKind,
		ServiceClass:    cfg.ServiceClass,
		ServiceRole:     vpnruntime.DefaultFabricTunnelRole,
		RouteLeaseID:    cfg.RouteLeaseID,
		RouteGeneration: cfg.RouteGeneration,
		StreamShards:    cfg.StreamShards,
	}
	return vpnruntime.NormalizeServiceTunnel(tunnel, cfg.TunnelID)
}
// maxInt returns the larger of its two arguments.
func maxInt(left, right int) int {
	if right >= left {
		return right
	}
	return left
}
func firstNonEmpty(values ...string) string {
for _, value := range values {
if strings.TrimSpace(value) != "" {
@@ -3,11 +3,14 @@ package fabricvpn
import (
"os"
"testing"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/vpnruntime"
)
func TestFabricRuntimeEndpointsPreferRouteBundle(t *testing.T) {
cfg := runtimeConfig{
Endpoints: []endpointConfig{{EndpointID: "legacy", Address: "quic://legacy.example:19131"}},
Endpoints: []endpointConfig{{EndpointID: "compat", Address: "quic://compat.example:19131"}},
RouteBundle: routeBundleConfig{
EndpointCandidates: []endpointConfig{{EndpointID: "bundle", Address: "quic://bundle.example:19131"}},
},
@@ -20,7 +23,7 @@ func TestFabricRuntimeEndpointsPreferRouteBundle(t *testing.T) {
func TestFabricRuntimeEndpointsPreferRouteLease(t *testing.T) {
cfg := runtimeConfig{
Endpoints: []endpointConfig{{EndpointID: "legacy", Address: "quic://legacy.example:19131"}},
Endpoints: []endpointConfig{{EndpointID: "compat", Address: "quic://compat.example:19131"}},
RouteBundle: routeBundleConfig{
EndpointCandidates: []endpointConfig{{EndpointID: "bundle", Address: "quic://bundle.example:19131"}},
RouteLease: routeLeaseConfig{
@@ -41,13 +44,148 @@ func TestFabricRuntimeEndpointsPreferRouteLease(t *testing.T) {
}
}
func TestFabricRuntimeEndpointsFallbackToLegacyEndpoints(t *testing.T) {
// TestFabricRuntimePacketTargetIsLongLived checks that the packet target built
// for an endpoint carries no Timeout and copies the endpoint's node ID, address
// and peer certificate hash.
func TestFabricRuntimePacketTargetIsLongLived(t *testing.T) {
cfg := runtimeConfig{
// NOTE(review): this Endpoints entry does not appear to influence the
// assertions below; confirm it is intended and not a leftover.
Endpoints: []endpointConfig{{EndpointID: "legacy", Address: "quic://legacy.example:19131"}},
RouteBundle: routeBundleConfig{RouteLease: routeLeaseConfig{
PrimaryPath: routeLeasePath{TargetNodeID: "exit-1"},
}},
}
target := fabricRuntimePacketTarget(cfg, endpointConfig{
EndpointID: "exit-public",
NodeID: "exit-1",
Address: "quic://203.0.113.10:19131",
Transport: "direct_quic",
PeerCertSHA256: "abc123",
})
// A zero Timeout is what this test treats as "long-lived".
if target.Timeout != 0 {
t.Fatalf("packet target timeout = %s, want 0 for long-lived vpn stream", target.Timeout)
}
if target.PeerID != "exit-1" || target.Endpoint != "quic://203.0.113.10:19131" || target.PeerCertSHA256 != "abc123" {
t.Fatalf("unexpected packet target: %+v", target)
}
}
func TestServiceTunnelFromRuntimeConfigCarriesRouteEpoch(t *testing.T) {
cfg := runtimeConfig{
TunnelID: "tunnel-1",
PoolID: "home-ipv4",
ServiceID: "svc-1",
ServiceKind: "ipv4-tunnel",
ServiceClass: "vpn_packets",
RouteLeaseID: "lease-1",
RouteGeneration: "route-gen-1",
StreamShards: 8,
}
tunnel := serviceTunnelFromRuntimeConfig(cfg)
if tunnel.RouteLeaseID != "lease-1" || tunnel.RouteGeneration != "route-gen-1" || tunnel.StreamShards != 8 {
t.Fatalf("service tunnel route epoch = %+v", tunnel)
}
}
// TestManagerUpdateRuntimeConfigKeepsTunnelAndUpdatesRouteEpoch seeds a manager
// with a config and a packet transport for "tunnel-1", applies an update that
// only changes the route lease and generation, and expects the packet transport
// snapshot to show the new route epoch with one recorded transition. It then
// expects an update naming a different tunnel ID to be rejected.
func TestManagerUpdateRuntimeConfigKeepsTunnelAndUpdatesRouteEpoch(t *testing.T) {
manager := NewManager()
// State is injected directly; no fabric session is established.
manager.cfg = runtimeConfig{
ClusterID: "cluster-1",
LocalNodeID: "android-1",
TunnelID: "tunnel-1",
PoolID: "home-ipv4",
ServiceID: "svc-1",
ServiceKind: "ipv4-tunnel",
ServiceClass: "vpn_packets",
RouteLeaseID: "lease-1",
RouteGeneration: "route-gen-1",
StreamShards: 4,
}
manager.packet = &vpnruntime.FabricSessionPacketTransport{
TunnelID: "tunnel-1",
ServiceTunnel: vpnruntime.FabricServiceTunnel{
TunnelID: "tunnel-1",
PoolID: "home-ipv4",
ServiceID: "svc-1",
RouteLeaseID: "lease-1",
RouteGeneration: "route-gen-1",
},
}
// Same tunnel and identity, new lease and generation.
err := manager.UpdateRuntimeConfig(`{
"cluster_id":"cluster-1",
"local_node_id":"android-1",
"tunnel_id":"tunnel-1",
"pool_id":"home-ipv4",
"service_id":"svc-1",
"service_kind":"ipv4-tunnel",
"service_class":"vpn_packets",
"route_lease_id":"lease-2",
"route_generation":"route-gen-2",
"stream_shards":4,
"service_channel_request":{"schema_version":"rap.fabric_service_channel_request.v1"}
}`)
if err != nil {
t.Fatalf("update runtime config: %v", err)
}
snapshot := manager.packet.Snapshot()
if snapshot["route_lease_id"] != "lease-2" || snapshot["route_generation"] != "route-gen-2" || snapshot["route_transition_count"] != uint64(1) {
t.Fatalf("packet route epoch not updated: %+v", snapshot)
}
// A different non-empty tunnel ID must be refused.
if err := manager.UpdateRuntimeConfig(`{"tunnel_id":"other-tunnel"}`); err == nil {
t.Fatal("expected changed tunnel id to be rejected")
}
}
// TestRuntimeRouteReconnectDecisionTracksTargetAndEndpoints checks the
// reconnect decision for three variations of one baseline config: a new route
// epoch only (no reconnect), a new target node (reconnect) and a new endpoint
// list (reconnect).
func TestRuntimeRouteReconnectDecisionTracksTargetAndEndpoints(t *testing.T) {
	current := runtimeConfig{
		TunnelID:  "tunnel-1",
		Endpoints: []endpointConfig{{EndpointID: "exit-a", NodeID: "node-a", Address: "quic://node-a:19131", Transport: "direct_quic"}},
		RouteBundle: routeBundleConfig{RouteLease: routeLeaseConfig{
			PrimaryPath: routeLeasePath{TargetNodeID: "node-a"},
		}},
	}
	cases := []struct {
		mutate        func(cfg runtimeConfig) runtimeConfig
		wantReconnect bool
		failure       string
	}{
		{
			mutate: func(cfg runtimeConfig) runtimeConfig {
				cfg.RouteLeaseID = "lease-2"
				cfg.RouteGeneration = "route-gen-2"
				return cfg
			},
			wantReconnect: false,
			failure:       "same target/endpoints should update route epoch without reconnect",
		},
		{
			mutate: func(cfg runtimeConfig) runtimeConfig {
				cfg.RouteBundle.RouteLease.PrimaryPath.TargetNodeID = "node-b"
				return cfg
			},
			wantReconnect: true,
			failure:       "changed target node should reconnect fabric session",
		},
		{
			mutate: func(cfg runtimeConfig) runtimeConfig {
				cfg.Endpoints = []endpointConfig{{EndpointID: "exit-b", NodeID: "node-b", Address: "quic://node-b:19131", Transport: "direct_quic"}}
				return cfg
			},
			wantReconnect: true,
			failure:       "changed endpoint candidates should reconnect fabric session",
		},
	}
	for _, tc := range cases {
		if shouldReconnectForRuntimeRoute(current, tc.mutate(current)) != tc.wantReconnect {
			t.Fatal(tc.failure)
		}
	}
}
// TestPacketBatchSendTimeoutScalesWithPayload checks that a single small packet
// gets the minimum timeout, that a 4 MiB packet and a 2048-packet batch both
// get more than the minimum, and that a 100 MiB packet is capped at the maximum.
func TestPacketBatchSendTimeoutScalesWithPayload(t *testing.T) {
	single := func(size int) [][]byte { return [][]byte{make([]byte, size)} }

	small := packetBatchSendTimeout(single(1200))
	large := packetBatchSendTimeout(single(4 * 1024 * 1024))
	if small != minPacketBatchSendTimeout {
		t.Fatalf("small timeout = %s, want %s", small, minPacketBatchSendTimeout)
	}
	if large <= small {
		t.Fatalf("large timeout = %s, want greater than %s", large, small)
	}

	many := make([][]byte, 2048)
	for slot := 0; slot < len(many); slot++ {
		many[slot] = make([]byte, 1200)
	}
	if got := packetBatchSendTimeout(many); got <= small {
		t.Fatalf("many-packet timeout = %s, want greater than %s", got, small)
	}

	if got := packetBatchSendTimeout(single(100 * 1024 * 1024)); got != maxPacketBatchSendTimeout {
		t.Fatalf("capped timeout = %s, want %s", got, maxPacketBatchSendTimeout)
	}
}
func TestFabricRuntimeEndpointsFallbackToDisallowedEndpoints(t *testing.T) {
cfg := runtimeConfig{
Endpoints: []endpointConfig{{EndpointID: "compat", Address: "quic://compat.example:19131"}},
}
got := fabricRuntimeEndpoints(cfg)
if len(got) != 1 || got[0].EndpointID != "legacy" {
t.Fatalf("endpoints = %+v, want legacy endpoint fallback", got)
if len(got) != 1 || got[0].EndpointID != "compat" {
t.Fatalf("endpoints = %+v, want compat endpoint fallback", got)
}
}
@@ -76,18 +214,18 @@ func TestLiveFabricVPNRuntimeStartsFromRouteLease(t *testing.T) {
t.Fatalf("receive live dns packet: %v", err)
}
if len(packet) > 0 {
if packet[9] != 17 || packet[12] != 1 || packet[13] != 1 || packet[14] != 1 || packet[15] != 1 {
t.Fatalf("unexpected response packet header: %v", packet[:min(20, len(packet))])
if len(packet) >= 20 && packet[9] == 17 && packet[12] == 1 && packet[13] == 1 && packet[14] == 1 && packet[15] == 1 {
return
}
return
}
}
t.Fatal("timed out waiting for live dns response through fabric vpn")
}
func testDNSIPv4Packet() []byte {
nonce := uint16(time.Now().UnixNano())
dns := []byte{
0x12, 0x34, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00,
byte(nonce >> 8), byte(nonce), 0x01, 0x00, 0x00, 0x01, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x07, 'e', 'x', 'a',
'm', 'p', 'l', 'e', 0x03, 'c', 'o', 'm', 0x00,
0x00, 0x01, 0x00, 0x01,
@@ -102,8 +240,8 @@ func testDNSIPv4Packet() []byte {
packet[9] = 17
copy(packet[12:16], []byte{10, 77, 0, 2})
copy(packet[16:20], []byte{1, 1, 1, 1})
packet[20] = 0xcf
packet[21] = 0x08
packet[20] = byte(0xc0 | ((nonce >> 8) & 0x3f))
packet[21] = byte(nonce)
packet[22] = 0x00
packet[23] = 0x35
packet[24] = byte(udpLen >> 8)
Binary file not shown.