Initial project snapshot
This commit is contained in:
@@ -0,0 +1,22 @@
|
||||
# Development container image for the rdp-proxy workspace.
# Pinned Debian 12 slim base with the C/C++ toolchain and the FreeRDP/WinPR
# development packages installed below.
FROM debian:12.10-slim

# Keep apt/dpkg from prompting during the image build.
ENV DEBIAN_FRONTEND=noninteractive

# Install build tooling and library headers in a single layer, then drop the
# apt package lists so they are not baked into the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        bash \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        gdb \
        git \
        freerdp2-dev \
        libboost-dev \
        libssl-dev \
        libwinpr2-dev \
        ninja-build \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*

# Matches "workspaceFolder" in the accompanying devcontainer.json.
WORKDIR /workspaces/rdp-proxy
|
||||
@@ -0,0 +1,22 @@
|
||||
{
  "name": "rdp-proxy-worker",
  "build": {
    "dockerfile": "Dockerfile",
    "context": ".."
  },
  "workspaceFolder": "/workspaces/rdp-proxy",
  "customizations": {
    "vscode": {
      "settings": {
        "cmake.sourceDirectory": "${workspaceFolder}/workers/rdp-worker",
        "cmake.buildDirectory": "${workspaceFolder}/workers/rdp-worker/build"
      },
      "extensions": [
        "ms-vscode.cmake-tools",
        "ms-vscode.cpptools"
      ]
    }
  },
  "postCreateCommand": "cd workers/rdp-worker && cmake --preset dev",
  "remoteUser": "root"
}
|
||||
@@ -0,0 +1,28 @@
|
||||
# CI workflow: compiles the Go backend and builds the C++ RDP worker image.
name: build

# Run on every push and on every pull request.
on:
  push:
  pull_request:

jobs:
  # Go backend: download modules and compile all packages.
  backend:
    runs-on: ubuntu-24.04
    defaults:
      run:
        # All `run` steps in this job execute from the backend/ directory.
        working-directory: backend
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
          go-version: '1.23.8'
      - run: go mod download
      - run: go build ./...

  # RDP worker: build the Docker image and check the expected binaries exist.
  worker:
    runs-on: ubuntu-24.04
    steps:
      - uses: actions/checkout@v4
      - name: Build worker image
        run: docker build --tag rap-rdp-worker:ci --file workers/rdp-worker/Dockerfile workers/rdp-worker
      - name: Verify worker binary path
        # Overrides the image entrypoint with /bin/sh and tests that each binary is executable.
        run: docker run --rm --entrypoint /bin/sh rap-rdp-worker:ci -lc "test -x /usr/local/bin/rdp-worker && test -x /usr/local/bin/rdp-worker-dataplane-token-probe && test -x /usr/local/bin/rdp-worker-dataplane-bind-probe"
|
||||
+64
@@ -0,0 +1,64 @@
|
||||
# OS
.DS_Store
Thumbs.db

# IDE
.idea/
.vscode/
*.swp
*.swo

# Logs
*.log
logs/
tmp/
temp/

# Env
.env
.env.*
!.env.example

# Go
backend/bin/
backend/.cache/
backend/vendor/
*.test
coverage.out

# C/C++
build/
cmake-build-*/
CMakeFiles/
CMakeCache.txt
compile_commands.json
*.o
*.obj
*.so
*.a
*.dll
*.exe

# .NET / WPF
bin/
obj/
*.user
*.suo

# Node / React
node_modules/
dist/
build/
.next/
coverage/

# Python (if scripts appear later)
__pycache__/
*.pyc

# Docker
*.local.yml

# Generated artifacts
artifacts/
out/
|
||||
@@ -0,0 +1,618 @@
|
||||
# CODEX CONTEXT
|
||||
|
||||
## Project identity
|
||||
|
||||
This project is a production-grade distributed secure access platform.
|
||||
|
||||
It started as a custom RDP proxy with persistent server-side sessions, but the final target architecture is broader:
|
||||
|
||||
- distributed secure access fabric
|
||||
- multi-tenant platform
|
||||
- session broker for GUI and future non-GUI protocols
|
||||
- cluster mesh of nodes
|
||||
- connector/VPN layer
|
||||
- customer-managed and platform-managed nodes
|
||||
- node-agent based self-update / rollback / health supervision
|
||||
|
||||
## Current proven foundation
|
||||
|
||||
The current codebase has already proven the riskiest low-level lifecycle assumptions for RDP:
|
||||
|
||||
- real FreeRDP connect works
|
||||
- session state transitions to the `active` state work
|
||||
- terminate works
|
||||
- detach works without killing the remote session
|
||||
- reattach works without recreating the remote session
|
||||
- takeover works without recreating the remote session
|
||||
- per-resource certificate verification policy exists
|
||||
- `certificate_verification_mode = strict | ignore`
|
||||
- `strict` is default
|
||||
- `ignore` works on a per-resource basis
|
||||
- worker build is reproducible
|
||||
- backend build is reproducible
|
||||
|
||||
This proven lifecycle must NOT be broken by future architecture work.
|
||||
|
||||
## Current architecture baseline
|
||||
|
||||
Current audit and baseline snapshot:
|
||||
|
||||
- `docs/audits/PROJECT_AUDIT_2026-04-26.md`
|
||||
- `docs/audits/CURRENT_BASELINE_MATRIX.md`
|
||||
|
||||
### Test environment
|
||||
- Canonical test Docker host: `192.168.200.61`
|
||||
- Canonical Docker context: `test-ubuntu`
|
||||
- Canonical SSH alias: `docker-test`
|
||||
- Backend API for local/client smoke runs: `http://192.168.200.61:8080/api/v1`
|
||||
- WebSocket gateway for local/client smoke runs: `ws://192.168.200.61:8080/api/v1/gateway/ws`
|
||||
- Stage C17 planning is completed.
|
||||
- C17A synthetic mesh runtime skeleton is implemented and test-proven in
|
||||
`rap-node-agent` only. It is disabled by default and carries synthetic
|
||||
`fabric.probe` / `fabric.probe_ack` messages only.
|
||||
- C17B route health and failover probes are implemented and test-proven in
|
||||
`rap-node-agent` only. They are disabled by default and carry synthetic
|
||||
`fabric.route_health` / `fabric.route_health_ack` messages only.
|
||||
- C17C relay semantic hardening is implemented and test-proven in
|
||||
`rap-node-agent` only. It is disabled by default and models synthetic
|
||||
per-channel queues/QoS/backpressure only.
|
||||
- C17D non-production test-service path is implemented and test-proven in
|
||||
`rap-node-agent` only. It is disabled by default and carries only bounded
|
||||
`synthetic.echo` test payloads.
|
||||
- C17E/C17F/C17G are implemented and proven for live synthetic HTTP transport,
|
||||
scoped synthetic route config, and Control Plane scoped synthetic config
|
||||
consumption.
|
||||
- C17H deployed multi-agent synthetic config smoke is runtime-proven on
|
||||
`docker-test`: five running `rap-node-agent` containers consume
|
||||
backend-issued node-scoped synthetic config, direct and single-relay
|
||||
synthetic route-health observations return to the Control Plane, and
|
||||
production forwarding remains disabled.
|
||||
- C17I production forwarding gate foundation is implemented and test-proven:
|
||||
`rap-node-agent` has an explicit production-forwarding gate, while
|
||||
`/mesh/v1/forward` still refuses production payload forwarding until a later
|
||||
approved runtime stage.
|
||||
- C17J production envelope contract is implemented and test-proven:
|
||||
`/mesh/v1/forward` validates route-bound production envelopes for
|
||||
`fabric_control` / `fabric.control` only when the gate is enabled, rejects
|
||||
service channels, and still refuses production forwarding.
|
||||
- C17K production envelope observation is implemented and test-proven:
|
||||
valid accepted envelopes can be observed locally as metadata-only records
|
||||
after validation; rejected envelopes are not observed, observation failure
|
||||
fails closed, and production forwarding remains unavailable.
|
||||
- C17L bounded production observation sink is implemented and test-proven:
|
||||
accepted metadata-only observations can be retained locally with fixed
|
||||
capacity, oldest-entry drop behavior, and no payload body storage.
|
||||
- C17M production observation sink wiring is implemented and test-proven:
|
||||
node-agent can wire the bounded local metadata-only sink when
|
||||
`RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY` is explicitly greater than
|
||||
zero; the wiring is disabled by default and exposes no read API.
|
||||
- C17N production observation sink metrics are implemented and test-proven:
|
||||
local sink metrics expose only capacity, current depth, accepted total, and
|
||||
dropped-oldest total; they expose no observation records or payload metadata.
|
||||
- C17O production observation sink local metrics logging is implemented and
|
||||
test-proven: node-agent logs aggregate sink metrics locally when the sink is
|
||||
explicitly enabled; no read API or Control Plane reporting is added.
|
||||
- C17P production observation sink change-driven metrics logging is implemented
|
||||
and test-proven: node-agent suppresses repeated identical local sink metrics
|
||||
logs; no read API or Control Plane reporting is added.
|
||||
- C17Q production forwarding gate/runtime log boundary is implemented and
|
||||
test-proven: node-agent logs production forwarding gate state separately from
|
||||
production forwarding runtime state. Runtime state remained false until
|
||||
C17Z introduced gate-controlled `fabric.control` direct forwarding.
|
||||
- C17R production observation sink capacity guard is implemented and
|
||||
test-proven: `RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY` is rejected
|
||||
above `10000`.
|
||||
- C17S production observation panic fail-closed hardening is implemented and
|
||||
test-proven: observer errors and observer panics both fail closed as
|
||||
observation failure.
|
||||
- C17T production envelope payload boundary is implemented and test-proven:
|
||||
validated production `fabric.control` envelope payloads are bounded to
|
||||
`4096` bytes and oversized envelopes are rejected before observation.
|
||||
- C17U production envelope created-at skew boundary is implemented and
|
||||
test-proven: validated production `fabric.control` envelopes whose
|
||||
`created_at` is more than one minute in the future are rejected before
|
||||
observation.
|
||||
- C17V peer endpoint candidate model is implemented and test-proven:
|
||||
node-scoped synthetic mesh config now carries route-scoped endpoint
|
||||
candidates with transport, address, reachability, NAT type, connectivity
|
||||
mode, priority, policy tags, verification time, and metadata. This is a
|
||||
model/config boundary only; no production route scoring, NAT traversal,
|
||||
shortcut routing, or forwarding runtime is implemented.
|
||||
- C17W peer endpoint candidate scoring model is implemented and test-proven:
|
||||
`rap-node-agent` can rank already-scoped endpoint candidates using soft
|
||||
inputs such as transport, reachability, connectivity mode, NAT type,
|
||||
priority, region, policy tags, channel class, and verification age. This is
|
||||
a scoring helper only; it does not open connections, choose production
|
||||
routes, or forward payloads.
|
||||
- C17X health-aware endpoint candidate scoring overlay is implemented and
|
||||
test-proven: endpoint candidate scoring can optionally use local health
|
||||
observations keyed by `endpoint_id`, including latency, success/failure
|
||||
history, recent failure reason, reliability score, and observation freshness.
|
||||
This remains advisory scoring only and is not wired into production route
|
||||
execution.
|
||||
- C17Y Platform Owner synthetic mesh visibility is implemented and
|
||||
build/test-proven: `web-admin` reads node-scoped synthetic mesh config and
|
||||
shows config enabled state, route counts, peer endpoints, endpoint
|
||||
candidates, C17X advisory scoring boundary, and `production_forwarding`.
|
||||
This remains platform-owner visibility only and does not enable production
|
||||
forwarding.
|
||||
- C17Z production fabric-control direct forwarding boundary is implemented and
|
||||
test-proven: when `RAP_MESH_PRODUCTION_FORWARDING_ENABLED=true`,
|
||||
`/mesh/v1/forward` can deliver valid route-bound `fabric.control` envelopes
|
||||
at the local destination or forward them to a direct next hop from explicit
|
||||
peer endpoint config. Service channels, arbitrary relay forwarding,
|
||||
multi-hop production route execution, and RDP/VPN/file/video/service payloads
|
||||
remain unavailable.
|
||||
- C17Z1 production fabric-control multi-hop route-path boundary is implemented
|
||||
and test-proven: production `fabric.control` envelopes can carry
|
||||
`route_path` and `visited_node_ids`; relay nodes validate path position,
|
||||
forward only to the next path node, update TTL/hop/visited metadata, and
|
||||
reject loops. Service payloads remain unavailable.
|
||||
- C17Z2 production fabric-control forwarding observability boundary is
|
||||
implemented and test-proven: node-agent emits local
|
||||
`mesh_production_forward_event` logs for accepted, forwarded, delivered, and
|
||||
rejected production `fabric.control` envelopes. Logs are metadata-only and
|
||||
include no payload bodies; no read API is added.
|
||||
- C17Z3 production fabric-control route-config boundary is implemented and
|
||||
test-proven: when scoped/control-plane mesh routes are available locally,
|
||||
production `fabric.control` envelopes must match configured route_id/path/
|
||||
next-hop/channel/expiry/TTL/hop limits before forwarding.
|
||||
- C17Z4 scoped peer directory and recovery seeds boundary is implemented and
|
||||
test/build-proven: node-scoped mesh config carries scoped `peer_directory`
|
||||
and explicit bounded `recovery_seeds`; node-agent parses/validates them and
|
||||
web-admin shows counts.
|
||||
- C17Z5 node-agent peer cache runtime boundary is implemented and test-proven:
|
||||
node-agent builds a local `PeerCache`, selects bounded warm peers, probes warm
|
||||
peers with `/mesh/v1/health`, and reports metadata-only mesh-link
|
||||
observations when synthetic mesh testing is enabled.
|
||||
- C17Z6 dynamic endpoint reporting boundary is implemented and test-proven:
|
||||
node-agent reports explicit advertised mesh endpoint metadata in heartbeat,
|
||||
and Control Plane projects latest reported endpoints/candidates into
|
||||
node-scoped synthetic mesh config.
|
||||
- C17Z7 private/corporate endpoint candidate boundary is implemented and
|
||||
test-proven: node-agent reports multiple advertised endpoint candidates,
|
||||
scoring rewards private/corporate same-site candidates, and peer cache can
|
||||
use the best candidate address for warm health.
|
||||
- C17Z8 peer connection state machine boundary is implemented and test-proven:
|
||||
node-agent tracks warm-peer states `disconnected`, `connecting`, `ready`,
|
||||
`degraded`, and `backoff`, with bounded backoff after repeated health probe
|
||||
failures.
|
||||
- C17Z9 peer recovery planner boundary is implemented and test-proven:
|
||||
node-agent targets a bounded stable ready-peer set, enters recovery when
|
||||
ready peers fall below target, and selects bounded recovery probes from warm
|
||||
peers, recovery seeds, and other connectable scoped peers.
|
||||
- C17Z10 peer connection intent planner boundary is implemented and
|
||||
test-proven: node-agent classifies bounded peer work as maintain/probe/
|
||||
recover and classifies transport readiness as direct/private_lan/
|
||||
corporate_lan/outbound_only/relay_required, with rendezvous-required
|
||||
metadata only.
|
||||
- C17Z11 peer connection manager runtime boundary is implemented and
|
||||
test-proven: node-agent uses a reusable HTTP keep-alive client for real
|
||||
control-plane health probes of direct/private/corporate peers and records
|
||||
`waiting_rendezvous` for outbound-only/relay-required peers.
|
||||
- C17Z12 rendezvous/relay control-plane contract is implemented and
|
||||
docker-test-runtime-proven: backend issues node-scoped `rendezvous_leases`,
|
||||
node-agent resolves matching `waiting_rendezvous` intents into
|
||||
`relay_control`, probes relay `/mesh/v1/health`, records and maintains
|
||||
`relay_ready`, and keeps service payload forwarding disabled.
|
||||
- C17Z13 rendezvous lease telemetry is implemented and
|
||||
docker-test-runtime-proven: node-agent reports
|
||||
`mesh_rendezvous_lease_report` with relay admission, peer admission,
|
||||
TTL/renewal posture, `relay_ready`, and explicit no-payload boundary flags;
|
||||
web-admin shows `rv leases` in recent heartbeat tables.
|
||||
- C17Z14 rendezvous lease refresh contract is implemented and
|
||||
docker-test-runtime-proven: node-agent refreshes renewal-needed/stale
|
||||
rendezvous leases through node-scoped synthetic config reload, updates the
|
||||
running peer cache/route/lease state, and reports refresh plus stale relay
|
||||
withdrawal/reselection telemetry. Service payload forwarding remains
|
||||
unavailable.
|
||||
- C17Z15 backend relay replacement policy is implemented and
|
||||
docker-test-runtime-proven: backend consumes recent stale-relay heartbeat
|
||||
feedback, withdraws stale explicit rendezvous leases, scores alternate relay
|
||||
candidates from route adjacency, endpoint priority, policy tags, and recent
|
||||
mesh-link health, and returns replacement leases plus
|
||||
`rendezvous_relay_policy` decisions in node-scoped synthetic config.
|
||||
Node-agent reports `c17z15.mesh_rendezvous_lease_report.v1` and keeps stale
|
||||
state scoped to the exact lease/relay, so replacement leases for the same
|
||||
peer are not marked stale by association. Service payload forwarding remains
|
||||
unavailable.
|
||||
- C17Z16 route/path decision artifact is implemented and
|
||||
docker-test-runtime-proven: backend `c17z16.synthetic.v1` config includes
|
||||
`route_path_decisions` with original hops, effective hops, local previous/
|
||||
next hop, selected replacement relay, generation, score reasons, and
|
||||
no-payload boundary flags. Node-agent stores the control-plane route
|
||||
generation and reports `c17z16.mesh_route_path_decision_report.v1` plus
|
||||
`c17z16.mesh_rendezvous_lease_report.v1`. Service payload forwarding remains
|
||||
unavailable.
|
||||
- C17Z17 node-side route generation tracker is implemented and
|
||||
docker-test-runtime-proven: backend `c17z17.synthetic.v1` config and
|
||||
node-agent `mesh_route_generation_report` track active/applied/unchanged/
|
||||
withdrawn route decisions, generation changes, total counters, and
|
||||
`withdrawn_by_replacement` records for stale relay paths when replacement is
|
||||
first observed. Service payload forwarding remains unavailable.
|
||||
- C17Z18 synthetic route-health effective path runtime is implemented and
|
||||
docker-test-runtime-proven: backend `c17z18.synthetic.v1` config and
|
||||
node-agent `mesh_route_health_config_report` apply Control Plane
|
||||
`route_path_decisions` to synthetic route-health route config only. The
|
||||
synthetic runtime probes selected effective paths through replacement relays,
|
||||
reports expected/observed hops and drift state, and backend latest mesh links
|
||||
preserve route-health observations separately from connection-manager
|
||||
observations. Service payload forwarding remains unavailable.
|
||||
- C17Z19 synthetic route-health feedback scoring is implemented and
|
||||
docker-test-runtime-proven: backend consumes recent `synthetic_route_health`
|
||||
observations in relay scoring, uses drift/unreachable/failure metadata to
|
||||
mark the exact selected relay stale, boosts healthy low-latency relay
|
||||
candidates, and returns replacement leases/route decisions through the
|
||||
existing synthetic config contract. Migration `000022` adds the `synthetic`
|
||||
mesh service class. Service payload forwarding remains unavailable.
|
||||
- C17Z20 node-side route-health feedback refresh is implemented and
|
||||
docker-test-runtime-proven: after reporting synthetic route-health
|
||||
drift/unreachable/failure, node-agent performs a bounded node-scoped
|
||||
synthetic-config refresh, applies returned replacement route decisions to
|
||||
route-health config immediately, and reports
|
||||
`c17z20.mesh_route_health_feedback_refresh_report.v1`. Service payload
|
||||
forwarding remains unavailable.
|
||||
- Installation Authority foundation is implemented: production requires strict
|
||||
Product Root public key config, first-owner bootstrap uses signed Ed25519
|
||||
activation manifests, `installation_authority` and signed
|
||||
`platform_role_grants` are persisted, and strict platform-admin checks ignore
|
||||
direct `users.platform_role` database edits without a valid signed grant.
|
||||
Web-admin exposes installation status/first-owner bootstrap, and
|
||||
`scripts/installation/product-root-tool.go` generates keys/manifests for
|
||||
offline product-root operations.
|
||||
- Cluster Authority and node enrollment bootstrap are docker-test lifecycle
|
||||
smoke-proven in run `dev-bootstrap-20260428-201430`: a fresh dev install
|
||||
bootstrapped the first owner, created a cluster, issued a signed join token,
|
||||
accepted real `rap-node-agent` enrollment, owner-approved the join request,
|
||||
agent-polled signed bootstrap, persisted cluster authority pin, heartbeated,
|
||||
and verified signed `c17z18.synthetic.v1` Control Plane config. Production
|
||||
service payload forwarding remains unavailable.
|
||||
- Migration `000021_cluster_authority_keys` drops/recreates
|
||||
`cluster_admin_summaries` because fresh replay proved PostgreSQL cannot
|
||||
change that view layout via `CREATE OR REPLACE VIEW`.
|
||||
- `rap-node-agent` desired-workload polling/status reporting is gated by
|
||||
`RAP_WORKLOAD_SUPERVISION_ENABLED` (default `false`) while service runtime
|
||||
supervision remains a stub.
|
||||
- C18 VPN/IP tunnel service target design is completed as documentation only.
|
||||
- C18A VPN/IP tunnel control-plane data model foundation is implemented and
|
||||
backend-test-proven.
|
||||
- C18B VPN/IP tunnel lease/fencing hardening is implemented and
|
||||
backend-test-proven.
|
||||
- C18C VPN/IP tunnel node-agent desired-state consumption/reporting is
|
||||
implemented and backend-test-proven.
|
||||
- No next platform-core implementation step is automatically authorized after
|
||||
C17Z20. The next mesh layer should stay limited to route-health feedback
|
||||
refresh dampening/no-change cooldown unless the user explicitly chooses
|
||||
another staged task.
|
||||
- Latest RDP performance reference image:
|
||||
`rap-rdp-worker:rdp-perf6-dirty-region`
|
||||
- Stage 5.2 file-download runtime artifacts remain preserved for when RDP work
|
||||
resumes, but they are not the active next task.
|
||||
- Do not use `docker.cin.su` for this project unless explicitly requested for a separate one-off check.
|
||||
|
||||
### Backend
|
||||
- Go
|
||||
- PostgreSQL = source of truth
|
||||
- Redis = live coordination / routing only
|
||||
- REST for control plane
|
||||
- WebSocket for live session channel
|
||||
|
||||
### Worker
|
||||
- C++ worker
|
||||
- FreeRDP integration
|
||||
- worker runtime hides FreeRDP details from backend
|
||||
- The C++ worker remains the primary RDP runtime.
|
||||
- Target RDP performance direction: `docs/architecture/RDP_SERVICE_CPP_PERFORMANCE_TARGET.md`.
|
||||
- The RDP performance rewrite scope is limited to C++ RDP service adapter
|
||||
internals. It must not redesign backend control plane, cluster transport,
|
||||
organizations, leases, or session lifecycle.
|
||||
- The C# RDP service skeleton is inactive research scaffolding and is not the
|
||||
current runtime direction.
|
||||
- Current RDP Adapter baseline: RDP-Perf-6 dirty-region direct binary rendering
|
||||
is completed and smoke-proven on `docker-test`. RDP work is paused by product
|
||||
decision; next active work is Fabric Core / cluster foundation.
|
||||
- P3/P3.1 security-readiness foundation exists: production mode rejects
|
||||
plaintext credential-like resource metadata, requires `secret_ref` for
|
||||
RDP/VNC/SSH resources, and has an encrypted PostgreSQL-backed resource secret
|
||||
storage/resolver MVP. P3.2 direct-worker TLS/PKI guard exists.
|
||||
- P3.3 production-like test-stand smoke is complete on `docker-test`: backend
|
||||
runs in `APP_ENV=production` with a test-only secret key file, a secret-backed
|
||||
RDP resource starts real sessions through the resolver path, metadata/audit do
|
||||
not contain plaintext credentials, and backend gateway fallback remains
|
||||
available when direct worker WSS trust is `smoke_insecure`.
|
||||
- P3.4 production direct-worker WSS trust model is documented in
|
||||
`docs/architecture/PRODUCTION_DIRECT_WORKER_WSS_TRUST.md`; it defines
|
||||
platform CA/public CA behavior, worker certificate SAN/identity requirements,
|
||||
app-local Windows trust direction, rotation/revocation, and the future
|
||||
`platform_ca` smoke plan. No RDP runtime behavior changed in P3.4.
|
||||
- P3.5 app-local platform CA trust is implemented and runtime-proven on
|
||||
`docker-test`: Windows client validates direct worker WSS with an app-local
|
||||
platform CA bundle, keeps hostname/SAN validation enabled, selects
|
||||
`direct_worker_wss` without insecure TLS bypass, and falls back to backend
|
||||
gateway for unknown CA / smoke-only production cases.
|
||||
- P3.6 stale Redis worker/live event idempotency is implemented and
|
||||
runtime-proven: stale worker events for terminal PostgreSQL sessions are
|
||||
ignored, backend restart survives stale Redis events, and terminal sessions
|
||||
are not reopened.
|
||||
- Stage 5.2 server-to-client file download core data path is runtime-proven:
|
||||
direct worker WSS and backend gateway fallback both download text/binary
|
||||
files from `RAP_Transfers\ToClient` with matching size/hash, and direct
|
||||
policy blocking is proven for `disabled` and `client_to_server`. Lifecycle
|
||||
blocking is also runtime-proven for detach, old-client takeover, and worker
|
||||
failure. Runtime report:
|
||||
`artifacts/stage5-2-file-download-runtime-report.md`.
|
||||
- Stage 5.2 is not fully accepted yet. Remaining proof: Windows desktop UI
|
||||
download path and regression matrix for rendering/input/clipboard/upload/
|
||||
reconnect/takeover.
|
||||
|
||||
### Clients
|
||||
- future native clients:
|
||||
- Windows: native desktop client first
|
||||
- Linux: native desktop client later
|
||||
- web UI is admin/control plane, not the primary power-user client
|
||||
|
||||
## Final architecture direction
|
||||
|
||||
The long-term target architecture is documented in:
|
||||
|
||||
- `docs/architecture/SECURE_ACCESS_FABRIC_TARGET.md`
|
||||
- `docs/architecture/CLUSTER_NODE_ADMIN_FOUNDATION.md`
|
||||
- `docs/architecture/WEB_INGRESS_AND_ADMIN_UI_MODEL.md`
|
||||
|
||||
`SECURE_ACCESS_FABRIC_TARGET.md` defines the target Secure Access Fabric architecture only. It is not the current implementation scope and must not be used as permission to start mesh, VPN, multi-cluster, updater, or realtime data-plane migration work without an explicit staged prompt.
|
||||
|
||||
`CLUSTER_NODE_ADMIN_FOUNDATION.md` defines the next platform-core planning
|
||||
baseline for clusters, node enrollment, native node-agent identity, platform
|
||||
admin console, multi-cluster administration, and future organization admin
|
||||
visibility. It is a staged foundation document, not permission to implement
|
||||
mesh packet routing or VPN runtime.
|
||||
|
||||
`WEB_INGRESS_AND_ADMIN_UI_MODEL.md` defines WEB as HTTP/HTTPS ingress and
|
||||
Admin UI presentation only. Cluster configuration remains Control Plane
|
||||
ownership through scoped APIs, PostgreSQL source-of-truth mutations, and audit.
|
||||
Dynamic pages must be safe schema-driven projections and must not embed
|
||||
internal topology, peer caches, route caches, secrets, raw credentials, or
|
||||
arbitrary executable code.
|
||||
|
||||
Admin endpoint placement is explicit. Fabric Storage / Config Storage nodes do
|
||||
not automatically host or move the cluster panel. Platform Owner Console
|
||||
remains global platform-owner scope. Cluster Admin Endpoint requires explicit
|
||||
admin/web ingress role assignment, cluster health/trust readiness, and Control
|
||||
Plane authorization. Organization Admin Panel remains a tenant-safe projection.
|
||||
|
||||
The final platform must support:
|
||||
|
||||
1. Multi-tenancy / Organizations
|
||||
- platform has many organizations
|
||||
- each organization has isolated users, groups, resources, policies, audit, connectors
|
||||
- users may belong to multiple organizations
|
||||
- organization admins only see their organization
|
||||
- platform admins see platform scope
|
||||
|
||||
2. Identity federation
|
||||
- local users
|
||||
- LDAP / Active Directory
|
||||
- OIDC
|
||||
- future extensibility for more identity sources
|
||||
- access mappings based on external groups / claims
|
||||
|
||||
3. Cluster of nodes
|
||||
- no mandatory single central node
|
||||
- many nodes across many sites
|
||||
- nodes can be platform-managed or customer-managed
|
||||
- customer-managed nodes are sandboxed cluster participants, not full cluster owners
|
||||
|
||||
4. Node agent
|
||||
- small stable always-running agent on every node
|
||||
- supervises services
|
||||
- downloads updates
|
||||
- verifies signed artifacts
|
||||
- can rollback to previous version
|
||||
- can restart crashed services
|
||||
- can work on thin or thick nodes
|
||||
|
||||
5. Service-based node model
|
||||
Each node is not monolithic.
|
||||
A node has:
|
||||
- capabilities: what it can do physically/technically
|
||||
- enabled services: what it is allowed/assigned to do
|
||||
|
||||
Possible services include:
|
||||
- ingress-gateway
|
||||
- mesh-router
|
||||
- relay
|
||||
- connector-host
|
||||
- vpn-adapter
|
||||
- session-worker
|
||||
- media-relay
|
||||
- file-relay
|
||||
- update-cache
|
||||
- config-replica
|
||||
- audit-sink
|
||||
- metrics-exporter
|
||||
|
||||
6. Cluster mesh and routing
|
||||
- encrypted inter-node communication
|
||||
- dynamic topology
|
||||
- no need for full mesh
|
||||
- multi-hop routing allowed
|
||||
- route failover
|
||||
- client failover between ingress nodes
|
||||
- connector failover between nodes
|
||||
|
||||
7. Split-brain prevention
|
||||
- quorum-based cluster behavior
|
||||
- minority partition must not become a second authoritative cluster
|
||||
- degraded / recovery / isolated modes
|
||||
- manual recovery / promote decision by platform recovery admin
|
||||
|
||||
8. Connector / VPN layer
|
||||
- connectors are reusable network access methods
|
||||
- one connector may be used by multiple resources
|
||||
- connector placement and failover are controlled by policy
|
||||
- nodes may be allowed or disallowed to host connectors
|
||||
- direct access, VPN, relay and future egress modes must fit this model
|
||||
|
||||
9. Future exit mode
|
||||
- split tunnel
|
||||
- full tunnel
|
||||
- internet access through cluster
|
||||
- not first implementation priority
|
||||
|
||||
## Non-negotiable design rules
|
||||
|
||||
- Do not rewrite proven session lifecycle carelessly.
|
||||
- Do not turn Redis into a source of truth.
|
||||
- Do not make certificate-ignore a global worker setting.
|
||||
- Do not make customer-managed nodes platform-wide trusted by default.
|
||||
- Do not create a separate cluster per organization.
|
||||
- Do not assume a single permanently reachable central node.
|
||||
- Do not rely on “secret protocol with no docs” as security.
|
||||
- Security must come from crypto, auth, isolation, policy and observability.
|
||||
- Prefer incremental evolution from the current proven system.
|
||||
- Do not collapse platform control plane and data plane into one vague layer.
|
||||
|
||||
## Implementation strategy
|
||||
|
||||
The codebase must evolve in phases.
|
||||
|
||||
Current implementation focus remains:
|
||||
- RDP work is paused by product decision
|
||||
- preserve the accepted RDP Adapter baseline and Stage 5.x file-transfer work
|
||||
- do not delete or rewrite the current RDP MVP while platform-core work starts
|
||||
- C1-C9 platform-core foundations are implemented and verified: clusters,
|
||||
node enrollment, node-agent scaffold, platform admin console, workload
|
||||
supervision contract, mesh control-plane prep, mesh skeleton, multi-cluster
|
||||
hardening, and organization admin foundation
|
||||
- C10 Fabric Core configuration distribution design is completed
|
||||
- C11 signed scoped cluster snapshot model is completed
|
||||
- C12 node local state store is completed
|
||||
- C13 Fabric Storage / Config Storage service foundation is completed
|
||||
- C14 peer directory and cache model is completed
|
||||
- C15 Fabric Routing Engine skeleton is completed
|
||||
- C16 secure node-to-node channel lifecycle is completed
|
||||
- C17 mesh routing runtime implementation plan is completed
|
||||
- C17A synthetic mesh runtime skeleton is implemented and test-proven with
|
||||
synthetic fabric messages only, no RDP/VPN/production service traffic
|
||||
- C17B route health and failover probes are implemented and test-proven with
|
||||
synthetic traffic only, no RDP/VPN/production service traffic
|
||||
- C17C relay semantic hardening is implemented and test-proven with synthetic
|
||||
channel classes only, no RDP/VPN/production service traffic
|
||||
- C17D non-production test-service path is implemented and test-proven with
|
||||
bounded `synthetic.echo` traffic only, no RDP/VPN/production service traffic
|
||||
- C17E live node-to-node synthetic HTTP transport is implemented and
|
||||
smoke-proven with synthetic traffic only
|
||||
- C17F scoped synthetic route config loading and route-health reporting is
|
||||
implemented and smoke-proven with synthetic traffic only
|
||||
- C17G Control Plane scoped synthetic config read/consume is implemented and
|
||||
test-proven with synthetic traffic only
|
||||
- C17H deployed multi-agent synthetic config smoke is implemented and
|
||||
runtime-proven on `docker-test` with synthetic traffic only
|
||||
- C17I production forwarding gate foundation is implemented and test-proven;
|
||||
production forwarding remains unavailable
|
||||
- C17J production envelope contract validation is implemented and test-proven;
|
||||
production forwarding remains unavailable
|
||||
- C17K production envelope observation is implemented and test-proven;
|
||||
production forwarding remains unavailable
|
||||
- C17L bounded production observation sink is implemented and test-proven;
|
||||
production forwarding remains unavailable
|
||||
- C17M production observation sink wiring is implemented and test-proven;
|
||||
production forwarding remains unavailable
|
||||
- C17N production observation sink metrics are implemented and test-proven;
|
||||
production forwarding remains unavailable
|
||||
- C17O production observation sink local metrics logging is implemented and
|
||||
test-proven; production forwarding remains unavailable
|
||||
- C17P production observation sink change-driven metrics logging is implemented
|
||||
and test-proven; production forwarding remains unavailable
|
||||
- C17Q production forwarding gate/runtime log boundary is implemented and
|
||||
test-proven; production forwarding remains unavailable
|
||||
- C17R production observation sink capacity guard is implemented and
|
||||
test-proven; production forwarding remains unavailable
|
||||
- C17S production observation panic fail-closed hardening is implemented and
|
||||
test-proven; production forwarding remains unavailable
|
||||
- C17T production envelope payload boundary is implemented and test-proven;
|
||||
production forwarding remains unavailable
|
||||
- C17U production envelope created-at skew boundary is implemented and
|
||||
test-proven; production forwarding remains unavailable
|
||||
- C17V peer endpoint candidate model and NAT/connectivity hints are
|
||||
implemented and test-proven; production forwarding remains unavailable
|
||||
- C17W peer endpoint candidate scoring model is implemented and test-proven;
|
||||
production forwarding remains unavailable
|
||||
- C17X health-aware endpoint candidate scoring overlay is implemented and
|
||||
test-proven; production forwarding remains unavailable
|
||||
- C17Y Platform Owner synthetic mesh visibility is implemented and
|
||||
build/test-proven; production forwarding remains unavailable
|
||||
- C17Z production fabric-control direct forwarding is implemented and
|
||||
test-proven; production service traffic remains unavailable
|
||||
- C17Z1 production fabric-control multi-hop route-path forwarding is
|
||||
implemented and test-proven; production service traffic remains unavailable
|
||||
- C17Z2 production fabric-control forwarding observability is implemented and
|
||||
test-proven; production service traffic remains unavailable
|
||||
- C17Z3 production fabric-control route-config boundary is implemented and
|
||||
test-proven; production service traffic remains unavailable
|
||||
- C17Z4 scoped peer directory/recovery seed boundary is implemented and
|
||||
test/build-proven; production service traffic remains unavailable
|
||||
- C17Z5 node-agent peer cache runtime boundary is implemented and test-proven;
|
||||
production service traffic remains unavailable
|
||||
- C17Z6 dynamic endpoint reporting boundary is implemented and test-proven;
|
||||
production service traffic remains unavailable
|
||||
- C17Z7 private/corporate endpoint candidate boundary is implemented and
|
||||
test-proven; production service traffic remains unavailable
|
||||
- C17Z8 peer connection state machine boundary is implemented and test-proven;
|
||||
production service traffic remains unavailable
|
||||
- C17Z9 peer recovery planner boundary is implemented and test-proven;
|
||||
production service traffic remains unavailable
|
||||
- C17Z10 peer connection intent planner boundary is implemented and
|
||||
test-proven; production service traffic remains unavailable
|
||||
- C17Z11 peer connection manager runtime boundary is implemented and
|
||||
test-proven; production service traffic remains unavailable
|
||||
- C17Z12 rendezvous/relay control-plane contract is implemented and
|
||||
docker-test-runtime-proven; production service traffic remains unavailable
|
||||
- C17Z13 rendezvous lease telemetry is implemented and
|
||||
docker-test-runtime-proven; production service traffic remains unavailable
|
||||
- C17Z14 rendezvous lease refresh contract is implemented and
|
||||
docker-test-runtime-proven; production service traffic remains unavailable
|
||||
- C17Z15 backend relay replacement policy is implemented and
|
||||
docker-test-runtime-proven; production service traffic remains unavailable
|
||||
- C17Z16 route/path decision artifact is implemented and
|
||||
docker-test-runtime-proven; production service traffic remains unavailable
|
||||
- C17Z17 node-side route generation tracker is implemented and
|
||||
docker-test-runtime-proven; production service traffic remains unavailable
|
||||
- C17Z18 synthetic route-health effective path runtime is implemented and
|
||||
docker-test-runtime-proven; production service traffic remains unavailable
|
||||
- C17Z19 synthetic route-health feedback scoring is implemented and
|
||||
docker-test-runtime-proven; production service traffic remains unavailable
|
||||
- C17Z20 node-side route-health feedback refresh is implemented and
|
||||
docker-test-runtime-proven; production service traffic remains unavailable
|
||||
- Cluster Authority plus node enrollment bootstrap polling are docker-test
|
||||
lifecycle-smoke-proven; fresh install migration replay is fixed for
|
||||
`cluster_admin_summaries`
|
||||
- C18 VPN/IP tunnel service target design is completed as documentation only
|
||||
- C18A VPN/IP tunnel control-plane data model foundation is implemented and
|
||||
backend-test-proven
|
||||
- C18B VPN/IP tunnel lease/fencing hardening is implemented and
|
||||
backend-test-proven
|
||||
- C18C VPN/IP tunnel node-agent desired-state consumption/reporting is
|
||||
implemented and backend-test-proven
|
||||
- Version Storage / Update Repository is documented as a future Fabric Core
|
||||
service for signed release manifests, OS/arch artifacts,
|
||||
stable/current/candidate channels, update-cache mirroring, node-agent
|
||||
update supervision, rollback, and explicit data-structure migration bundles.
|
||||
Runtime updater behavior is not implemented.
|
||||
- no next platform-core implementation step is automatically authorized after
|
||||
C17Z20; choose the next narrow staged prompt explicitly before continuing
|
||||
- preserve the proven RDP lifecycle behavior
|
||||
- keep the current backend gateway available as the active/fallback implementation path
|
||||
|
||||
The current phase is NOT:
|
||||
- full mesh routing implementation
|
||||
- full VPN orchestration
|
||||
- multi-cluster runtime traffic handling
|
||||
- production data-plane migration
|
||||
- updater runtime
|
||||
- video meetings
|
||||
- final native client UI redesign
|
||||
|
||||
Future mesh, VPN, multi-cluster, node-agent updater, and production realtime data-plane work must be introduced only through explicit, narrow, staged implementation prompts.
|
||||
|
||||
Always keep the project production-oriented. Do not simplify it into a toy app.
|
||||
@@ -0,0 +1,119 @@
|
||||
# Remote Access Platform
|
||||
|
||||
Production-oriented Secure Access Fabric platform.
|
||||
|
||||
The project started as an RDP proxy, but the target architecture is broader:
|
||||
|
||||
- multi-tenant control plane
|
||||
- direct realtime data plane
|
||||
- service adapters for RDP now and VNC/SSH/VPN/file/video later
|
||||
- native Access Clients
|
||||
- future secure mesh / node-agent / updater / connector model
|
||||
|
||||
RDP is the first proven service baseline. RDP work is currently paused by
|
||||
product decision while the project moves to the Secure Access Fabric
|
||||
platform-core foundation: clusters, node enrollment, node-agent identity, role
|
||||
assignments, and platform administration.
|
||||
|
||||
## Current Baseline
|
||||
|
||||
- Backend: Go control plane with PostgreSQL as source of truth and Redis for
|
||||
live coordination/routing only.
|
||||
- Worker: active C++ RDP Adapter worker based on FreeRDP.
|
||||
- Windows client: C# / WPF native Access Client.
|
||||
- Data plane: direct worker WSS for realtime RDP when available, backend
|
||||
WebSocket gateway retained as fallback/debug.
|
||||
- Current test Docker host: `docker-test` / `192.168.200.61`.
|
||||
- Current test Docker deployment for Stage 5.2 proof:
|
||||
`rap-backend-smoke:stage5-2-download` and
|
||||
`rap-rdp-worker:stage5-2-download`.
|
||||
|
||||
See the current audit and baseline matrix before starting new work:
|
||||
|
||||
- `docs/audits/PROJECT_AUDIT_2026-04-26.md`
|
||||
- `docs/audits/CURRENT_BASELINE_MATRIX.md`
|
||||
|
||||
## Proven RDP Capabilities
|
||||
|
||||
- real RDP connect through worker
|
||||
- active/detach/reattach/takeover/terminate lifecycle
|
||||
- takeover without remote session recreation
|
||||
- worker-death/orphan-active-session recovery
|
||||
- direct worker WSS data plane
|
||||
- binary direct render frames
|
||||
- backend gateway JSON/base64 fallback
|
||||
- Windows client rendering and input
|
||||
- text clipboard with policy enforcement
|
||||
- client-to-server file upload into controlled worker storage
|
||||
- restricted transfer-drive visibility through `RAP_Transfers`, runtime-proven
|
||||
- server-to-client file download implementation through
|
||||
`RAP_Transfers\ToClient`; core direct/fallback data path is runtime-proven,
|
||||
lifecycle blocking is runtime-proven, and manual desktop UI acceptance remains
|
||||
pending
|
||||
|
||||
## Repository Structure
|
||||
|
||||
- `backend/` - Go control plane
|
||||
- `workers/rdp-worker/` - active C++ RDP Adapter worker
|
||||
- `workers/rdp-service-csharp/` - inactive research scaffold, not current runtime
|
||||
- `clients/windows/` - Windows native Access Client
|
||||
- `docs/architecture/` - target and staged architecture documents
|
||||
- `docs/codex/` - current Codex status and next-step prompts
|
||||
- `docs/audits/` - current audits and baseline matrices
|
||||
- `scripts/` - smoke and helper scripts
|
||||
- `deploy/` - deployment assets
|
||||
- `web-admin/` - future/admin UI area
|
||||
|
||||
## Read Order
|
||||
|
||||
1. `CODEX_CONTEXT.md`
|
||||
2. `docs/audits/PROJECT_AUDIT_2026-04-26.md`
|
||||
3. `docs/audits/CURRENT_BASELINE_MATRIX.md`
|
||||
4. `docs/codex/CURRENT_STATUS.md`
|
||||
5. `docs/codex/ARCHITECTURE_GUARDRAILS.md`
|
||||
6. `docs/architecture/RDP_ADAPTER_RUNTIME.md`
|
||||
7. `docs/architecture/DATA_PLANE_V1.md`
|
||||
8. `docs/architecture/CLUSTER_NODE_ADMIN_FOUNDATION.md`
|
||||
9. `docs/codex/NEXT_STEP_PROMPT.md`
|
||||
|
||||
Do not use `docs/_legacy_v1` for implementation decisions. Legacy files are
|
||||
historical reference only.
|
||||
|
||||
## Current Next Step
|
||||
|
||||
RDP work is paused. Platform-core stages C1-C9 are implemented and verified:
|
||||
cluster/node model foundation, node enrollment hardening, native
|
||||
`rap-node-agent` MVP, Platform Admin Console MVP, service workload supervision
|
||||
contract, mesh control-plane preparation, Mesh MVP skeleton, multi-cluster
|
||||
hardening, and organization admin foundation.
|
||||
|
||||
Planning baseline:
|
||||
|
||||
- `docs/architecture/CLUSTER_NODE_ADMIN_FOUNDATION.md`
|
||||
- `artifacts/c1-cluster-node-foundation-report.md`
|
||||
- `artifacts/c2-node-enrollment-hardening-report.md`
|
||||
- `artifacts/c3-rap-node-agent-mvp-report.md`
|
||||
- `artifacts/c4-platform-admin-console-report.md`
|
||||
- `artifacts/c5-service-workload-supervision-contract-report.md`
|
||||
- `artifacts/c6-mesh-control-plane-preparation-report.md`
|
||||
- `artifacts/c7-mesh-mvp-skeleton-report.md`
|
||||
- `artifacts/c8-multi-cluster-hardening-report.md`
|
||||
- `artifacts/c9-organization-admin-foundation-report.md`
|
||||
|
||||
Stage C1 implemented the backend foundation for:
|
||||
|
||||
- explicit clusters
|
||||
- cluster memberships
|
||||
- node join tokens with hashed tokens only
|
||||
- node join requests
|
||||
- node identity/certificate metadata
|
||||
- node role assignments
|
||||
- node heartbeat/latest health
|
||||
- cluster audit events
|
||||
- safe migration/backfill of existing node data into a default cluster
|
||||
|
||||
The recommended next step is a decision among platform auth hardening for
|
||||
web-admin, node PKI hardening, node-agent runtime hardening, mesh runtime DP-0,
|
||||
or returning to paused RDP work.
|
||||
|
||||
Do not start all continuation paths at once.
|
||||
@@ -0,0 +1,13 @@
|
||||
FROM golang:1.23-bookworm AS build
|
||||
|
||||
WORKDIR /src
|
||||
COPY agents/rap-node-agent/go.mod ./
|
||||
RUN go mod download
|
||||
COPY agents/rap-node-agent/ ./
|
||||
RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -o /out/rap-node-agent ./cmd/rap-node-agent
|
||||
|
||||
FROM gcr.io/distroless/static-debian12:nonroot
|
||||
|
||||
COPY --from=build /out/rap-node-agent /usr/local/bin/rap-node-agent
|
||||
USER nonroot:nonroot
|
||||
ENTRYPOINT ["/usr/local/bin/rap-node-agent"]
|
||||
@@ -0,0 +1,487 @@
|
||||
# rap-node-agent
|
||||
|
||||
Native node agent MVP for the Secure Access Fabric.
|
||||
|
||||
Status: Stage C17Z18 synthetic route-health effective path boundary.
|
||||
|
||||
This agent is intentionally native. Containers may package service workloads,
|
||||
but the host-level node identity belongs to `rap-node-agent`.
|
||||
|
||||
## Current Scope
|
||||
|
||||
Implemented:
|
||||
|
||||
- config loading from flags/environment
|
||||
- local identity state file
|
||||
- enrollment request client
|
||||
- heartbeat client
|
||||
- capability/facts payload
|
||||
- status-only service reporting payload
|
||||
- mesh control-channel skeleton
|
||||
- route-health message skeleton
|
||||
- relay skeleton that refuses production payload forwarding
|
||||
- disabled-by-default synthetic mesh runtime for `fabric.probe` /
|
||||
`fabric.probe_ack`
|
||||
- direct and single-relay synthetic route tests
|
||||
- synthetic `fabric.route_health` / `fabric.route_health_ack`
|
||||
- local route success/failure observations
|
||||
- fallback route selection for test topology
|
||||
- route cache invalidation on version changes
|
||||
- synthetic relay envelope validation
|
||||
- per-channel bounded queues for synthetic traffic
|
||||
- QoS dequeue order: `fabric_control`, then `route_control`, then `telemetry`
|
||||
- telemetry-only stale message drop under backpressure
|
||||
- reliable fabric/control queue rejection when full
|
||||
- bounded non-production `synthetic.echo` test-service path
|
||||
- direct, single-relay, and forced-fallback test-service proofs
|
||||
- live HTTP peer transport for synthetic mesh envelopes
|
||||
- disabled-by-default synthetic mesh HTTP endpoint in `rap-node-agent`
|
||||
- `mesh-live-smoke` harness proving direct and single-relay synthetic traffic
|
||||
over real local HTTP endpoints
|
||||
- scoped synthetic mesh config file loading for peer endpoints and routes
|
||||
- Control Plane synthetic mesh config read fallback when no local scoped config
|
||||
file is set
|
||||
- synthetic route-health observations reported to the Control Plane when test
|
||||
flags allow synthetic links
|
||||
- explicit production mesh forwarding gate config; production forwarding still
|
||||
has no runtime implementation and remains unavailable
|
||||
- route-bound production mesh envelope contract and fail-closed validation on
|
||||
`/mesh/v1/forward`
|
||||
- metadata-only production envelope observation hook for valid envelopes, still
|
||||
without forwarding payloads
|
||||
- bounded metadata-only production envelope observation sink for accepted
|
||||
observations
|
||||
- disabled-by-default node-agent wiring for the bounded observation sink
|
||||
- local metrics for the bounded observation sink without exposing observation
|
||||
records
|
||||
- local node-agent logging for bounded observation sink metrics
|
||||
- change-driven suppression for unchanged bounded observation sink metrics logs
|
||||
- explicit local log distinction between production forwarding gate state and
|
||||
production forwarding runtime state
|
||||
- node-scoped rendezvous lease refresh through Control Plane synthetic config
|
||||
- stale relay withdrawal/reselection telemetry
|
||||
- relay replacement contract reporting for stale rendezvous relays
|
||||
- route/path decision contract reporting for control-plane route generations
|
||||
- route generation apply/withdraw tracking for control-plane path decisions
|
||||
- synthetic route-health route config refresh from Control Plane path
|
||||
decisions
|
||||
- route-health expected/observed effective path drift reporting
|
||||
- maximum capacity guard for the local production observation sink
|
||||
- panic-safe fail-closed production envelope observation wrapper
|
||||
- explicit `4096` byte payload boundary for validated production
|
||||
fabric-control envelopes
|
||||
- explicit future-skew boundary for validated production envelope `created_at`
|
||||
- scoped synthetic peer endpoint candidate config with reachability,
|
||||
NAT/connectivity hints, priority, policy tags, and metadata
|
||||
- deterministic local peer endpoint candidate scoring model for synthetic
|
||||
config candidates
|
||||
- optional local health observation overlay for endpoint candidate scoring
|
||||
- gate-controlled production `fabric.control` direct next-hop delivery
|
||||
- route-path-bound production `fabric.control` multi-hop forwarding
|
||||
- local metadata-only production `fabric.control` forwarding event logs
|
||||
- route-config-bound production `fabric.control` forwarding validation
|
||||
- scoped peer directory and bounded recovery seed config parsing/validation
|
||||
- node-local peer cache with bounded warm peer health probes
|
||||
- advertised mesh endpoint reporting through heartbeat metadata
|
||||
- multiple advertised endpoint candidates, including private/corporate LAN endpoints
|
||||
- peer connection state machine for warm-peer health
|
||||
- bounded peer recovery planner over peer cache and connection states
|
||||
- peer connection intent planner with transport readiness classification
|
||||
- peer connection manager for real control-plane health over reusable
|
||||
HTTP keep-alive transport
|
||||
- route-health effective-path runtime through replacement relay control paths
|
||||
|
||||
Not implemented yet:
|
||||
|
||||
- mesh packet routing
|
||||
- production mesh service traffic
|
||||
- VPN runtime
|
||||
- production workload supervision
|
||||
- certificate issuance/rotation
|
||||
- updater runtime
|
||||
- privileged host route/firewall control
|
||||
|
||||
## Build
|
||||
|
||||
```powershell
|
||||
cd agents\rap-node-agent
|
||||
go test ./...
|
||||
go build -o bin\rap-node-agent.exe .\cmd\rap-node-agent
|
||||
go build -o bin\mesh-live-smoke.exe .\cmd\mesh-live-smoke
|
||||
```
|
||||
|
||||
## First Enrollment
|
||||
|
||||
Create a join token from the platform control plane, then run:
|
||||
|
||||
```powershell
|
||||
.\bin\rap-node-agent.exe `
|
||||
-backend-url http://192.168.200.61:8080/api/v1 `
|
||||
-cluster-id <cluster_id> `
|
||||
-join-token <raw_join_token> `
|
||||
-node-name test-node-1 `
|
||||
-state-dir C:\ProgramData\RapNodeAgent
|
||||
```
|
||||
|
||||
The agent submits a pending join request and exits. It does not self-activate.
|
||||
A platform admin must approve the join request.
|
||||
|
||||
## Enrollment Approval
|
||||
|
||||
When the agent enrolls, it stores the returned `pending_join_request_id` and
|
||||
polls the Control Plane bootstrap endpoint until the platform owner approves
|
||||
the request or the enrollment timeout expires. After approval, the agent
|
||||
verifies the signed bootstrap contract and writes the approved `node_id`,
|
||||
`cluster_id`, `identity_status=active`, `cluster_authority_public_key`, and
|
||||
`cluster_authority_fingerprint` into `identity.json`.
|
||||
|
||||
Future C3 hardening can add signed node certificates and automatic secure
|
||||
certificate material exchange.
|
||||
|
||||
After the join request has been approved, run the agent again:
|
||||
|
||||
```powershell
|
||||
.\bin\rap-node-agent.exe `
|
||||
-backend-url http://192.168.200.61:8080/api/v1 `
|
||||
-state-dir C:\ProgramData\RapNodeAgent
|
||||
```
|
||||
|
||||
It sends periodic heartbeats to:
|
||||
|
||||
```text
|
||||
/api/v1/clusters/{clusterID}/nodes/{nodeID}/heartbeats
|
||||
```
|
||||
|
||||
## Environment Variables
|
||||
|
||||
- `RAP_BACKEND_URL`
|
||||
- `RAP_CLUSTER_ID`
|
||||
- `RAP_CLUSTER_AUTHORITY_PUBLIC_KEY`
|
||||
- `RAP_CLUSTER_AUTHORITY_FINGERPRINT`
|
||||
- `RAP_JOIN_TOKEN`
|
||||
- `RAP_NODE_NAME`
|
||||
- `RAP_NODE_STATE_DIR`
|
||||
- `RAP_WORKLOAD_SUPERVISION_ENABLED`
|
||||
- `RAP_HEARTBEAT_INTERVAL_SECONDS`
|
||||
- `RAP_ENROLLMENT_POLL_INTERVAL_SECONDS`
|
||||
- `RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS`
|
||||
- `RAP_MESH_SYNTHETIC_RUNTIME_ENABLED`
|
||||
- `RAP_MESH_LISTEN_ADDR`
|
||||
- `RAP_MESH_ADVERTISE_ENDPOINT`
|
||||
- `RAP_MESH_ADVERTISE_ENDPOINTS_JSON`
|
||||
- `RAP_MESH_ADVERTISE_TRANSPORT`
|
||||
- `RAP_MESH_CONNECTIVITY_MODE`
|
||||
- `RAP_MESH_NAT_TYPE`
|
||||
- `RAP_MESH_REGION`
|
||||
- `RAP_MESH_SYNTHETIC_CONFIG`
|
||||
- `RAP_MESH_PEER_ENDPOINTS_JSON`
|
||||
- `RAP_MESH_SYNTHETIC_ROUTES_JSON`
|
||||
- `RAP_MESH_PRODUCTION_FORWARDING_ENABLED`
|
||||
- `RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY`
|
||||
|
||||
`RAP_MESH_SYNTHETIC_RUNTIME_ENABLED` defaults to `false`. It gates only the
|
||||
C17A/C17B/C17C/C17D/C17E synthetic probe, route-health, relay scheduling,
|
||||
bounded `synthetic.echo` test-service runtime, and live synthetic HTTP endpoint.
|
||||
It must not be used for RDP, VPN, file, video, or other production service
|
||||
traffic.
|
||||
|
||||
`RAP_WORKLOAD_SUPERVISION_ENABLED` defaults to `false`. While service runtime
|
||||
supervision is still a stub, the agent does not poll desired workloads or report
|
||||
workload status unless this flag is explicitly enabled.
|
||||
|
||||
`RAP_MESH_LISTEN_ADDR` starts the C17E/C17F/C17G synthetic HTTP endpoint only when
|
||||
`RAP_MESH_SYNTHETIC_RUNTIME_ENABLED=true`. `RAP_MESH_SYNTHETIC_CONFIG` points to
|
||||
a scoped synthetic mesh config snapshot and is preferred over debug JSON.
|
||||
`RAP_MESH_PEER_ENDPOINTS_JSON` is a JSON object mapping peer node IDs to
|
||||
endpoint URLs. `RAP_MESH_SYNTHETIC_ROUTES_JSON` is a JSON array of synthetic
|
||||
route objects. If no local scoped config file is set, the agent asks the
|
||||
Control Plane for:
|
||||
|
||||
```text
|
||||
/clusters/{clusterID}/nodes/{nodeID}/mesh/synthetic-config
|
||||
```
|
||||
|
||||
The JSON variables are debug fallback only.
|
||||
|
||||
Control Plane synthetic config with `authority_required=true` must include a
|
||||
signed `authority_payload` / `authority_signature` envelope and a
|
||||
`cluster_authority` descriptor. The agent verifies the signature, validates the
|
||||
config hash, and rejects mismatched pinned authority values when
|
||||
`RAP_CLUSTER_AUTHORITY_PUBLIC_KEY`, `RAP_CLUSTER_AUTHORITY_FINGERPRINT`, or the
|
||||
same fields in `identity.json` are set.
|
||||
|
||||
`RAP_MESH_PRODUCTION_FORWARDING_ENABLED` defaults to `false`. Before C17Z it was
|
||||
a production-forwarding gate only: turning it on did not enable production mesh
|
||||
payload forwarding, and `/mesh/v1/forward` returned an unavailable runtime
|
||||
response after validating the route-bound production envelope contract. From
|
||||
C17Z onward, enabling it turns on the narrow `fabric.control`-only forwarding
|
||||
runtime described below; service payload forwarding remains unavailable.
|
||||
|
||||
The production envelope contract requires route, hop, TTL, expiry, payload
|
||||
length, and SHA-256 payload hash fields. C17J accepts only the
|
||||
`fabric_control` channel class and `fabric.control` message type for
|
||||
validation. RDP, VPN, render, file, video, and service workload channels are
|
||||
rejected.
|
||||
|
||||
C17K adds a local metadata-only observation hook after successful production
|
||||
envelope validation. Observations include route/message/hop/channel metadata and
|
||||
payload length/hash, not the payload body. Observation failure fails closed, and
|
||||
the endpoint still does not forward payloads.
|
||||
|
||||
C17L adds a bounded in-memory observation sink for accepted metadata-only
|
||||
observations. The sink drops the oldest observation when full and still stores
|
||||
no payload bodies.
|
||||
|
||||
`RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY` defaults to `0`. When set above
|
||||
zero, C17M wires the bounded metadata-only sink into the node-agent mesh server.
|
||||
This remains local-only, exposes no read API, stores no payload bodies, and
|
||||
does not enable production forwarding. C17R rejects values above `10000`.
|
||||
|
||||
C17N adds local sink metrics: configured capacity, current depth, accepted
|
||||
total, and dropped-oldest total. Metrics do not expose observation records,
|
||||
route IDs, message IDs, hashes, payload metadata, or payload bodies.
|
||||
|
||||
C17O logs those aggregate metrics locally from the node-agent loop when the
|
||||
sink is explicitly enabled. This does not add a read API or Control Plane
|
||||
reporting.
|
||||
|
||||
C17P logs aggregate sink metrics only when they change, so steady heartbeat
|
||||
loops do not repeat identical local metrics lines.
|
||||
|
||||
C17Q logs `production_forwarding_gate_enabled` separately from
|
||||
`production_forwarding_runtime_enabled`. The runtime field remains `false`;
|
||||
turning on the gate still does not enable production forwarding.
|
||||
|
||||
C17S makes production envelope observation panic-safe. Observer errors and
|
||||
observer panics both fail closed as observation failure; forwarding remains
|
||||
unavailable.
|
||||
|
||||
C17T limits validated production `fabric.control` envelope payloads to 4096
|
||||
bytes. Oversized envelopes are rejected before observation.
|
||||
|
||||
C17U rejects production `fabric.control` envelopes whose `created_at` is more
|
||||
than one minute in the future.
|
||||
|
||||
C17V adds scoped peer endpoint candidates to synthetic mesh config. Candidate
|
||||
entries describe possible per-node endpoints with transport, address,
|
||||
reachability, NAT type, connectivity mode, priority, policy tags, verification
|
||||
time, and metadata. They are model/config hints only; no production route
|
||||
scoring, NAT traversal, shortcut routing, or forwarding runtime is implemented.
|
||||
|
||||
C17W adds deterministic local scoring for scoped endpoint candidates. Scoring
|
||||
uses transport, reachability, connectivity mode, NAT type, priority, preferred
|
||||
region, policy tags, channel class, and verification age. It returns ranked
|
||||
candidates and reason labels only; it does not select production routes, open
|
||||
connections, perform NAT traversal, or forward payloads.
|
||||
|
||||
C17X extends candidate scoring with optional local health observations keyed by
|
||||
`endpoint_id`. Observations can contribute latency, success/failure history,
|
||||
recent failure reason, reliability score, and freshness/staleness signals.
|
||||
The score remains advisory only and is not wired into production forwarding.
|
||||
|
||||
C17Z adds the first narrow production forwarding runtime. When
|
||||
`RAP_MESH_PRODUCTION_FORWARDING_ENABLED=true`, `/mesh/v1/forward` can deliver
|
||||
route-bound `fabric.control` envelopes at the local destination or forward them
|
||||
to a direct next hop from explicit peer endpoint config. Service channels,
|
||||
RDP/VPN/file/video payloads, arbitrary relay forwarding, and multi-hop
|
||||
production route execution remain unavailable.
|
||||
|
||||
C17Z1 adds route-path-bound multi-hop forwarding for production
|
||||
`fabric.control` only. Envelopes may carry `route_path` and
|
||||
`visited_node_ids`; each relay validates its path position, forwards only to
|
||||
the next route-path node, updates TTL/hop/visited metadata, and rejects loops.
|
||||
Service payloads remain unavailable.
|
||||
|
||||
C17Z2 emits local `mesh_production_forward_event` logs for production
|
||||
`fabric.control` forwarding outcomes: accepted, forwarded, delivered, and
|
||||
rejected. Logs include route/message/hop/channel/status/reason/TTL/hop count/
|
||||
route path length/visited count/payload length metadata only. Payload bodies
|
||||
are not logged, no observation read API is added, and service payloads remain
|
||||
unavailable.
|
||||
|
||||
C17Z3 binds production `fabric.control` forwarding to loaded scoped or
|
||||
Control Plane route config when routes are available locally. Configured
|
||||
envelopes must match `route_id`, cluster, source, destination, route path,
|
||||
next hop, allowed channel, expiry, max TTL, and max hop count before
|
||||
forwarding. If no route config is present, existing C17Z1 behavior is
|
||||
preserved. Service payloads remain unavailable.
|
||||
|
||||
C17Z4 adds scoped peer directory and recovery seed config. `peer_directory`
|
||||
describes only peers needed by the node-scoped mesh config. `recovery_seeds`
|
||||
is an explicit, bounded bootstrap/recovery list and is not a full cluster node
|
||||
list. The node-agent parses and validates these fields, but does not yet
|
||||
implement a persistent connection manager, NAT traversal, or
|
||||
relay/rendezvous runtime.
|
||||
|
||||
C17Z5 turns scoped peer directory and recovery seed config into node-local
|
||||
runtime `PeerCache` state. The cache builds a bounded warm peer set from
|
||||
route-adjacent peers, recovery seeds, peer endpoints, and endpoint candidates.
|
||||
When synthetic mesh testing is enabled, the node-agent probes warm peers with
|
||||
`/mesh/v1/health` and reports metadata-only mesh-link observations. This is not
|
||||
a persistent connection manager and does not forward service payloads.
|
||||
|
||||
C17Z6 adds advertised mesh endpoint reporting. When
|
||||
`RAP_MESH_ADVERTISE_ENDPOINT` is set, node-agent includes a
|
||||
`mesh_endpoint_report` in heartbeat metadata with transport, connectivity mode,
|
||||
NAT hint, region, observed time, and endpoint candidate metadata. Control Plane
|
||||
can project the latest reported endpoint into node-scoped synthetic mesh config
|
||||
for route-path peers. This does not perform automatic public IP discovery,
|
||||
STUN/TURN/ICE NAT classification, or service payload forwarding.
|
||||
|
||||
C17Z7 adds `RAP_MESH_ADVERTISE_ENDPOINTS_JSON` for multiple advertised
|
||||
endpoints per node. Candidates can describe public, private, corporate/LAN,
|
||||
outbound, or relay-style addresses. Endpoint scoring rewards `private-lan`,
|
||||
`corp-lan`, and `same-site` policy tags, and peer cache can use the best
|
||||
candidate address for warm-peer health probes. This supports corporate-network
|
||||
cluster segments without enabling service payload forwarding.
|
||||
|
||||
C17Z8 adds a node-local peer connection state machine on top of warm-peer
|
||||
health probes. Warm peers move through `disconnected`, `connecting`, `ready`,
|
||||
`degraded`, and `backoff`; repeated probe failures enter bounded backoff, and
|
||||
successful probes recover to `ready`. Mesh-link observations include
|
||||
metadata-only connection state. This is not a persistent socket/session manager
|
||||
and does not forward service payloads.
|
||||
|
||||
C17Z9 adds a node-local peer recovery planner. The node targets a bounded
|
||||
stable ready-peer set, defaulting to three connectable peers when available,
|
||||
instead of probing every known cluster node. When ready peers fall below target,
|
||||
the planner selects bounded recovery probes from warm peers, recovery seeds,
|
||||
and other connectable scoped peers, skipping active backoff entries. Heartbeats
|
||||
include metadata-only `mesh_peer_recovery_report` state. This is not persistent
|
||||
connection transport, NAT traversal, relay/rendezvous runtime, or service
|
||||
payload forwarding.
|
||||
|
||||
C17Z10 adds a node-local peer connection intent planner over the C17Z9 recovery
|
||||
plan. It classifies bounded peer work as `maintain`, `probe`, or `recover`,
|
||||
and classifies transport readiness as `direct`, `private_lan`,
|
||||
`corporate_lan`, `outbound_only`, or `relay_required`. Heartbeats include
|
||||
metadata-only `mesh_peer_connection_intent_report` counts. This is not
|
||||
persistent connection transport, STUN/TURN/ICE, NAT traversal, relay runtime,
|
||||
or service payload forwarding.
|
||||
|
||||
C17Z11 adds the first real node-local peer connection manager for mesh
|
||||
control-plane health. It uses a reusable HTTP keep-alive client to probe
|
||||
direct/private/corporate peer endpoints selected by C17Z10 intents, updates
|
||||
the shared peer connection tracker, and records `waiting_rendezvous` for
|
||||
outbound-only or relay-required peers. Heartbeats include metadata-only
|
||||
`mesh_peer_connection_manager_report` state. This is not STUN/TURN/ICE,
|
||||
relay/rendezvous runtime, route lease generation, VPN runtime, or service
|
||||
payload forwarding.
|
||||
|
||||
C17Z12 adds a node-scoped rendezvous/relay control-plane lease contract for
|
||||
peers that would otherwise remain `waiting_rendezvous`. The agent consumes
|
||||
`rendezvous_leases`, resolves matching intents into `relay_control`, probes the
|
||||
relay node `/mesh/v1/health`, and records `relay_ready` for the peer control
|
||||
path. This remains control-plane health only and does not enable RDP/VPN/file/
|
||||
video/service payload forwarding, arbitrary relay packet forwarding,
|
||||
STUN/TURN/ICE, or host networking changes.
|
||||
|
||||
C17Z13 adds heartbeat telemetry for rendezvous lease admission and renewal
|
||||
posture. The agent emits `mesh_rendezvous_lease_report` with local role,
|
||||
relay/peer admission counts, TTL, renewal-after time, renewal-needed status,
|
||||
`relay_ready`, and explicit no-payload boundary flags. This remains
|
||||
metadata-only control-plane telemetry and does not enable service payload
|
||||
forwarding.
|
||||
|
||||
C17Z14 adds a control-plane refresh contract for rendezvous leases. When a
|
||||
lease is renewal-needed, expired, invalid, or tied to a stale relay state, the
|
||||
agent reloads node-scoped synthetic config from Control Plane, updates the
|
||||
running peer cache/route/lease state, and reports refresh counters plus stale
|
||||
relay withdrawal/reselection fields. This remains control-plane health only
|
||||
and does not enable service payload forwarding.
|
||||
|
||||
C17Z15 adds the node side of backend relay replacement policy. The agent
|
||||
advertises the relay replacement contract capability and emits
|
||||
`c17z15.mesh_rendezvous_lease_report.v1`; stale relay state is matched to the
|
||||
exact rendezvous lease/relay when that metadata is present, so an alternate
|
||||
replacement lease for the same peer is not treated as stale by association.
|
||||
This remains control-plane health only and does not enable service payload
|
||||
forwarding.
|
||||
|
||||
C17Z16 adds route/path decision reporting. The agent consumes
|
||||
`route_path_decisions` from Control Plane synthetic config, keeps the latest
|
||||
control-plane generation in local state, and emits
|
||||
`c17z18.mesh_route_path_decision_report.v1` with effective hops, previous/next
|
||||
hop, selected replacement relay, generation, and no-payload boundary flags.
|
||||
This remains metadata-only route planning and does not enable service payload
|
||||
forwarding.
|
||||
|
||||
C17Z17 adds node-side route generation tracking for Control Plane
|
||||
`route_path_decisions`. The agent emits
|
||||
`c17z18.mesh_route_generation_report.v1` with active, applied, unchanged, and
|
||||
withdrawn decision counts, total counters, generation change state, active
|
||||
decision details, and withdrawn decision details. When the first observed
|
||||
config already contains a stale relay replacement, the tracker emits a
|
||||
`withdrawn_by_replacement` record for the old relay path. This remains
|
||||
metadata-only route planning and does not enable service payload forwarding.
|
||||
|
||||
C17Z18 applies Control Plane `route_path_decisions` to synthetic route-health
|
||||
route config only. The agent keeps base routes separate from route-health
|
||||
routes, periodically refreshes scoped config, emits
|
||||
`c17z18.mesh_route_health_config_report.v1`, and reports route-health
|
||||
observations with expected/observed hops and drift status. This probes
|
||||
replacement relay effective paths for control-plane health only and does not
|
||||
enable service payload forwarding.
|
||||
|
||||
Scoped synthetic config shape:
|
||||
|
||||
```json
|
||||
{
|
||||
"schema_version": "c17z18.synthetic.v1",
|
||||
"cluster_id": "cluster-1",
|
||||
"local_node_id": "node-a",
|
||||
"config_version": "config-v1",
|
||||
"peer_directory_version": "peers-v1",
|
||||
"policy_version": "policy-v1",
|
||||
"peer_endpoints": {
|
||||
"node-b": "http://127.0.0.1:19002"
|
||||
},
|
||||
"peer_endpoint_candidates": {
|
||||
"node-b": [
|
||||
{
|
||||
"endpoint_id": "node-b-public",
|
||||
"node_id": "node-b",
|
||||
"transport": "direct_tcp_tls",
|
||||
"address": "203.0.113.20:443",
|
||||
"reachability": "public",
|
||||
"nat_type": "restricted",
|
||||
"connectivity_mode": "direct",
|
||||
"priority": 10
|
||||
}
|
||||
]
|
||||
},
|
||||
"routes": [],
|
||||
"route_path_decisions": {
|
||||
"schema_version": "c17z18.route_path_decisions.v1",
|
||||
"decisions": []
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## C17E Live Synthetic Smoke
|
||||
|
||||
Run:
|
||||
|
||||
```powershell
|
||||
cd agents\rap-node-agent
|
||||
go run .\cmd\mesh-live-smoke
|
||||
```
|
||||
|
||||
Expected:
|
||||
|
||||
- scoped synthetic config loads
|
||||
- direct `node-a -> node-b` synthetic probe succeeds
|
||||
- relay `node-a -> node-r -> node-b` synthetic probe succeeds
|
||||
- bounded `synthetic.echo` test-service succeeds
|
||||
- `production_forwarding=false`
|
||||
|
||||
## Safety Rules
|
||||
|
||||
- The agent never assigns roles to itself.
|
||||
- The agent reports capabilities only.
|
||||
- Platform policy assigns roles.
|
||||
- No RDP/VPN/production service traffic is carried by the C17A-C17Z18 staged
|
||||
mesh runtime.
|
||||
- Production forwarding remains disabled by default and limited to
|
||||
`fabric.control` when explicitly enabled.
|
||||
- No privileged operations are performed by the current agent.
|
||||
@@ -0,0 +1,186 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh"
|
||||
)
|
||||
|
||||
type smokeNode struct {
|
||||
Local mesh.PeerIdentity
|
||||
Runtime *mesh.SyntheticRuntime
|
||||
URL string
|
||||
server *httptest.Server
|
||||
}
|
||||
|
||||
// smokeReport is the JSON document printed by main after a successful smoke
// run. It records which checks were accepted, the node paths reported by the
// probe acknowledgements, the echoed test-service payload, and the endpoint
// URL of every participating node.
type smokeReport struct {
	Stage                  string         `json:"stage"`
	ProductionForwarding   bool           `json:"production_forwarding"`
	ScopedConfigLoaded     bool           `json:"scoped_config_loaded"`
	DirectProbeAccepted    bool           `json:"direct_probe_accepted"`
	DirectPath             []string       `json:"direct_path"`
	RelayProbeAccepted     bool           `json:"relay_probe_accepted"`
	RelayPath              []string       `json:"relay_path"`
	TestServiceAccepted    bool           `json:"test_service_accepted"`
	TestServiceEchoPayload string         `json:"test_service_echo_payload"`
	PeerEndpoints          map[string]any `json:"peer_endpoints"`
}
|
||||
|
||||
func main() {
|
||||
report, err := run(context.Background())
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "mesh live smoke failed: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
payload, err := json.MarshalIndent(report, "", " ")
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "marshal report: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
fmt.Println(string(payload))
|
||||
}
|
||||
|
||||
func run(ctx context.Context) (smokeReport, error) {
|
||||
nodeA := newSmokeNode(mesh.PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
|
||||
defer nodeA.Close()
|
||||
nodeR := newSmokeNode(mesh.PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"})
|
||||
defer nodeR.Close()
|
||||
nodeB := newSmokeNode(mesh.PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"})
|
||||
defer nodeB.Close()
|
||||
|
||||
directRoute := smokeRoute("route-direct", []string{"node-a", "node-b"})
|
||||
relayRoute := smokeRoute("route-relay", []string{"node-a", "node-r", "node-b"})
|
||||
routes := []mesh.SyntheticRoute{directRoute, relayRoute}
|
||||
nodeAConfigPath, err := writeSmokeScopedConfig(nodeA.Local, map[string]string{
|
||||
"node-r": nodeR.URL,
|
||||
"node-b": nodeB.URL,
|
||||
}, routes)
|
||||
if err != nil {
|
||||
return smokeReport{}, err
|
||||
}
|
||||
nodeAConfig, err := mesh.LoadScopedSyntheticConfig(nodeAConfigPath, nodeA.Local)
|
||||
if err != nil {
|
||||
return smokeReport{}, fmt.Errorf("load node-a scoped config: %w", err)
|
||||
}
|
||||
|
||||
nodeA.Runtime = smokeRuntime(nodeA.Local, nodeAConfig.Routes, nodeAConfig.PeerEndpoints)
|
||||
nodeR.Runtime = smokeRuntime(nodeR.Local, routes, map[string]string{
|
||||
"node-b": nodeB.URL,
|
||||
})
|
||||
nodeB.Runtime = smokeRuntime(nodeB.Local, routes, map[string]string{})
|
||||
|
||||
directAck, err := nodeA.Runtime.SendProbe(ctx, directRoute.RouteID, mesh.SyntheticChannelFabricControl, "smoke-direct")
|
||||
if err != nil {
|
||||
return smokeReport{}, fmt.Errorf("direct probe: %w", err)
|
||||
}
|
||||
relayAck, err := nodeA.Runtime.SendProbe(ctx, relayRoute.RouteID, mesh.SyntheticChannelFabricControl, "smoke-relay")
|
||||
if err != nil {
|
||||
return smokeReport{}, fmt.Errorf("relay probe: %w", err)
|
||||
}
|
||||
testService, err := nodeA.Runtime.SendTestService(ctx, relayRoute.RouteID, mesh.SyntheticChannelRouteControl, mesh.SyntheticTestServiceRequest{
|
||||
RequestID: "smoke-test-service",
|
||||
OrganizationID: mesh.SyntheticDefaultTestOrganizationID,
|
||||
ServiceType: mesh.SyntheticTestServiceType,
|
||||
Payload: "hello-c17e",
|
||||
SentAt: time.Now().UTC(),
|
||||
})
|
||||
if err != nil {
|
||||
return smokeReport{}, fmt.Errorf("test service: %w", err)
|
||||
}
|
||||
|
||||
return smokeReport{
|
||||
Stage: "C17F scoped synthetic config plus live HTTP transport",
|
||||
ProductionForwarding: false,
|
||||
ScopedConfigLoaded: nodeAConfig.ConfigVersion == "smoke-config-v1",
|
||||
DirectProbeAccepted: directAck.MessageType == mesh.SyntheticMessageProbeAck,
|
||||
DirectPath: decodeProbePath(directAck),
|
||||
RelayProbeAccepted: relayAck.MessageType == mesh.SyntheticMessageProbeAck,
|
||||
RelayPath: decodeProbePath(relayAck),
|
||||
TestServiceAccepted: testService.Ack.MessageType == mesh.SyntheticMessageTestServiceAck,
|
||||
TestServiceEchoPayload: testService.Response.EchoPayload,
|
||||
PeerEndpoints: map[string]any{
|
||||
"node-a": nodeA.URL,
|
||||
"node-r": nodeR.URL,
|
||||
"node-b": nodeB.URL,
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
func writeSmokeScopedConfig(local mesh.PeerIdentity, peers map[string]string, routes []mesh.SyntheticRoute) (string, error) {
|
||||
path := filepath.Join(os.TempDir(), "rap-c17e-node-a-scoped-mesh.json")
|
||||
payload, err := json.Marshal(mesh.ScopedSyntheticConfig{
|
||||
SchemaVersion: "c17f.synthetic.v1",
|
||||
ClusterID: local.ClusterID,
|
||||
LocalNodeID: local.NodeID,
|
||||
ConfigVersion: "smoke-config-v1",
|
||||
PeerDirectoryVersion: "smoke-peers-v1",
|
||||
PolicyVersion: "smoke-policy-v1",
|
||||
PeerEndpoints: peers,
|
||||
Routes: routes,
|
||||
})
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if err := os.WriteFile(path, payload, 0o600); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return path, nil
|
||||
}
|
||||
|
||||
func newSmokeNode(local mesh.PeerIdentity) *smokeNode {
|
||||
node := &smokeNode{Local: local}
|
||||
node.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
mesh.Server{Local: node.Local, SyntheticRuntime: node.Runtime}.Handler().ServeHTTP(w, r)
|
||||
}))
|
||||
node.URL = node.server.URL
|
||||
return node
|
||||
}
|
||||
|
||||
func (n *smokeNode) Close() {
|
||||
if n.server != nil {
|
||||
n.server.Close()
|
||||
}
|
||||
}
|
||||
|
||||
func smokeRuntime(local mesh.PeerIdentity, routes []mesh.SyntheticRoute, peers map[string]string) *mesh.SyntheticRuntime {
|
||||
return mesh.NewSyntheticRuntime(mesh.SyntheticRuntimeConfig{
|
||||
Enabled: true,
|
||||
Local: local,
|
||||
Routes: routes,
|
||||
AllowedChannels: []string{
|
||||
mesh.SyntheticChannelFabricControl,
|
||||
mesh.SyntheticChannelRouteControl,
|
||||
},
|
||||
Transport: mesh.NewHTTPPeerTransport(peers),
|
||||
})
|
||||
}
|
||||
|
||||
func smokeRoute(routeID string, hops []string) mesh.SyntheticRoute {
|
||||
return mesh.SyntheticRoute{
|
||||
RouteID: routeID,
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: hops[0],
|
||||
DestinationNodeID: hops[len(hops)-1],
|
||||
Hops: hops,
|
||||
AllowedChannels: []string{mesh.SyntheticChannelFabricControl, mesh.SyntheticChannelRouteControl},
|
||||
MaxTTL: 8,
|
||||
MaxHops: 8,
|
||||
ExpiresAt: time.Now().UTC().Add(time.Hour),
|
||||
RouteVersion: "route-v1",
|
||||
PolicyVersion: "policy-v1",
|
||||
PeerDirectoryVersion: "peers-v1",
|
||||
}
|
||||
}
|
||||
|
||||
func decodeProbePath(envelope mesh.SyntheticEnvelope) []string {
|
||||
var payload mesh.SyntheticProbeAckPayload
|
||||
_ = json.Unmarshal(envelope.Payload, &payload)
|
||||
return payload.Path
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,3 @@
|
||||
module github.com/example/remote-access-platform/agents/rap-node-agent
|
||||
|
||||
go 1.23.2
|
||||
@@ -0,0 +1,70 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
|
||||
)
|
||||
|
||||
// Version is the node-agent version string. It is reported in the enrollment
// capabilities and facts, in heartbeats, and in telemetry payloads.
const Version = "0.1.0-c3"
|
||||
|
||||
func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) client.EnrollRequest {
|
||||
return client.EnrollRequest{
|
||||
ClusterID: clusterID,
|
||||
JoinToken: joinToken,
|
||||
NodeName: identity.NodeName,
|
||||
NodeFingerprint: identity.NodeFingerprint,
|
||||
PublicKey: identity.PublicKey,
|
||||
ReportedCapabilities: map[string]any{
|
||||
"can_accept_client_ingress": false,
|
||||
"can_accept_node_ingress": false,
|
||||
"can_route_mesh": false,
|
||||
"can_run_rdp_worker": true,
|
||||
"can_run_vnc_worker": false,
|
||||
"can_run_vpn_exit": false,
|
||||
"can_run_vpn_connector": false,
|
||||
"can_run_file_cache": false,
|
||||
"can_run_update_cache": false,
|
||||
"can_run_video_relay": false,
|
||||
"native_node_agent_version": Version,
|
||||
"service_supervision_enabled": false,
|
||||
},
|
||||
ReportedFacts: map[string]any{
|
||||
"os": runtime.GOOS,
|
||||
"arch": runtime.GOARCH,
|
||||
"agent": "rap-node-agent",
|
||||
"agent_ver": Version,
|
||||
},
|
||||
RequestedRoles: []string{},
|
||||
}
|
||||
}
|
||||
|
||||
func HeartbeatPayload() client.HeartbeatRequest {
|
||||
return client.HeartbeatRequest{
|
||||
HealthStatus: "healthy",
|
||||
ReportedVersion: Version,
|
||||
Capabilities: map[string]any{
|
||||
"native_node_agent": true,
|
||||
},
|
||||
ServiceStates: map[string]any{
|
||||
"workload_supervision": "not_implemented_c3",
|
||||
},
|
||||
Metadata: map[string]any{
|
||||
"stage": "c3",
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func MeshSelfObservationPayload(identity state.Identity) client.MeshLinkObservationRequest {
|
||||
return client.MeshLinkObservationRequest{
|
||||
SourceNodeID: identity.NodeID,
|
||||
TargetNodeID: identity.NodeID,
|
||||
LinkStatus: "reachable",
|
||||
Metadata: map[string]any{
|
||||
"stage": "c6",
|
||||
"traffic_forwarding": false,
|
||||
"observation_type": "self",
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,44 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
|
||||
)
|
||||
|
||||
func TestEnrollmentPayloadDoesNotRequestRolesByDefault(t *testing.T) {
|
||||
payload := EnrollmentPayload("cluster-1", "join-token", state.Identity{
|
||||
NodeName: "node-a",
|
||||
NodeFingerprint: "fp",
|
||||
PublicKey: "pub",
|
||||
})
|
||||
if payload.ClusterID != "cluster-1" || payload.JoinToken != "join-token" {
|
||||
t.Fatalf("unexpected enrollment payload: %+v", payload)
|
||||
}
|
||||
if len(payload.RequestedRoles) != 0 {
|
||||
t.Fatalf("agent must not self-assign roles: %+v", payload.RequestedRoles)
|
||||
}
|
||||
if payload.ReportedCapabilities["can_run_rdp_worker"] != true {
|
||||
t.Fatalf("expected rdp capability in MVP payload: %+v", payload.ReportedCapabilities)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHeartbeatPayloadIsStatusOnly(t *testing.T) {
|
||||
payload := HeartbeatPayload()
|
||||
if payload.HealthStatus != "healthy" {
|
||||
t.Fatalf("HealthStatus = %q", payload.HealthStatus)
|
||||
}
|
||||
if payload.ServiceStates["workload_supervision"] == "running" {
|
||||
t.Fatal("C3 must not pretend workload supervision is implemented")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshSelfObservationDoesNotEnableTrafficForwarding(t *testing.T) {
|
||||
payload := MeshSelfObservationPayload(state.Identity{NodeID: "node-1"})
|
||||
if payload.SourceNodeID != "node-1" || payload.TargetNodeID != "node-1" {
|
||||
t.Fatalf("unexpected mesh self observation payload: %+v", payload)
|
||||
}
|
||||
if payload.Metadata["traffic_forwarding"] != false {
|
||||
t.Fatalf("traffic forwarding must stay disabled in C6: %+v", payload.Metadata)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,33 @@
|
||||
package agent
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
|
||||
)
|
||||
|
||||
func TelemetryPayload(identity state.Identity, startedAt time.Time) client.TelemetryRequest {
|
||||
var mem runtime.MemStats
|
||||
runtime.ReadMemStats(&mem)
|
||||
used := int64(mem.Alloc)
|
||||
total := int64(mem.Sys)
|
||||
processCount := runtime.NumGoroutine()
|
||||
return client.TelemetryRequest{
|
||||
MemoryUsedBytes: &used,
|
||||
MemoryTotalBytes: &total,
|
||||
ProcessCount: &processCount,
|
||||
Payload: map[string]any{
|
||||
"agent": "rap-node-agent",
|
||||
"agent_version": Version,
|
||||
"node_name": identity.NodeName,
|
||||
"os": runtime.GOOS,
|
||||
"arch": runtime.GOARCH,
|
||||
"goroutines": runtime.NumGoroutine(),
|
||||
"uptime_seconds": int64(time.Since(startedAt).Seconds()),
|
||||
"telemetry_source": "testing_flag",
|
||||
},
|
||||
ObservedAt: time.Now().UTC(),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,110 @@
|
||||
package authority
|
||||
|
||||
import (
|
||||
"crypto/ed25519"
|
||||
"crypto/sha256"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Identifiers used by the cluster authority signature envelope.
const (
	// AuthoritySchemaVersion is the schema identifier for cluster authority
	// descriptors.
	AuthoritySchemaVersion = "rap.cluster_authority.v1"
	// SignatureSchemaVersion is the only Signature.SchemaVersion that
	// VerifyRaw accepts.
	SignatureSchemaVersion = "rap.cluster_authority.signature.v1"
	// AlgorithmEd25519 is the only Signature.Algorithm that VerifyRaw accepts.
	AlgorithmEd25519 = "ed25519"
)
|
||||
|
||||
// Sentinel errors returned, usually wrapped with detail, by this package.
// Match them with errors.Is.
var (
	// ErrInvalidKey reports a public key that is not base64 or has the wrong
	// length.
	ErrInvalidKey = errors.New("invalid cluster authority key")
	// ErrInvalidSignature reports a signature envelope that is malformed or
	// does not verify.
	ErrInvalidSignature = errors.New("invalid cluster authority signature")
	// ErrInvalidPayload reports a payload that is empty or is not valid JSON.
	ErrInvalidPayload = errors.New("invalid cluster authority payload")
)
|
||||
|
||||
// Signature is the detached signature envelope that accompanies a signed
// cluster authority payload.
type Signature struct {
	// SchemaVersion must equal SignatureSchemaVersion to pass VerifyRaw.
	SchemaVersion string `json:"schema_version"`
	// Algorithm must equal AlgorithmEd25519 to pass VerifyRaw.
	Algorithm string `json:"algorithm"`
	// KeyFingerprint must equal Fingerprint of the verifying public key.
	KeyFingerprint string `json:"key_fingerprint"`
	// Signature is the base64 Ed25519 signature over the canonical payload.
	Signature string `json:"signature"`
}
|
||||
|
||||
func VerifyRaw(publicKeyB64 string, payload json.RawMessage, signature Signature) error {
|
||||
if signature.SchemaVersion != SignatureSchemaVersion {
|
||||
return fmt.Errorf("%w: schema_version must be %s", ErrInvalidSignature, SignatureSchemaVersion)
|
||||
}
|
||||
if signature.Algorithm != AlgorithmEd25519 {
|
||||
return fmt.Errorf("%w: algorithm must be %s", ErrInvalidSignature, AlgorithmEd25519)
|
||||
}
|
||||
publicKey, err := decodePublicKey(publicKeyB64)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if signature.KeyFingerprint != Fingerprint(publicKey) {
|
||||
return fmt.Errorf("%w: key fingerprint mismatch", ErrInvalidSignature)
|
||||
}
|
||||
canonical, err := CanonicalJSON(payload)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
decodedSignature, err := decodeBase64(strings.TrimSpace(signature.Signature))
|
||||
if err != nil || len(decodedSignature) != ed25519.SignatureSize {
|
||||
return fmt.Errorf("%w: signature must be base64 ed25519 signature", ErrInvalidSignature)
|
||||
}
|
||||
if !ed25519.Verify(publicKey, canonical, decodedSignature) {
|
||||
return ErrInvalidSignature
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Fingerprint returns the identifier of an Ed25519 public key: the prefix
// "rap-ca-ed25519-" followed by the lowercase hex encoding of the first 16
// bytes of the key's SHA-256 digest.
func Fingerprint(publicKey ed25519.PublicKey) string {
	const prefix = "rap-ca-ed25519-"
	digest := sha256.Sum256(publicKey)
	return prefix + hex.EncodeToString(digest[:16])
}
|
||||
|
||||
func HashRaw(raw json.RawMessage) (string, error) {
|
||||
canonical, err := CanonicalJSON(raw)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
sum := sha256.Sum256(canonical)
|
||||
return hex.EncodeToString(sum[:]), nil
|
||||
}
|
||||
|
||||
func CanonicalJSON(raw json.RawMessage) ([]byte, error) {
|
||||
if len(raw) == 0 {
|
||||
return nil, fmt.Errorf("%w: empty payload", ErrInvalidPayload)
|
||||
}
|
||||
var value any
|
||||
if err := json.Unmarshal(raw, &value); err != nil {
|
||||
return nil, fmt.Errorf("%w: invalid json: %v", ErrInvalidPayload, err)
|
||||
}
|
||||
canonical, err := json.Marshal(value)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%w: canonical json: %v", ErrInvalidPayload, err)
|
||||
}
|
||||
return canonical, nil
|
||||
}
|
||||
|
||||
func decodePublicKey(value string) (ed25519.PublicKey, error) {
|
||||
decoded, err := decodeBase64(strings.TrimSpace(value))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%w: public key must be base64 encoded", ErrInvalidKey)
|
||||
}
|
||||
if len(decoded) != ed25519.PublicKeySize {
|
||||
return nil, fmt.Errorf("%w: public key must decode to %d bytes", ErrInvalidKey, ed25519.PublicKeySize)
|
||||
}
|
||||
return ed25519.PublicKey(decoded), nil
|
||||
}
|
||||
|
||||
// decodeBase64 decodes value using standard base64. Padded input is tried
// first; if that fails, the unpadded (raw) standard encoding is attempted and
// its result, including any error, is returned. An empty string is rejected.
func decodeBase64(value string) ([]byte, error) {
	if value == "" {
		return nil, errors.New("empty base64 value")
	}

	padded, err := base64.StdEncoding.DecodeString(value)
	if err != nil {
		// Fall back to the unpadded alphabet variant.
		return base64.RawStdEncoding.DecodeString(value)
	}
	return padded, nil
}
|
||||
@@ -0,0 +1,52 @@
|
||||
package authority
|
||||
|
||||
import (
|
||||
"crypto/ed25519"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestVerifyRawAcceptsSignedPayload(t *testing.T) {
|
||||
publicKey, privateKey, err := ed25519.GenerateKey(nil)
|
||||
if err != nil {
|
||||
t.Fatalf("GenerateKey: %v", err)
|
||||
}
|
||||
payload := json.RawMessage(`{"cluster_id":"cluster-1","schema_version":"test.v1"}`)
|
||||
canonical, err := CanonicalJSON(payload)
|
||||
if err != nil {
|
||||
t.Fatalf("CanonicalJSON: %v", err)
|
||||
}
|
||||
signature := Signature{
|
||||
SchemaVersion: SignatureSchemaVersion,
|
||||
Algorithm: AlgorithmEd25519,
|
||||
KeyFingerprint: Fingerprint(publicKey),
|
||||
Signature: base64.StdEncoding.EncodeToString(ed25519.Sign(privateKey, canonical)),
|
||||
}
|
||||
if err := VerifyRaw(base64.StdEncoding.EncodeToString(publicKey), payload, signature); err != nil {
|
||||
t.Fatalf("VerifyRaw: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestVerifyRawRejectsTamperedPayload(t *testing.T) {
|
||||
publicKey, privateKey, err := ed25519.GenerateKey(nil)
|
||||
if err != nil {
|
||||
t.Fatalf("GenerateKey: %v", err)
|
||||
}
|
||||
payload := json.RawMessage(`{"cluster_id":"cluster-1","schema_version":"test.v1"}`)
|
||||
canonical, err := CanonicalJSON(payload)
|
||||
if err != nil {
|
||||
t.Fatalf("CanonicalJSON: %v", err)
|
||||
}
|
||||
signature := Signature{
|
||||
SchemaVersion: SignatureSchemaVersion,
|
||||
Algorithm: AlgorithmEd25519,
|
||||
KeyFingerprint: Fingerprint(publicKey),
|
||||
Signature: base64.StdEncoding.EncodeToString(ed25519.Sign(privateKey, canonical)),
|
||||
}
|
||||
tampered := json.RawMessage(`{"cluster_id":"cluster-2","schema_version":"test.v1"}`)
|
||||
if err := VerifyRaw(base64.StdEncoding.EncodeToString(publicKey), tampered, signature); !errors.Is(err, ErrInvalidSignature) {
|
||||
t.Fatalf("err = %v, want ErrInvalidSignature", err)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,400 @@
|
||||
package client
|
||||
|
||||
import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"time"
)
|
||||
|
||||
// Client is an HTTP client for the backend API. Request URLs are formed by
// appending an endpoint path to baseURL.
type Client struct {
	baseURL    string
	httpClient *http.Client
}
|
||||
|
||||
// EnrollRequest is the JSON body that Client.Enroll posts to
// /node-agents/enroll.
type EnrollRequest struct {
	ClusterID            string         `json:"cluster_id"`
	JoinToken            string         `json:"join_token"`
	NodeName             string         `json:"node_name"`
	NodeFingerprint      string         `json:"node_fingerprint"`
	PublicKey            string         `json:"public_key"`
	ReportedCapabilities map[string]any `json:"reported_capabilities"`
	ReportedFacts        map[string]any `json:"reported_facts"`
	RequestedRoles       []string       `json:"requested_roles"`
}
|
||||
|
||||
// EnrollResponse is the decoded reply to an enrollment request. JoinRequest is
// kept as raw JSON and is not interpreted by this package.
type EnrollResponse struct {
	Status      string          `json:"status"`
	JoinRequest json.RawMessage `json:"join_request"`
}
|
||||
|
||||
// EnrollmentBootstrapRequest is the JSON body that Client.BootstrapEnrollment
// posts for a given join request.
type EnrollmentBootstrapRequest struct {
	ClusterID       string `json:"cluster_id"`
	NodeFingerprint string `json:"node_fingerprint"`
	PublicKey       string `json:"public_key"`
}
|
||||
|
||||
type EnrollmentBootstrapResponse struct {
|
||||
Status string `json:"status"`
|
||||
JoinRequest json.RawMessage `json:"join_request"`
|
||||
Bootstrap *NodeBootstrap `json:"node_bootstrap,omitempty"`
|
||||
}
|
||||
|
||||
type NodeBootstrap struct {
|
||||
NodeID string `json:"node_id"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
IdentityStatus string `json:"identity_status"`
|
||||
Certificate map[string]any `json:"certificate"`
|
||||
HeartbeatEndpoint string `json:"heartbeat_endpoint"`
|
||||
ClusterAuthority *ClusterAuthorityDescriptor `json:"cluster_authority,omitempty"`
|
||||
AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"`
|
||||
AuthoritySignature *ClusterSignature `json:"authority_signature,omitempty"`
|
||||
}
|
||||
|
||||
// HeartbeatRequest is the JSON body that Client.Heartbeat posts for a node.
// ReportedVersion is omitted from the JSON when empty.
type HeartbeatRequest struct {
	HealthStatus    string         `json:"health_status"`
	ReportedVersion string         `json:"reported_version,omitempty"`
	Capabilities    map[string]any `json:"capabilities"`
	ServiceStates   map[string]any `json:"service_states"`
	Metadata        map[string]any `json:"metadata"`
}
|
||||
|
||||
type HeartbeatResponse struct {
|
||||
Heartbeat json.RawMessage `json:"heartbeat"`
|
||||
TestingFlags EffectiveTestingFlags `json:"testing_flags"`
|
||||
}
|
||||
|
||||
// EffectiveTestingFlags is the "testing_flags" object of a HeartbeatResponse.
type EffectiveTestingFlags struct {
	Enabled               bool     `json:"enabled"`
	TelemetryEnabled      bool     `json:"telemetry_enabled"`
	SyntheticLinksEnabled bool     `json:"synthetic_links_enabled"`
	HistoryRetentionHours int      `json:"history_retention_hours"`
	AppliedScopes         []string `json:"applied_scopes"`
}
|
||||
|
||||
// DesiredWorkload is one element of the "desired_workloads" list returned by
// Client.DesiredWorkloads. Version and ArtifactRef are optional in the JSON.
type DesiredWorkload struct {
	ServiceType  string         `json:"service_type"`
	DesiredState string         `json:"desired_state"`
	Version      string         `json:"version,omitempty"`
	RuntimeMode  string         `json:"runtime_mode"`
	ArtifactRef  string         `json:"artifact_ref,omitempty"`
	Config       map[string]any `json:"config"`
	Environment  map[string]any `json:"environment"`
}
|
||||
|
||||
// WorkloadStatusRequest is the JSON body that Client.ReportWorkloadStatus
// posts for one service type on a node. Version is omitted when empty.
type WorkloadStatusRequest struct {
	ReportedState string         `json:"reported_state"`
	RuntimeMode   string         `json:"runtime_mode"`
	Version       string         `json:"version,omitempty"`
	StatusPayload map[string]any `json:"status_payload"`
}
|
||||
|
||||
// MeshLinkObservationRequest is the JSON body that Client.ReportMeshLink posts
// for a link between two nodes. LatencyMs and QualityScore are pointers so
// they can be left out of the JSON entirely when nil.
type MeshLinkObservationRequest struct {
	SourceNodeID string         `json:"source_node_id"`
	TargetNodeID string         `json:"target_node_id"`
	LinkStatus   string         `json:"link_status"`
	LatencyMs    *int           `json:"latency_ms,omitempty"`
	QualityScore *int           `json:"quality_score,omitempty"`
	Metadata     map[string]any `json:"metadata"`
}
|
||||
|
||||
// TelemetryRequest is the JSON body that Client.ReportTelemetry posts for a
// node. Every numeric measurement is a pointer with omitempty, so a nil
// measurement is left out of the JSON rather than sent as zero.
type TelemetryRequest struct {
	CPUPercent       *float64       `json:"cpu_percent,omitempty"`
	MemoryUsedBytes  *int64         `json:"memory_used_bytes,omitempty"`
	MemoryTotalBytes *int64         `json:"memory_total_bytes,omitempty"`
	DiskUsedBytes    *int64         `json:"disk_used_bytes,omitempty"`
	DiskTotalBytes   *int64         `json:"disk_total_bytes,omitempty"`
	NetworkRxBytes   *int64         `json:"network_rx_bytes,omitempty"`
	NetworkTxBytes   *int64         `json:"network_tx_bytes,omitempty"`
	ProcessCount     *int           `json:"process_count,omitempty"`
	Payload          map[string]any `json:"payload"`
	ObservedAt       time.Time      `json:"observed_at"`
}
|
||||
|
||||
// SyntheticMeshRouteConfig is one element of SyntheticMeshConfig.Routes. The
// three version fields are optional in the JSON encoding.
type SyntheticMeshRouteConfig struct {
	RouteID              string    `json:"route_id"`
	ClusterID            string    `json:"cluster_id"`
	SourceNodeID         string    `json:"source_node_id"`
	DestinationNodeID    string    `json:"destination_node_id"`
	Hops                 []string  `json:"hops"`
	AllowedChannels      []string  `json:"allowed_channels"`
	ExpiresAt            time.Time `json:"expires_at"`
	MaxTTL               int       `json:"max_ttl"`
	MaxHops              int       `json:"max_hops"`
	RouteVersion         string    `json:"route_version,omitempty"`
	PolicyVersion        string    `json:"policy_version,omitempty"`
	PeerDirectoryVersion string    `json:"peer_directory_version,omitempty"`
}
|
||||
|
||||
type SyntheticMeshConfig struct {
|
||||
Enabled bool `json:"enabled"`
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
LocalNodeID string `json:"local_node_id"`
|
||||
AuthorityRequired bool `json:"authority_required"`
|
||||
ClusterAuthority *ClusterAuthorityDescriptor `json:"cluster_authority,omitempty"`
|
||||
AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"`
|
||||
AuthoritySignature *ClusterSignature `json:"authority_signature,omitempty"`
|
||||
ConfigVersion string `json:"config_version,omitempty"`
|
||||
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
|
||||
PolicyVersion string `json:"policy_version,omitempty"`
|
||||
PeerEndpoints map[string]string `json:"peer_endpoints"`
|
||||
PeerEndpointCandidates map[string][]PeerEndpointCandidate `json:"peer_endpoint_candidates,omitempty"`
|
||||
PeerDirectory []PeerDirectoryEntry `json:"peer_directory,omitempty"`
|
||||
RecoverySeeds []PeerRecoverySeed `json:"recovery_seeds,omitempty"`
|
||||
RendezvousLeases []PeerRendezvousLease `json:"rendezvous_leases,omitempty"`
|
||||
RendezvousRelayPolicy *RendezvousRelayPolicyReport `json:"rendezvous_relay_policy,omitempty"`
|
||||
RoutePathDecisions *RoutePathDecisionReport `json:"route_path_decisions,omitempty"`
|
||||
Routes []SyntheticMeshRouteConfig `json:"routes"`
|
||||
ProductionForwarding bool `json:"production_forwarding"`
|
||||
}
|
||||
|
||||
// ClusterAuthorityDescriptor is the "cluster_authority" object embedded in
// NodeBootstrap and SyntheticMeshConfig: the cluster's authority public key,
// its fingerprint, algorithm, state and timestamps.
type ClusterAuthorityDescriptor struct {
	SchemaVersion        string    `json:"schema_version"`
	ClusterID            string    `json:"cluster_id"`
	AuthorityState       string    `json:"authority_state"`
	KeyAlgorithm         string    `json:"key_algorithm"`
	PublicKey            string    `json:"public_key"`
	PublicKeyFingerprint string    `json:"public_key_fingerprint"`
	CreatedAt            time.Time `json:"created_at"`
	UpdatedAt            time.Time `json:"updated_at"`
}
|
||||
|
||||
// ClusterSignature is the "authority_signature" object that accompanies an
// authority payload in NodeBootstrap and SyntheticMeshConfig.
type ClusterSignature struct {
	SchemaVersion  string    `json:"schema_version"`
	Algorithm      string    `json:"algorithm"`
	KeyFingerprint string    `json:"key_fingerprint"`
	Signature      string    `json:"signature"`
	SignedAt       time.Time `json:"signed_at"`
}
|
||||
|
||||
// PeerDirectoryEntry is one element of SyntheticMeshConfig.PeerDirectory.
// RouteIDs and ConnectivityModes are omitted from the JSON when empty.
type PeerDirectoryEntry struct {
	NodeID            string   `json:"node_id"`
	RouteIDs          []string `json:"route_ids,omitempty"`
	EndpointCount     int      `json:"endpoint_count"`
	CandidateCount    int      `json:"candidate_count"`
	ConnectivityModes []string `json:"connectivity_modes,omitempty"`
	RecoverySeed      bool     `json:"recovery_seed"`
}
|
||||
|
||||
// PeerRecoverySeed is one element of SyntheticMeshConfig.RecoverySeeds.
// LastVerifiedAt is nil when absent; Metadata is kept as raw JSON.
type PeerRecoverySeed struct {
	NodeID           string          `json:"node_id"`
	Endpoint         string          `json:"endpoint"`
	Transport        string          `json:"transport"`
	ConnectivityMode string          `json:"connectivity_mode,omitempty"`
	Region           string          `json:"region,omitempty"`
	Priority         int             `json:"priority"`
	LastVerifiedAt   *time.Time      `json:"last_verified_at,omitempty"`
	Metadata         json.RawMessage `json:"metadata,omitempty"`
}
|
||||
|
||||
// PeerRendezvousLease is one element of SyntheticMeshConfig.RendezvousLeases,
// naming a relay node and endpoint for a peer together with the lease's
// validity window. Metadata is kept as raw JSON.
type PeerRendezvousLease struct {
	LeaseID          string          `json:"lease_id"`
	PeerNodeID       string          `json:"peer_node_id"`
	RelayNodeID      string          `json:"relay_node_id"`
	RelayEndpoint    string          `json:"relay_endpoint"`
	Transport        string          `json:"transport"`
	ConnectivityMode string          `json:"connectivity_mode,omitempty"`
	RouteIDs         []string        `json:"route_ids,omitempty"`
	AllowedChannels  []string        `json:"allowed_channels,omitempty"`
	Priority         int             `json:"priority"`
	ControlPlaneOnly bool            `json:"control_plane_only"`
	IssuedAt         time.Time       `json:"issued_at"`
	ExpiresAt        time.Time       `json:"expires_at"`
	Reason           string          `json:"reason,omitempty"`
	Metadata         json.RawMessage `json:"metadata,omitempty"`
}
|
||||
|
||||
// RendezvousRelayPolicyDecision is one element of
// RendezvousRelayPolicyReport.Decisions. Only PeerNodeID and Reason are always
// present in the JSON; every other field is omitted when empty or zero.
type RendezvousRelayPolicyDecision struct {
	RouteID          string   `json:"route_id,omitempty"`
	PeerNodeID       string   `json:"peer_node_id"`
	WithdrawnLeaseID string   `json:"withdrawn_lease_id,omitempty"`
	StaleRelayNodeID string   `json:"stale_relay_node_id,omitempty"`
	SelectedRelayID  string   `json:"selected_relay_id,omitempty"`
	SelectedEndpoint string   `json:"selected_endpoint,omitempty"`
	Score            int      `json:"score,omitempty"`
	Reason           string   `json:"reason"`
	ScoreReasons     []string `json:"score_reasons,omitempty"`
	ReporterNodeID   string   `json:"reporter_node_id,omitempty"`
}
|
||||
|
||||
type RendezvousRelayPolicyReport struct {
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
ScoringMode string `json:"scoring_mode"`
|
||||
FeedbackMaxAgeSeconds int `json:"feedback_max_age_seconds"`
|
||||
StaleRelayCount int `json:"stale_relay_count"`
|
||||
WithdrawnLeaseCount int `json:"withdrawn_lease_count"`
|
||||
ReplacementLeaseCount int `json:"replacement_lease_count"`
|
||||
Decisions []RendezvousRelayPolicyDecision `json:"decisions,omitempty"`
|
||||
}
|
||||
|
||||
// RoutePathDecision is one element of RoutePathDecisionReport.Decisions. It
// records the original and effective hop lists of a route together with the
// relay/rendezvous selection, score and expiry attached to that decision.
type RoutePathDecision struct {
	DecisionID            string    `json:"decision_id"`
	RouteID               string    `json:"route_id"`
	ClusterID             string    `json:"cluster_id"`
	LocalNodeID           string    `json:"local_node_id"`
	SourceNodeID          string    `json:"source_node_id"`
	DestinationNodeID     string    `json:"destination_node_id"`
	OriginalHops          []string  `json:"original_hops"`
	EffectiveHops         []string  `json:"effective_hops"`
	PreviousHopID         string    `json:"previous_hop_id,omitempty"`
	NextHopID             string    `json:"next_hop_id,omitempty"`
	LocalRole             string    `json:"local_role"`
	SelectedRelayID       string    `json:"selected_relay_id,omitempty"`
	SelectedRelayEndpoint string    `json:"selected_relay_endpoint,omitempty"`
	StaleRelayNodeID      string    `json:"stale_relay_node_id,omitempty"`
	RendezvousPeerNodeID  string    `json:"rendezvous_peer_node_id,omitempty"`
	RendezvousLeaseID     string    `json:"rendezvous_lease_id,omitempty"`
	RendezvousLeaseReason string    `json:"rendezvous_lease_reason,omitempty"`
	DecisionSource        string    `json:"decision_source"`
	Generation            string    `json:"generation"`
	PathScore             int       `json:"path_score,omitempty"`
	ScoreReasons          []string  `json:"score_reasons,omitempty"`
	ControlPlaneOnly      bool      `json:"control_plane_only"`
	ProductionForwarding  bool      `json:"production_forwarding"`
	ExpiresAt             time.Time `json:"expires_at"`
}
|
||||
|
||||
type RoutePathDecisionReport struct {
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
DecisionMode string `json:"decision_mode"`
|
||||
Generation string `json:"generation"`
|
||||
DecisionCount int `json:"decision_count"`
|
||||
ReplacementDecisionCount int `json:"replacement_decision_count"`
|
||||
ControlPlaneOnly bool `json:"control_plane_only"`
|
||||
ProductionForwarding bool `json:"production_forwarding"`
|
||||
Decisions []RoutePathDecision `json:"decisions,omitempty"`
|
||||
}
|
||||
|
||||
// PeerEndpointCandidate is one candidate endpoint for a peer, used as the
// element type of SyntheticMeshConfig.PeerEndpointCandidates. LastVerifiedAt
// is nil when absent; Metadata is kept as raw JSON.
type PeerEndpointCandidate struct {
	EndpointID       string          `json:"endpoint_id"`
	NodeID           string          `json:"node_id"`
	Transport        string          `json:"transport"`
	Address          string          `json:"address"`
	AddressFamily    string          `json:"address_family,omitempty"`
	Reachability     string          `json:"reachability"`
	NATType          string          `json:"nat_type,omitempty"`
	ConnectivityMode string          `json:"connectivity_mode"`
	Region           string          `json:"region,omitempty"`
	Priority         int             `json:"priority"`
	PolicyTags       []string        `json:"policy_tags,omitempty"`
	LastVerifiedAt   *time.Time      `json:"last_verified_at,omitempty"`
	Metadata         json.RawMessage `json:"metadata,omitempty"`
}
|
||||
|
||||
func New(baseURL string) *Client {
|
||||
return &Client{
|
||||
baseURL: baseURL,
|
||||
httpClient: &http.Client{
|
||||
Timeout: 15 * time.Second,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Client) Enroll(ctx context.Context, request EnrollRequest) (EnrollResponse, error) {
|
||||
var response EnrollResponse
|
||||
if err := c.postJSON(ctx, "/node-agents/enroll", request, &response); err != nil {
|
||||
return EnrollResponse{}, err
|
||||
}
|
||||
return response, nil
|
||||
}
|
||||
|
||||
func (c *Client) BootstrapEnrollment(ctx context.Context, joinRequestID string, request EnrollmentBootstrapRequest) (EnrollmentBootstrapResponse, error) {
|
||||
var response EnrollmentBootstrapResponse
|
||||
path := fmt.Sprintf("/node-agents/enrollments/%s/bootstrap", joinRequestID)
|
||||
if err := c.postJSON(ctx, path, request, &response); err != nil {
|
||||
return EnrollmentBootstrapResponse{}, err
|
||||
}
|
||||
return response, nil
|
||||
}
|
||||
|
||||
func (c *Client) Heartbeat(ctx context.Context, clusterID, nodeID string, request HeartbeatRequest) (HeartbeatResponse, error) {
|
||||
var response HeartbeatResponse
|
||||
path := fmt.Sprintf("/clusters/%s/nodes/%s/heartbeats", clusterID, nodeID)
|
||||
if err := c.postJSON(ctx, path, request, &response); err != nil {
|
||||
return HeartbeatResponse{}, err
|
||||
}
|
||||
return response, nil
|
||||
}
|
||||
|
||||
func (c *Client) DesiredWorkloads(ctx context.Context, clusterID, nodeID string) ([]DesiredWorkload, error) {
|
||||
var response struct {
|
||||
DesiredWorkloads []DesiredWorkload `json:"desired_workloads"`
|
||||
}
|
||||
path := fmt.Sprintf("/clusters/%s/nodes/%s/workloads/desired", clusterID, nodeID)
|
||||
if err := c.getJSON(ctx, path, &response); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return response.DesiredWorkloads, nil
|
||||
}
|
||||
|
||||
func (c *Client) ReportWorkloadStatus(ctx context.Context, clusterID, nodeID, serviceType string, request WorkloadStatusRequest) error {
|
||||
path := fmt.Sprintf("/clusters/%s/nodes/%s/workloads/%s/status", clusterID, nodeID, serviceType)
|
||||
return c.postJSON(ctx, path, request, nil)
|
||||
}
|
||||
|
||||
func (c *Client) ReportMeshLink(ctx context.Context, clusterID string, request MeshLinkObservationRequest) error {
|
||||
path := fmt.Sprintf("/clusters/%s/mesh/links", clusterID)
|
||||
return c.postJSON(ctx, path, request, nil)
|
||||
}
|
||||
|
||||
func (c *Client) ReportTelemetry(ctx context.Context, clusterID, nodeID string, request TelemetryRequest) error {
|
||||
path := fmt.Sprintf("/clusters/%s/nodes/%s/telemetry", clusterID, nodeID)
|
||||
return c.postJSON(ctx, path, request, nil)
|
||||
}
|
||||
|
||||
func (c *Client) SyntheticMeshConfig(ctx context.Context, clusterID, nodeID string) (SyntheticMeshConfig, error) {
|
||||
var response struct {
|
||||
Config SyntheticMeshConfig `json:"synthetic_mesh_config"`
|
||||
}
|
||||
path := fmt.Sprintf("/clusters/%s/nodes/%s/mesh/synthetic-config", clusterID, nodeID)
|
||||
if err := c.getJSON(ctx, path, &response); err != nil {
|
||||
return SyntheticMeshConfig{}, err
|
||||
}
|
||||
return response.Config, nil
|
||||
}
|
||||
|
||||
func (c *Client) getJSON(ctx context.Context, path string, response any) error {
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+path, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
httpResp, err := c.httpClient.Do(httpReq)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer httpResp.Body.Close()
|
||||
if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
|
||||
return fmt.Errorf("backend returned status %d", httpResp.StatusCode)
|
||||
}
|
||||
if response == nil {
|
||||
return nil
|
||||
}
|
||||
return json.NewDecoder(httpResp.Body).Decode(response)
|
||||
}
|
||||
|
||||
func (c *Client) postJSON(ctx context.Context, path string, request any, response any) error {
|
||||
payload, err := json.Marshal(request)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+path, bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
httpReq.Header.Set("Content-Type", "application/json")
|
||||
httpResp, err := c.httpClient.Do(httpReq)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer httpResp.Body.Close()
|
||||
if httpResp.StatusCode < 200 || httpResp.StatusCode >= 300 {
|
||||
return fmt.Errorf("backend returned status %d", httpResp.StatusCode)
|
||||
}
|
||||
if response == nil {
|
||||
return nil
|
||||
}
|
||||
return json.NewDecoder(httpResp.Body).Decode(response)
|
||||
}
|
||||
@@ -0,0 +1,183 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"flag"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// MaxMeshProductionObservationSinkCapacity is the largest value Load accepts
// for Config.MeshProductionObservationSinkCapacity.
const MaxMeshProductionObservationSinkCapacity = 10000
|
||||
|
||||
// Config holds the node-agent settings produced by Load. Each field is
// populated from a command-line flag whose default comes from the matching
// RAP_* environment variable.
type Config struct {
	// Backend connection and cluster identity.
	BackendURL                  string
	ClusterID                   string
	ClusterAuthorityPublicKey   string
	ClusterAuthorityFingerprint string
	JoinToken                   string
	NodeName                    string
	StateDir                    string

	// Agent behaviour and timing.
	WorkloadSupervisionEnabled bool
	HeartbeatInterval          time.Duration
	EnrollmentPollInterval     time.Duration
	EnrollmentPollTimeout      time.Duration

	// Mesh runtime switches. The sink capacity is validated by Load to lie in
	// [0, MaxMeshProductionObservationSinkCapacity].
	MeshSyntheticRuntimeEnabled           bool
	MeshProductionForwardingEnabled       bool
	MeshProductionObservationSinkCapacity int

	// Mesh endpoint advertisement.
	MeshListenAddr             string
	MeshAdvertiseEndpoint      string
	MeshAdvertiseEndpointsJSON string
	MeshAdvertiseTransport     string
	MeshConnectivityMode       string
	MeshNATType                string
	MeshRegion                 string

	// Synthetic mesh configuration sources.
	MeshSyntheticConfigPath string
	MeshPeerEndpointsJSON   string
	MeshSyntheticRoutesJSON string
}
|
||||
|
||||
func Load(args []string, env map[string]string) (Config, error) {
|
||||
if env == nil {
|
||||
env = readEnv()
|
||||
}
|
||||
defaultStateDir := filepath.Join(".", ".rap-node-agent")
|
||||
fs := flag.NewFlagSet("rap-node-agent", flag.ContinueOnError)
|
||||
cfg := Config{}
|
||||
fs.StringVar(&cfg.BackendURL, "backend-url", getEnv(env, "RAP_BACKEND_URL", "http://127.0.0.1:8080/api/v1"), "Backend API base URL.")
|
||||
fs.StringVar(&cfg.ClusterID, "cluster-id", getEnv(env, "RAP_CLUSTER_ID", ""), "Cluster ID.")
|
||||
fs.StringVar(&cfg.ClusterAuthorityPublicKey, "cluster-authority-public-key", getEnv(env, "RAP_CLUSTER_AUTHORITY_PUBLIC_KEY", ""), "Pinned cluster authority Ed25519 public key.")
|
||||
fs.StringVar(&cfg.ClusterAuthorityFingerprint, "cluster-authority-fingerprint", getEnv(env, "RAP_CLUSTER_AUTHORITY_FINGERPRINT", ""), "Pinned cluster authority key fingerprint.")
|
||||
fs.StringVar(&cfg.JoinToken, "join-token", getEnv(env, "RAP_JOIN_TOKEN", ""), "Short-lived node join token.")
|
||||
fs.StringVar(&cfg.NodeName, "node-name", getEnv(env, "RAP_NODE_NAME", hostnameOrDefault()), "Node display name.")
|
||||
fs.StringVar(&cfg.StateDir, "state-dir", getEnv(env, "RAP_NODE_STATE_DIR", defaultStateDir), "Local node-agent state directory.")
|
||||
fs.BoolVar(&cfg.WorkloadSupervisionEnabled, "workload-supervision-enabled", getEnvBool(env, "RAP_WORKLOAD_SUPERVISION_ENABLED", false), "Enable desired workload polling and status reporting. Disabled by default while service runtime is not implemented.")
|
||||
fs.BoolVar(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", getEnvBool(env, "RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false), "Enable C17A synthetic fabric probe runtime. Disabled by default.")
|
||||
fs.BoolVar(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", getEnvBool(env, "RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false), "Enable production fabric-control direct next-hop forwarding gate. Disabled by default.")
|
||||
fs.IntVar(&cfg.MeshProductionObservationSinkCapacity, "mesh-production-observation-sink-capacity", getEnvSignedInt(env, "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Bounded local metadata-only production envelope observation sink capacity. Disabled when 0.")
|
||||
fs.StringVar(&cfg.MeshListenAddr, "mesh-listen-addr", getEnv(env, "RAP_MESH_LISTEN_ADDR", ""), "Listen address for disabled-by-default C17E synthetic mesh HTTP endpoint.")
|
||||
fs.StringVar(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINT", ""), "Advertised mesh endpoint reported to the Control Plane. Empty disables endpoint reporting.")
|
||||
fs.StringVar(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", getEnv(env, "RAP_MESH_ADVERTISE_ENDPOINTS_JSON", ""), "JSON array of advertised mesh endpoint candidates, including private/corporate endpoints.")
|
||||
fs.StringVar(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", getEnv(env, "RAP_MESH_ADVERTISE_TRANSPORT", "direct_tcp_tls"), "Transport label for the advertised mesh endpoint.")
|
||||
fs.StringVar(&cfg.MeshConnectivityMode, "mesh-connectivity-mode", getEnv(env, "RAP_MESH_CONNECTIVITY_MODE", "direct"), "Connectivity mode reported with the advertised mesh endpoint.")
|
||||
fs.StringVar(&cfg.MeshNATType, "mesh-nat-type", getEnv(env, "RAP_MESH_NAT_TYPE", "unknown"), "NAT type hint reported with the advertised mesh endpoint.")
|
||||
fs.StringVar(&cfg.MeshRegion, "mesh-region", getEnv(env, "RAP_MESH_REGION", ""), "Optional region/site hint for the advertised mesh endpoint.")
|
||||
fs.StringVar(&cfg.MeshSyntheticConfigPath, "mesh-synthetic-config", getEnv(env, "RAP_MESH_SYNTHETIC_CONFIG", ""), "Path to scoped synthetic mesh config snapshot. Preferred over debug JSON env.")
|
||||
fs.StringVar(&cfg.MeshPeerEndpointsJSON, "mesh-peer-endpoints-json", getEnv(env, "RAP_MESH_PEER_ENDPOINTS_JSON", ""), "JSON object mapping peer node_id to synthetic mesh endpoint URL.")
|
||||
fs.StringVar(&cfg.MeshSyntheticRoutesJSON, "mesh-synthetic-routes-json", getEnv(env, "RAP_MESH_SYNTHETIC_ROUTES_JSON", ""), "JSON array of synthetic mesh routes for test-only runtime.")
|
||||
heartbeatSeconds := getEnvInt(env, "RAP_HEARTBEAT_INTERVAL_SECONDS", 15)
|
||||
fs.DurationVar(&cfg.HeartbeatInterval, "heartbeat-interval", time.Duration(heartbeatSeconds)*time.Second, "Heartbeat interval.")
|
||||
enrollmentPollIntervalSeconds := getEnvInt(env, "RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5)
|
||||
enrollmentPollTimeoutSeconds := getEnvInt(env, "RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 600)
|
||||
fs.DurationVar(&cfg.EnrollmentPollInterval, "enrollment-poll-interval", time.Duration(enrollmentPollIntervalSeconds)*time.Second, "Enrollment approval polling interval.")
|
||||
fs.DurationVar(&cfg.EnrollmentPollTimeout, "enrollment-poll-timeout", time.Duration(enrollmentPollTimeoutSeconds)*time.Second, "Enrollment approval polling timeout.")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
cfg.BackendURL = strings.TrimRight(strings.TrimSpace(cfg.BackendURL), "/")
|
||||
cfg.ClusterID = strings.TrimSpace(cfg.ClusterID)
|
||||
cfg.ClusterAuthorityPublicKey = strings.TrimSpace(cfg.ClusterAuthorityPublicKey)
|
||||
cfg.ClusterAuthorityFingerprint = strings.TrimSpace(cfg.ClusterAuthorityFingerprint)
|
||||
cfg.JoinToken = strings.TrimSpace(cfg.JoinToken)
|
||||
cfg.NodeName = strings.TrimSpace(cfg.NodeName)
|
||||
cfg.StateDir = strings.TrimSpace(cfg.StateDir)
|
||||
cfg.MeshListenAddr = strings.TrimSpace(cfg.MeshListenAddr)
|
||||
cfg.MeshAdvertiseEndpoint = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/")
|
||||
cfg.MeshAdvertiseEndpointsJSON = strings.TrimSpace(cfg.MeshAdvertiseEndpointsJSON)
|
||||
cfg.MeshAdvertiseTransport = strings.TrimSpace(cfg.MeshAdvertiseTransport)
|
||||
cfg.MeshConnectivityMode = strings.TrimSpace(cfg.MeshConnectivityMode)
|
||||
cfg.MeshNATType = strings.TrimSpace(cfg.MeshNATType)
|
||||
cfg.MeshRegion = strings.TrimSpace(cfg.MeshRegion)
|
||||
cfg.MeshSyntheticConfigPath = strings.TrimSpace(cfg.MeshSyntheticConfigPath)
|
||||
cfg.MeshPeerEndpointsJSON = strings.TrimSpace(cfg.MeshPeerEndpointsJSON)
|
||||
cfg.MeshSyntheticRoutesJSON = strings.TrimSpace(cfg.MeshSyntheticRoutesJSON)
|
||||
if cfg.BackendURL == "" {
|
||||
return Config{}, errors.New("backend URL is required")
|
||||
}
|
||||
if cfg.NodeName == "" {
|
||||
return Config{}, errors.New("node name is required")
|
||||
}
|
||||
if cfg.StateDir == "" {
|
||||
return Config{}, errors.New("state dir is required")
|
||||
}
|
||||
if cfg.HeartbeatInterval <= 0 {
|
||||
return Config{}, errors.New("heartbeat interval must be positive")
|
||||
}
|
||||
if cfg.EnrollmentPollInterval <= 0 {
|
||||
return Config{}, errors.New("enrollment poll interval must be positive")
|
||||
}
|
||||
if cfg.EnrollmentPollTimeout < 0 {
|
||||
return Config{}, errors.New("enrollment poll timeout must not be negative")
|
||||
}
|
||||
if cfg.MeshProductionObservationSinkCapacity < 0 {
|
||||
return Config{}, errors.New("mesh production observation sink capacity must not be negative")
|
||||
}
|
||||
if cfg.MeshProductionObservationSinkCapacity > MaxMeshProductionObservationSinkCapacity {
|
||||
return Config{}, errors.New("mesh production observation sink capacity exceeds maximum")
|
||||
}
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
// readEnv snapshots the process environment as a key/value map. Each entry of
// os.Environ is split at its first '='; entries without '=' are skipped.
func readEnv() map[string]string {
	snapshot := map[string]string{}
	for _, entry := range os.Environ() {
		sep := strings.Index(entry, "=")
		if sep < 0 {
			continue
		}
		snapshot[entry[:sep]] = entry[sep+1:]
	}
	return snapshot
}
|
||||
|
||||
// getEnv returns the whitespace-trimmed value of env[key], or fallback when
// the key is missing or its value is blank.
func getEnv(env map[string]string, key, fallback string) string {
	trimmed := strings.TrimSpace(env[key])
	if trimmed == "" {
		return fallback
	}
	return trimmed
}
|
||||
|
||||
// getEnvInt parses env[key] as a strictly positive decimal integer. It
// returns fallback when the key is missing or blank, when the value does not
// parse, or when the parsed value is zero or negative.
func getEnvInt(env map[string]string, key string, fallback int) int {
	raw := strings.TrimSpace(env[key])
	if raw == "" {
		return fallback
	}
	if n, err := strconv.Atoi(raw); err == nil && n > 0 {
		return n
	}
	return fallback
}
|
||||
|
||||
// getEnvSignedInt parses env[key] as a decimal integer of any sign. It
// returns fallback when the key is missing or blank or the value does not
// parse; zero and negative values are returned as parsed so the caller can
// validate them.
func getEnvSignedInt(env map[string]string, key string, fallback int) int {
	raw := strings.TrimSpace(env[key])
	if raw == "" {
		return fallback
	}
	if n, err := strconv.Atoi(raw); err == nil {
		return n
	}
	return fallback
}
|
||||
|
||||
// getEnvBool interprets env[key] as a boolean, ignoring case and surrounding
// whitespace. "1", "true", "yes", "y" and "on" mean true; "0", "false", "no",
// "n" and "off" mean false; anything else, including a missing key, yields
// fallback.
func getEnvBool(env map[string]string, key string, fallback bool) bool {
	recognized := map[string]bool{
		"1": true, "true": true, "yes": true, "y": true, "on": true,
		"0": false, "false": false, "no": false, "n": false, "off": false,
	}

	normalized := strings.ToLower(strings.TrimSpace(env[key]))
	if parsed, ok := recognized[normalized]; ok {
		return parsed
	}
	return fallback
}
|
||||
|
||||
// hostnameOrDefault returns the hostname reported by os.Hostname, or the
// literal "rap-node" when the lookup fails or yields only whitespace.
func hostnameOrDefault() string {
	name, err := os.Hostname()
	if err == nil && strings.TrimSpace(name) != "" {
		return name
	}
	return "rap-node"
}
|
||||
@@ -0,0 +1,104 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestLoadConfigFromEnvAndArgs(t *testing.T) {
|
||||
cfg, err := Load([]string{"-node-name", "node-b"}, map[string]string{
|
||||
"RAP_BACKEND_URL": "http://backend/api/v1/",
|
||||
"RAP_CLUSTER_ID": "cluster-1",
|
||||
"RAP_CLUSTER_AUTHORITY_PUBLIC_KEY": "public-key-b64",
|
||||
"RAP_CLUSTER_AUTHORITY_FINGERPRINT": "rap-ca-ed25519-test",
|
||||
"RAP_JOIN_TOKEN": "join-token",
|
||||
"RAP_NODE_NAME": "node-a",
|
||||
"RAP_NODE_STATE_DIR": "/tmp/rap-node",
|
||||
"RAP_WORKLOAD_SUPERVISION_ENABLED": "true",
|
||||
"RAP_HEARTBEAT_INTERVAL_SECONDS": "7",
|
||||
"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS": "3",
|
||||
"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS": "30",
|
||||
"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED": "true",
|
||||
"RAP_MESH_PRODUCTION_FORWARDING_ENABLED": "true",
|
||||
"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "5",
|
||||
"RAP_MESH_LISTEN_ADDR": "127.0.0.1:19001",
|
||||
"RAP_MESH_ADVERTISE_ENDPOINT": "https://node-a.example.test:443/",
|
||||
"RAP_MESH_ADVERTISE_ENDPOINTS_JSON": `[{"endpoint_id":"node-a-lan","address":"10.10.0.20:19001"}]`,
|
||||
"RAP_MESH_ADVERTISE_TRANSPORT": "wss",
|
||||
"RAP_MESH_CONNECTIVITY_MODE": "outbound_only",
|
||||
"RAP_MESH_NAT_TYPE": "symmetric",
|
||||
"RAP_MESH_REGION": "eu",
|
||||
"RAP_MESH_SYNTHETIC_CONFIG": "/tmp/rap-node/mesh-synthetic.json",
|
||||
"RAP_MESH_PEER_ENDPOINTS_JSON": `{"node-b":"http://127.0.0.1:19002"}`,
|
||||
"RAP_MESH_SYNTHETIC_ROUTES_JSON": `[{"route_id":"route-1"}]`,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("load config: %v", err)
|
||||
}
|
||||
if cfg.BackendURL != "http://backend/api/v1" {
|
||||
t.Fatalf("BackendURL = %q", cfg.BackendURL)
|
||||
}
|
||||
if cfg.NodeName != "node-b" {
|
||||
t.Fatalf("NodeName = %q", cfg.NodeName)
|
||||
}
|
||||
if cfg.ClusterAuthorityPublicKey != "public-key-b64" || cfg.ClusterAuthorityFingerprint != "rap-ca-ed25519-test" {
|
||||
t.Fatalf("unexpected cluster authority pin config: %+v", cfg)
|
||||
}
|
||||
if cfg.HeartbeatInterval != 7*time.Second {
|
||||
t.Fatalf("HeartbeatInterval = %s", cfg.HeartbeatInterval)
|
||||
}
|
||||
if cfg.EnrollmentPollInterval != 3*time.Second || cfg.EnrollmentPollTimeout != 30*time.Second {
|
||||
t.Fatalf("unexpected enrollment polling config: %+v", cfg)
|
||||
}
|
||||
if !cfg.WorkloadSupervisionEnabled {
|
||||
t.Fatal("WorkloadSupervisionEnabled = false, want true")
|
||||
}
|
||||
if !cfg.MeshSyntheticRuntimeEnabled {
|
||||
t.Fatal("MeshSyntheticRuntimeEnabled = false, want true")
|
||||
}
|
||||
if !cfg.MeshProductionForwardingEnabled {
|
||||
t.Fatal("MeshProductionForwardingEnabled = false, want true")
|
||||
}
|
||||
if cfg.MeshProductionObservationSinkCapacity != 5 {
|
||||
t.Fatalf("MeshProductionObservationSinkCapacity = %d, want 5", cfg.MeshProductionObservationSinkCapacity)
|
||||
}
|
||||
if cfg.MeshListenAddr != "127.0.0.1:19001" {
|
||||
t.Fatalf("MeshListenAddr = %q", cfg.MeshListenAddr)
|
||||
}
|
||||
if cfg.MeshAdvertiseEndpoint != "https://node-a.example.test:443" ||
|
||||
cfg.MeshAdvertiseEndpointsJSON == "" ||
|
||||
cfg.MeshAdvertiseTransport != "wss" ||
|
||||
cfg.MeshConnectivityMode != "outbound_only" ||
|
||||
cfg.MeshNATType != "symmetric" ||
|
||||
cfg.MeshRegion != "eu" {
|
||||
t.Fatalf("unexpected mesh advertise config: %+v", cfg)
|
||||
}
|
||||
if cfg.MeshSyntheticConfigPath != "/tmp/rap-node/mesh-synthetic.json" {
|
||||
t.Fatalf("MeshSyntheticConfigPath = %q", cfg.MeshSyntheticConfigPath)
|
||||
}
|
||||
if cfg.MeshPeerEndpointsJSON == "" || cfg.MeshSyntheticRoutesJSON == "" {
|
||||
t.Fatalf("mesh live synthetic config was not loaded: %+v", cfg)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadConfigRejectsNegativeProductionObservationSinkCapacity(t *testing.T) {
|
||||
_, err := Load(nil, map[string]string{
|
||||
"RAP_BACKEND_URL": "http://backend/api/v1",
|
||||
"RAP_NODE_NAME": "node-a",
|
||||
"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "-1",
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("Load returned nil error for negative sink capacity")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadConfigRejectsTooLargeProductionObservationSinkCapacity(t *testing.T) {
|
||||
_, err := Load(nil, map[string]string{
|
||||
"RAP_BACKEND_URL": "http://backend/api/v1",
|
||||
"RAP_NODE_NAME": "node-a",
|
||||
"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "10001",
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatal("Load returned nil error for too-large sink capacity")
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,111 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Client posts mesh messages as JSON to a peer over HTTP.
// BaseURL is concatenated directly with each endpoint path (no slash
// normalization). When HTTPClient is nil the Send methods fall back to
// http.DefaultClient.
type Client struct {
|
||||
BaseURL string
|
||||
HTTPClient *http.Client
|
||||
}
|
||||
|
||||
func NewClient(baseURL string) Client {
|
||||
return Client{
|
||||
BaseURL: baseURL,
|
||||
HTTPClient: &http.Client{
|
||||
Timeout: 5 * time.Second,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (c Client) SendHealth(ctx context.Context, message HealthMessage) (HealthAck, error) {
|
||||
payload, err := json.Marshal(message)
|
||||
if err != nil {
|
||||
return HealthAck{}, err
|
||||
}
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/mesh/v1/health", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
return HealthAck{}, err
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
httpClient := c.HTTPClient
|
||||
if httpClient == nil {
|
||||
httpClient = http.DefaultClient
|
||||
}
|
||||
resp, err := httpClient.Do(req)
|
||||
if err != nil {
|
||||
return HealthAck{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return HealthAck{}, fmt.Errorf("mesh health rejected with status %d", resp.StatusCode)
|
||||
}
|
||||
var ack HealthAck
|
||||
if err := json.NewDecoder(resp.Body).Decode(&ack); err != nil {
|
||||
return HealthAck{}, err
|
||||
}
|
||||
return ack, nil
|
||||
}
|
||||
|
||||
func (c Client) SendSynthetic(ctx context.Context, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
return SyntheticEnvelope{}, err
|
||||
}
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/mesh/v1/synthetic/probe", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
return SyntheticEnvelope{}, err
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
httpClient := c.HTTPClient
|
||||
if httpClient == nil {
|
||||
httpClient = http.DefaultClient
|
||||
}
|
||||
resp, err := httpClient.Do(req)
|
||||
if err != nil {
|
||||
return SyntheticEnvelope{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return SyntheticEnvelope{}, fmt.Errorf("mesh synthetic probe rejected with status %d", resp.StatusCode)
|
||||
}
|
||||
var ack SyntheticEnvelope
|
||||
if err := json.NewDecoder(resp.Body).Decode(&ack); err != nil {
|
||||
return SyntheticEnvelope{}, err
|
||||
}
|
||||
return ack, nil
|
||||
}
|
||||
|
||||
func (c Client) SendProduction(ctx context.Context, envelope ProductionEnvelope) (ProductionForwardResult, error) {
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
return ProductionForwardResult{}, err
|
||||
}
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/mesh/v1/forward", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
return ProductionForwardResult{}, err
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
httpClient := c.HTTPClient
|
||||
if httpClient == nil {
|
||||
httpClient = http.DefaultClient
|
||||
}
|
||||
resp, err := httpClient.Do(req)
|
||||
if err != nil {
|
||||
return ProductionForwardResult{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return ProductionForwardResult{}, fmt.Errorf("mesh production forward rejected with status %d", resp.StatusCode)
|
||||
}
|
||||
var result ProductionForwardResult
|
||||
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
||||
return ProductionForwardResult{}, err
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
@@ -0,0 +1,288 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ProtocolVersion is the mesh control protocol identifier string.
// NOTE(review): presumably the value carried in the protocol_version fields of
// mesh messages — confirm against callers.
const ProtocolVersion = "mesh-control-v1"
|
||||
|
||||
// Sentinel errors of the mesh package. ValidatePeer returns ErrClusterMismatch
// and ErrNodeMismatch; the ErrForward* values describe production forwarding
// failures and the remaining values describe synthetic runtime failures, as
// stated by their messages. Compare with errors.Is.
var (
|
||||
ErrClusterMismatch = errors.New("mesh peer cluster mismatch")
|
||||
ErrNodeMismatch = errors.New("mesh peer node mismatch")
|
||||
ErrForwardDisabled = errors.New("production payload forwarding is disabled by mesh production gate")
|
||||
ErrForwardRuntimeUnavailable = errors.New("production mesh forwarding runtime is unavailable for this route or stage")
|
||||
ErrForwardPeerUnavailable = errors.New("production mesh next peer is unavailable")
|
||||
ErrForwardEnvelopeInvalid = errors.New("production mesh envelope is invalid")
|
||||
ErrForwardObservationFailed = errors.New("production mesh envelope observation failed")
|
||||
ErrMeshRuntimeDisabled = errors.New("mesh synthetic runtime is disabled")
|
||||
ErrUnsupportedSyntheticMessage = errors.New("unsupported synthetic mesh message")
|
||||
ErrRouteIDRequired = errors.New("mesh synthetic route id is required")
|
||||
ErrRouteNotFound = errors.New("mesh synthetic route not found")
|
||||
ErrInvalidRoutePath = errors.New("mesh synthetic route path is invalid")
|
||||
ErrRouteExpired = errors.New("mesh synthetic route is expired")
|
||||
ErrTTLExhausted = errors.New("mesh synthetic route ttl exhausted")
|
||||
ErrLoopDetected = errors.New("mesh synthetic route loop detected")
|
||||
ErrUnauthorizedChannel = errors.New("mesh synthetic channel is not authorized")
|
||||
ErrSyntheticPeerUnavailable = errors.New("mesh synthetic next peer is unavailable")
|
||||
ErrNoHealthySyntheticRoute = errors.New("mesh synthetic no healthy route available")
|
||||
ErrSyntheticRelayQueueFull = errors.New("mesh synthetic relay queue is full")
|
||||
ErrSyntheticRelayQueueEmpty = errors.New("mesh synthetic relay queue is empty")
|
||||
ErrSyntheticPayloadTooLarge = errors.New("mesh synthetic payload is too large")
|
||||
ErrSyntheticOrganizationMismatch = errors.New("mesh synthetic organization mismatch")
|
||||
ErrUnsupportedSyntheticService = errors.New("unsupported synthetic test service")
|
||||
ErrSyntheticRequestInvalid = errors.New("mesh synthetic request is invalid")
|
||||
)
|
||||
|
||||
// Wire-level string constants and limits of the mesh package, grouped as:
// synthetic message types, synthetic test-service defaults, synthetic channel
// names, synthetic route states, and production envelope constants/limits.
// Note that SyntheticChannelFabricControl and ProductionChannelFabricControl
// share the same value ("fabric_control").
const (
|
||||
SyntheticMessageProbe = "fabric.probe"
|
||||
SyntheticMessageProbeAck = "fabric.probe_ack"
|
||||
SyntheticMessageRouteHealth = "fabric.route_health"
|
||||
SyntheticMessageRouteHealthAck = "fabric.route_health_ack"
|
||||
SyntheticMessageTelemetry = "fabric.telemetry"
|
||||
SyntheticMessageTestService = "fabric.test_service"
|
||||
SyntheticMessageTestServiceAck = "fabric.test_service_ack"
|
||||
|
||||
SyntheticTestServiceType = "synthetic.echo"
|
||||
SyntheticDefaultTestOrganizationID = "org-test"
|
||||
SyntheticDefaultMaxTestPayloadBytes = 4096
|
||||
|
||||
SyntheticChannelFabricControl = "fabric_control"
|
||||
SyntheticChannelRouteControl = "route_control"
|
||||
SyntheticChannelTelemetry = "telemetry"
|
||||
|
||||
SyntheticRouteStateUnknown = "unknown"
|
||||
SyntheticRouteStateHealthy = "healthy"
|
||||
SyntheticRouteStateDegraded = "degraded"
|
||||
SyntheticRouteStateFailed = "failed"
|
||||
|
||||
ProductionChannelFabricControl = "fabric_control"
|
||||
ProductionMessageFabricControl = "fabric.control"
|
||||
MaxProductionEnvelopePayloadBytes = 4096
|
||||
MaxProductionEnvelopeFutureSkew = time.Minute
|
||||
)
|
||||
|
||||
// PeerIdentity pairs a cluster ID with a node ID. ValidatePeer compares two
// identities: both cluster IDs must be non-empty and equal, and the remote
// node ID must be non-empty.
type PeerIdentity struct {
|
||||
ClusterID string `json:"cluster_id"`
|
||||
NodeID string `json:"node_id"`
|
||||
}
|
||||
|
||||
// SyntheticRoute is the JSON description of a synthetic mesh route: its
// endpoints, hop list, allowed channels, expiry, TTL/hop limits, and the
// optional route/policy/peer-directory version strings.
// NOTE(review): how Hops relates to source/destination (inclusive or not) is
// not visible here — confirm against the route validation code.
type SyntheticRoute struct {
|
||||
RouteID string `json:"route_id"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
SourceNodeID string `json:"source_node_id"`
|
||||
DestinationNodeID string `json:"destination_node_id"`
|
||||
Hops []string `json:"hops"`
|
||||
AllowedChannels []string `json:"allowed_channels"`
|
||||
ExpiresAt time.Time `json:"expires_at"`
|
||||
MaxTTL int `json:"max_ttl"`
|
||||
MaxHops int `json:"max_hops"`
|
||||
RouteVersion string `json:"route_version,omitempty"`
|
||||
PolicyVersion string `json:"policy_version,omitempty"`
|
||||
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
|
||||
}
|
||||
|
||||
// SyntheticEnvelope is the JSON message exchanged by Client.SendSynthetic,
// which both sends and receives this type. Payload is kept as raw JSON so its
// decoding can be deferred; it is omitted from the encoding when empty.
type SyntheticEnvelope struct {
|
||||
ProtocolVersion string `json:"protocol_version"`
|
||||
RouteID string `json:"route_id"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
From PeerIdentity `json:"from"`
|
||||
To PeerIdentity `json:"to"`
|
||||
Channel string `json:"channel"`
|
||||
MessageType string `json:"message_type"`
|
||||
TTL int `json:"ttl"`
|
||||
HopCount int `json:"hop_count"`
|
||||
Visited []string `json:"visited"`
|
||||
Sequence uint64 `json:"sequence"`
|
||||
SentAt time.Time `json:"sent_at"`
|
||||
Payload json.RawMessage `json:"payload,omitempty"`
|
||||
}
|
||||
|
||||
// SyntheticProbePayload is the JSON body of a synthetic probe: a probe ID and
// the time it was sent.
type SyntheticProbePayload struct {
|
||||
ProbeID string `json:"probe_id"`
|
||||
SentAt time.Time `json:"sent_at"`
|
||||
}
|
||||
|
||||
// SyntheticProbeAckPayload is the JSON body of a probe acknowledgement: the
// probe ID, a path of string identifiers, and the acceptance time.
type SyntheticProbeAckPayload struct {
|
||||
ProbeID string `json:"probe_id"`
|
||||
Path []string `json:"path"`
|
||||
AcceptedAt time.Time `json:"accepted_at"`
|
||||
}
|
||||
|
||||
// SyntheticRouteObservation is the JSON health record for one route: its
// state string, last success/failure details, counters, last latency, and
// version strings.
// NOTE: encoding/json's omitempty has no effect on struct-typed fields, so
// LastSuccessAt and LastFailureAt are always emitted, even when zero.
type SyntheticRouteObservation struct {
|
||||
RouteID string `json:"route_id"`
|
||||
State string `json:"state"`
|
||||
LastSuccessAt time.Time `json:"last_success_at,omitempty"`
|
||||
LastFailureAt time.Time `json:"last_failure_at,omitempty"`
|
||||
LastFailureReason string `json:"last_failure_reason,omitempty"`
|
||||
SuccessCount uint64 `json:"success_count"`
|
||||
FailureCount uint64 `json:"failure_count"`
|
||||
LastLatencyMs int64 `json:"last_latency_ms,omitempty"`
|
||||
RouteVersion string `json:"route_version,omitempty"`
|
||||
PolicyVersion string `json:"policy_version,omitempty"`
|
||||
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
|
||||
}
|
||||
|
||||
// SyntheticRouteHealthResult is the JSON result of a route health check: the
// requested and selected route IDs, whether a fallback was used, the
// acknowledgement envelope, and the resulting route observation.
type SyntheticRouteHealthResult struct {
|
||||
RequestedRouteID string `json:"requested_route_id"`
|
||||
SelectedRouteID string `json:"selected_route_id"`
|
||||
FallbackUsed bool `json:"fallback_used"`
|
||||
Ack SyntheticEnvelope `json:"ack"`
|
||||
Observation SyntheticRouteObservation `json:"observation"`
|
||||
}
|
||||
|
||||
// SyntheticTestServiceRequest is the JSON request for the synthetic test
// service: request and organization IDs, service type, a string payload, and
// the send time.
type SyntheticTestServiceRequest struct {
|
||||
RequestID string `json:"request_id"`
|
||||
OrganizationID string `json:"organization_id"`
|
||||
ServiceType string `json:"service_type"`
|
||||
Payload string `json:"payload"`
|
||||
SentAt time.Time `json:"sent_at"`
|
||||
}
|
||||
|
||||
// SyntheticTestServiceResponse is the JSON response of the synthetic test
// service: the request's identifiers and service type, an echo payload, a path
// of string identifiers, and the acceptance time.
type SyntheticTestServiceResponse struct {
|
||||
RequestID string `json:"request_id"`
|
||||
OrganizationID string `json:"organization_id"`
|
||||
ServiceType string `json:"service_type"`
|
||||
EchoPayload string `json:"echo_payload"`
|
||||
Path []string `json:"path"`
|
||||
AcceptedAt time.Time `json:"accepted_at"`
|
||||
}
|
||||
|
||||
// SyntheticTestServiceResult is the JSON result of a test-service call: the
// requested and selected route IDs, whether a fallback was used, the
// acknowledgement envelope, the decoded response, and the route observation.
type SyntheticTestServiceResult struct {
|
||||
RequestedRouteID string `json:"requested_route_id"`
|
||||
SelectedRouteID string `json:"selected_route_id"`
|
||||
FallbackUsed bool `json:"fallback_used"`
|
||||
Ack SyntheticEnvelope `json:"ack"`
|
||||
Response SyntheticTestServiceResponse `json:"response"`
|
||||
Observation SyntheticRouteObservation `json:"observation"`
|
||||
}
|
||||
|
||||
// SyntheticRouteCacheVersion groups the route, policy, and peer-directory
// version strings; each is omitted from JSON when empty.
type SyntheticRouteCacheVersion struct {
|
||||
RouteVersion string `json:"route_version,omitempty"`
|
||||
PolicyVersion string `json:"policy_version,omitempty"`
|
||||
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
|
||||
}
|
||||
|
||||
// SyntheticRelayQueuePolicy is the JSON policy for one relay queue: the
// channel name, its capacity, and a droppable flag.
// NOTE(review): the exact drop behavior implied by Droppable is implemented
// elsewhere — confirm against the relay queue code.
type SyntheticRelayQueuePolicy struct {
|
||||
Channel string `json:"channel"`
|
||||
Capacity int `json:"capacity"`
|
||||
Droppable bool `json:"droppable"`
|
||||
}
|
||||
|
||||
// SyntheticRelayEnqueueResult is the JSON outcome of a relay enqueue: the
// channel, queue depth and capacity, whether something was dropped (and its
// sequence, omitted when zero), and the accepted sequence number.
type SyntheticRelayEnqueueResult struct {
|
||||
Channel string `json:"channel"`
|
||||
QueueDepth int `json:"queue_depth"`
|
||||
QueueCapacity int `json:"queue_capacity"`
|
||||
Dropped bool `json:"dropped"`
|
||||
DroppedSequence uint64 `json:"dropped_sequence,omitempty"`
|
||||
AcceptedSequence uint64 `json:"accepted_sequence"`
|
||||
}
|
||||
|
||||
// SyntheticRelayQueueMetrics is the JSON counter set for the relay queues:
// enqueued/dequeued/dropped/rejected totals, the last reject reason, and a
// map of queue depths keyed by string (presumably the channel name — confirm).
type SyntheticRelayQueueMetrics struct {
|
||||
Enqueued uint64 `json:"enqueued"`
|
||||
Dequeued uint64 `json:"dequeued"`
|
||||
Dropped uint64 `json:"dropped"`
|
||||
Rejected uint64 `json:"rejected"`
|
||||
LastRejectReason string `json:"last_reject_reason,omitempty"`
|
||||
QueueDepths map[string]int `json:"queue_depths"`
|
||||
}
|
||||
|
||||
// HealthMessage is the JSON request body sent by Client.SendHealth.
// LatencyMs and QualityScore are pointers so that an absent value (nil, omitted
// from JSON) is distinguishable from an explicit zero.
type HealthMessage struct {
|
||||
ProtocolVersion string `json:"protocol_version"`
|
||||
From PeerIdentity `json:"from"`
|
||||
To PeerIdentity `json:"to"`
|
||||
ObservedAt time.Time `json:"observed_at"`
|
||||
LinkStatus string `json:"link_status"`
|
||||
LatencyMs *int `json:"latency_ms,omitempty"`
|
||||
QualityScore *int `json:"quality_score,omitempty"`
|
||||
}
|
||||
|
||||
// HealthAck is the JSON response decoded by Client.SendHealth: the protocol
// version, an accepted flag, and the identity of the acknowledging peer.
type HealthAck struct {
|
||||
ProtocolVersion string `json:"protocol_version"`
|
||||
Accepted bool `json:"accepted"`
|
||||
By PeerIdentity `json:"by"`
|
||||
}
|
||||
|
||||
// ProductionEnvelope is the JSON request body sent by Client.SendProduction.
// It carries routing metadata (route, hops, TTL), timing (created/expires),
// and a raw JSON payload together with its declared length and hash.
// NOTE(review): the hash algorithm and whether PayloadLength counts raw bytes
// are not visible here — confirm against the envelope validation code.
type ProductionEnvelope struct {
|
||||
FabricProtocolVersion string `json:"fabric_protocol_version"`
|
||||
MessageID string `json:"message_id"`
|
||||
RouteID string `json:"route_id"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
SourceNodeID string `json:"source_node_id"`
|
||||
DestinationNodeID string `json:"destination_node_id"`
|
||||
CurrentHopNodeID string `json:"current_hop_node_id"`
|
||||
NextHopNodeID string `json:"next_hop_node_id"`
|
||||
RoutePath []string `json:"route_path,omitempty"`
|
||||
VisitedNodeIDs []string `json:"visited_node_ids,omitempty"`
|
||||
ChannelClass string `json:"channel_class"`
|
||||
MessageType string `json:"message_type"`
|
||||
TTL int `json:"ttl"`
|
||||
HopCount int `json:"hop_count"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
ExpiresAt time.Time `json:"expires_at"`
|
||||
PayloadLength int `json:"payload_length"`
|
||||
PayloadHash string `json:"payload_hash"`
|
||||
Payload json.RawMessage `json:"payload,omitempty"`
|
||||
}
|
||||
|
||||
// ProductionEnvelopeObservation is a JSON record of an observed production
// envelope. It repeats the envelope's routing metadata and payload
// length/hash, adds ObservedAt, and has no payload, protocol version, or
// created/expires fields.
type ProductionEnvelopeObservation struct {
|
||||
MessageID string `json:"message_id"`
|
||||
RouteID string `json:"route_id"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
SourceNodeID string `json:"source_node_id"`
|
||||
DestinationNodeID string `json:"destination_node_id"`
|
||||
CurrentHopNodeID string `json:"current_hop_node_id"`
|
||||
NextHopNodeID string `json:"next_hop_node_id"`
|
||||
RoutePath []string `json:"route_path,omitempty"`
|
||||
VisitedNodeIDs []string `json:"visited_node_ids,omitempty"`
|
||||
ChannelClass string `json:"channel_class"`
|
||||
MessageType string `json:"message_type"`
|
||||
TTL int `json:"ttl"`
|
||||
HopCount int `json:"hop_count"`
|
||||
PayloadLength int `json:"payload_length"`
|
||||
PayloadHash string `json:"payload_hash"`
|
||||
ObservedAt time.Time `json:"observed_at"`
|
||||
}
|
||||
|
||||
// ProductionForwardResult is the JSON response decoded by
// Client.SendProduction: accepted/delivered/forwarded flags, the responding
// peer, the message and route IDs, and the next node ID (omitted when empty).
type ProductionForwardResult struct {
|
||||
Accepted bool `json:"accepted"`
|
||||
Delivered bool `json:"delivered"`
|
||||
Forwarded bool `json:"forwarded"`
|
||||
By PeerIdentity `json:"by"`
|
||||
MessageID string `json:"message_id"`
|
||||
RouteID string `json:"route_id"`
|
||||
NextNodeID string `json:"next_node_id,omitempty"`
|
||||
}
|
||||
|
||||
// ProductionForwardLogEntry is a JSON log record for a production forwarding
// event. Only Event and OccurredAt are always emitted; every other field is
// omitted when it holds its zero value. The entry records counts and lengths
// (route path length, visited count, payload length) and has no field for the
// payload itself.
type ProductionForwardLogEntry struct {
|
||||
Event string `json:"event"`
|
||||
RouteID string `json:"route_id,omitempty"`
|
||||
MessageID string `json:"message_id,omitempty"`
|
||||
ClusterID string `json:"cluster_id,omitempty"`
|
||||
LocalNodeID string `json:"local_node_id,omitempty"`
|
||||
SourceNodeID string `json:"source_node_id,omitempty"`
|
||||
DestinationNodeID string `json:"destination_node_id,omitempty"`
|
||||
CurrentHopNodeID string `json:"current_hop_node_id,omitempty"`
|
||||
NextHopNodeID string `json:"next_hop_node_id,omitempty"`
|
||||
ChannelClass string `json:"channel_class,omitempty"`
|
||||
MessageType string `json:"message_type,omitempty"`
|
||||
Reason string `json:"reason,omitempty"`
|
||||
StatusCode int `json:"status_code,omitempty"`
|
||||
TTL int `json:"ttl,omitempty"`
|
||||
HopCount int `json:"hop_count,omitempty"`
|
||||
RoutePathLength int `json:"route_path_length,omitempty"`
|
||||
VisitedCount int `json:"visited_count,omitempty"`
|
||||
PayloadLength int `json:"payload_length,omitempty"`
|
||||
OccurredAt time.Time `json:"occurred_at"`
|
||||
}
|
||||
|
||||
func ValidatePeer(local PeerIdentity, remote PeerIdentity) error {
|
||||
if local.ClusterID == "" || remote.ClusterID == "" || local.ClusterID != remote.ClusterID {
|
||||
return ErrClusterMismatch
|
||||
}
|
||||
if remote.NodeID == "" {
|
||||
return ErrNodeMismatch
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,258 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// EndpointCandidateScoreOptions tunes RankPeerEndpointCandidates.
//   - ChannelClass: the fabric-control and route-control channel values enable
//     the control-channel adjustments.
//   - PreferredRegion: compared case-insensitively with the candidate region;
//     ignored when either side is empty.
//   - Now: reference time for freshness checks; a zero value disables them.
//   - MaxVerificationAge / MaxObservationAge: freshness windows; values <= 0
//     disable the respective check.
//   - Observations: health overlays looked up by candidate EndpointID.
type EndpointCandidateScoreOptions struct {
|
||||
ChannelClass string
|
||||
PreferredRegion string
|
||||
Now time.Time
|
||||
MaxVerificationAge time.Duration
|
||||
Observations map[string]EndpointCandidateHealthObservation
|
||||
MaxObservationAge time.Duration
|
||||
}
|
||||
|
||||
// EndpointCandidateHealthObservation is the measured health of one endpoint,
// used as a scoring overlay. ReliabilityScore is only considered when it is
// greater than zero (>= 90 counts as high, >= 70 as moderate), and a
// non-blank LastFailureReason adds a penalty.
// NOTE: encoding/json's omitempty has no effect on struct-typed fields, so
// ObservedAt is always emitted, even when zero.
type EndpointCandidateHealthObservation struct {
|
||||
EndpointID string `json:"endpoint_id"`
|
||||
LastLatencyMs int64 `json:"last_latency_ms,omitempty"`
|
||||
SuccessCount uint64 `json:"success_count,omitempty"`
|
||||
FailureCount uint64 `json:"failure_count,omitempty"`
|
||||
LastFailureReason string `json:"last_failure_reason,omitempty"`
|
||||
ReliabilityScore int `json:"reliability_score,omitempty"`
|
||||
ObservedAt time.Time `json:"observed_at,omitempty"`
|
||||
}
|
||||
|
||||
// ScoredPeerEndpointCandidate is one ranking result: the candidate, its total
// score (higher ranks first), and the reason tags that contributed to the
// score in the order they were applied.
type ScoredPeerEndpointCandidate struct {
|
||||
Candidate PeerEndpointCandidate `json:"candidate"`
|
||||
Score int `json:"score"`
|
||||
Reasons []string `json:"reasons,omitempty"`
|
||||
}
|
||||
|
||||
func RankPeerEndpointCandidates(candidates []PeerEndpointCandidate, opts EndpointCandidateScoreOptions) []ScoredPeerEndpointCandidate {
|
||||
if len(candidates) == 0 {
|
||||
return nil
|
||||
}
|
||||
out := make([]ScoredPeerEndpointCandidate, 0, len(candidates))
|
||||
for _, candidate := range candidates {
|
||||
out = append(out, scorePeerEndpointCandidate(candidate, opts))
|
||||
}
|
||||
sort.SliceStable(out, func(i, j int) bool {
|
||||
if out[i].Score != out[j].Score {
|
||||
return out[i].Score > out[j].Score
|
||||
}
|
||||
if out[i].Candidate.Priority != out[j].Candidate.Priority {
|
||||
return out[i].Candidate.Priority < out[j].Candidate.Priority
|
||||
}
|
||||
if out[i].Candidate.NodeID != out[j].Candidate.NodeID {
|
||||
return out[i].Candidate.NodeID < out[j].Candidate.NodeID
|
||||
}
|
||||
return out[i].Candidate.EndpointID < out[j].Candidate.EndpointID
|
||||
})
|
||||
return out
|
||||
}
|
||||
|
||||
func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCandidateScoreOptions) ScoredPeerEndpointCandidate {
|
||||
score := 100
|
||||
reasons := []string{"base"}
|
||||
|
||||
switch candidate.Transport {
|
||||
case "direct_tcp_tls":
|
||||
score += 35
|
||||
reasons = append(reasons, "transport:direct_tcp_tls")
|
||||
case "wss":
|
||||
score += 25
|
||||
reasons = append(reasons, "transport:wss")
|
||||
case "outbound_reverse":
|
||||
score += 10
|
||||
reasons = append(reasons, "transport:outbound_reverse")
|
||||
case "relay":
|
||||
score += 5
|
||||
reasons = append(reasons, "transport:relay")
|
||||
default:
|
||||
score -= 100
|
||||
reasons = append(reasons, "transport:unknown")
|
||||
}
|
||||
|
||||
switch candidate.Reachability {
|
||||
case "public":
|
||||
score += 30
|
||||
reasons = append(reasons, "reachability:public")
|
||||
case "private":
|
||||
score += 15
|
||||
reasons = append(reasons, "reachability:private")
|
||||
case "relay":
|
||||
score += 5
|
||||
reasons = append(reasons, "reachability:relay")
|
||||
case "outbound_only":
|
||||
score -= 5
|
||||
reasons = append(reasons, "reachability:outbound_only")
|
||||
default:
|
||||
score -= 15
|
||||
reasons = append(reasons, "reachability:unknown")
|
||||
}
|
||||
|
||||
switch candidate.ConnectivityMode {
|
||||
case "direct":
|
||||
score += 30
|
||||
reasons = append(reasons, "connectivity:direct")
|
||||
case "outbound_only":
|
||||
score += 5
|
||||
reasons = append(reasons, "connectivity:outbound_only")
|
||||
case "relay_required":
|
||||
score -= 5
|
||||
reasons = append(reasons, "connectivity:relay_required")
|
||||
default:
|
||||
score -= 10
|
||||
reasons = append(reasons, "connectivity:unknown")
|
||||
}
|
||||
|
||||
switch candidate.NATType {
|
||||
case "", "none":
|
||||
score += 15
|
||||
reasons = append(reasons, "nat:none")
|
||||
case "full_cone":
|
||||
score += 10
|
||||
reasons = append(reasons, "nat:full_cone")
|
||||
case "restricted", "port_restricted":
|
||||
score += 3
|
||||
reasons = append(reasons, "nat:restricted")
|
||||
case "symmetric":
|
||||
score -= 20
|
||||
reasons = append(reasons, "nat:symmetric")
|
||||
case "blocked":
|
||||
score -= 60
|
||||
reasons = append(reasons, "nat:blocked")
|
||||
default:
|
||||
score -= 8
|
||||
reasons = append(reasons, "nat:unknown")
|
||||
}
|
||||
|
||||
if candidate.Priority > 0 {
|
||||
score -= candidate.Priority
|
||||
reasons = append(reasons, "priority")
|
||||
}
|
||||
if opts.PreferredRegion != "" && candidate.Region != "" {
|
||||
if strings.EqualFold(candidate.Region, opts.PreferredRegion) {
|
||||
score += 12
|
||||
reasons = append(reasons, "region:preferred")
|
||||
} else {
|
||||
score -= 4
|
||||
reasons = append(reasons, "region:remote")
|
||||
}
|
||||
}
|
||||
if hasPolicyTag(candidate.PolicyTags, "fast-path") {
|
||||
score += 10
|
||||
reasons = append(reasons, "policy:fast-path")
|
||||
}
|
||||
if hasPolicyTag(candidate.PolicyTags, "private-lan") || hasPolicyTag(candidate.PolicyTags, "corp-lan") || hasPolicyTag(candidate.PolicyTags, "same-site") {
|
||||
score += 18
|
||||
reasons = append(reasons, "policy:private-lan")
|
||||
}
|
||||
if hasPolicyTag(candidate.PolicyTags, "costly") {
|
||||
score -= 10
|
||||
reasons = append(reasons, "policy:costly")
|
||||
}
|
||||
if opts.ChannelClass == SyntheticChannelFabricControl || opts.ChannelClass == SyntheticChannelRouteControl {
|
||||
if candidate.ConnectivityMode == "direct" {
|
||||
score += 8
|
||||
reasons = append(reasons, "channel:control-direct")
|
||||
}
|
||||
if candidate.Transport == "relay" {
|
||||
score -= 8
|
||||
reasons = append(reasons, "channel:control-relay-penalty")
|
||||
}
|
||||
}
|
||||
if !opts.Now.IsZero() && candidate.LastVerifiedAt != nil && opts.MaxVerificationAge > 0 {
|
||||
age := opts.Now.Sub(candidate.LastVerifiedAt.UTC())
|
||||
if age >= 0 && age <= opts.MaxVerificationAge {
|
||||
score += 8
|
||||
reasons = append(reasons, "verified:fresh")
|
||||
} else {
|
||||
score -= 12
|
||||
reasons = append(reasons, "verified:stale")
|
||||
}
|
||||
}
|
||||
if observation, ok := opts.Observations[candidate.EndpointID]; ok {
|
||||
observationScore, observationReasons := scoreEndpointCandidateObservation(observation, opts)
|
||||
score += observationScore
|
||||
reasons = append(reasons, observationReasons...)
|
||||
}
|
||||
|
||||
return ScoredPeerEndpointCandidate{
|
||||
Candidate: candidate,
|
||||
Score: score,
|
||||
Reasons: reasons,
|
||||
}
|
||||
}
|
||||
|
||||
func scoreEndpointCandidateObservation(observation EndpointCandidateHealthObservation, opts EndpointCandidateScoreOptions) (int, []string) {
|
||||
score := 0
|
||||
reasons := []string{"observation:present"}
|
||||
if !opts.Now.IsZero() && !observation.ObservedAt.IsZero() && opts.MaxObservationAge > 0 {
|
||||
age := opts.Now.Sub(observation.ObservedAt.UTC())
|
||||
if age < 0 || age > opts.MaxObservationAge {
|
||||
return -12, []string{"observation:stale"}
|
||||
}
|
||||
score += 6
|
||||
reasons = append(reasons, "observation:fresh")
|
||||
}
|
||||
switch {
|
||||
case observation.LastLatencyMs > 0 && observation.LastLatencyMs <= 50:
|
||||
score += 18
|
||||
reasons = append(reasons, "latency:low")
|
||||
case observation.LastLatencyMs <= 150:
|
||||
score += 8
|
||||
reasons = append(reasons, "latency:moderate")
|
||||
case observation.LastLatencyMs > 0:
|
||||
score -= 10
|
||||
reasons = append(reasons, "latency:high")
|
||||
}
|
||||
if observation.ReliabilityScore > 0 {
|
||||
switch {
|
||||
case observation.ReliabilityScore >= 90:
|
||||
score += 15
|
||||
reasons = append(reasons, "reliability:high")
|
||||
case observation.ReliabilityScore >= 70:
|
||||
score += 5
|
||||
reasons = append(reasons, "reliability:moderate")
|
||||
default:
|
||||
score -= 12
|
||||
reasons = append(reasons, "reliability:low")
|
||||
}
|
||||
}
|
||||
if observation.SuccessCount > 0 {
|
||||
score += boundedInt(int(observation.SuccessCount), 1, 10)
|
||||
reasons = append(reasons, "history:success")
|
||||
}
|
||||
if observation.FailureCount > 0 {
|
||||
score -= boundedInt(int(observation.FailureCount)*6, 6, 30)
|
||||
reasons = append(reasons, "history:failure")
|
||||
}
|
||||
if strings.TrimSpace(observation.LastFailureReason) != "" {
|
||||
score -= 8
|
||||
reasons = append(reasons, "failure:recent")
|
||||
}
|
||||
return score, reasons
|
||||
}
|
||||
|
||||
// hasPolicyTag reports whether tags contains needle. Each tag is trimmed of
// surrounding whitespace and compared case-insensitively; needle itself is
// used as given.
func hasPolicyTag(tags []string, needle string) bool {
	for idx := 0; idx < len(tags); idx++ {
		if trimmed := strings.TrimSpace(tags[idx]); strings.EqualFold(trimmed, needle) {
			return true
		}
	}
	return false
}
|
||||
|
||||
// boundedInt clamps value to the range [minValue, maxValue]. The lower bound
// is checked first, so it wins if the bounds are inverted.
func boundedInt(value, minValue, maxValue int) int {
	switch {
	case value < minValue:
		return minValue
	case value > maxValue:
		return maxValue
	default:
		return value
	}
}
|
||||
@@ -0,0 +1,278 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestRankPeerEndpointCandidatesPrefersDirectFreshPublicPath(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
fresh := now.Add(-time.Minute)
|
||||
stale := now.Add(-2 * time.Hour)
|
||||
candidates := []PeerEndpointCandidate{
|
||||
{
|
||||
EndpointID: "node-b-relay",
|
||||
NodeID: "node-b",
|
||||
Transport: "relay",
|
||||
Address: "relay.example.test/node-b",
|
||||
Reachability: "relay",
|
||||
NATType: "symmetric",
|
||||
ConnectivityMode: "relay_required",
|
||||
Region: "us",
|
||||
Priority: 1,
|
||||
LastVerifiedAt: &fresh,
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-public",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "203.0.113.20:443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
Region: "eu",
|
||||
Priority: 10,
|
||||
PolicyTags: []string{"fast-path"},
|
||||
LastVerifiedAt: &fresh,
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-private-stale",
|
||||
NodeID: "node-b",
|
||||
Transport: "wss",
|
||||
Address: "10.0.0.5:443",
|
||||
Reachability: "private",
|
||||
NATType: "restricted",
|
||||
ConnectivityMode: "direct",
|
||||
Region: "eu",
|
||||
Priority: 5,
|
||||
LastVerifiedAt: &stale,
|
||||
},
|
||||
}
|
||||
|
||||
ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
|
||||
ChannelClass: SyntheticChannelFabricControl,
|
||||
PreferredRegion: "eu",
|
||||
Now: now,
|
||||
MaxVerificationAge: time.Hour,
|
||||
})
|
||||
if len(ranked) != 3 {
|
||||
t.Fatalf("ranked length = %d, want 3", len(ranked))
|
||||
}
|
||||
if ranked[0].Candidate.EndpointID != "node-b-public" {
|
||||
t.Fatalf("top endpoint = %q, want node-b-public: %+v", ranked[0].Candidate.EndpointID, ranked)
|
||||
}
|
||||
if ranked[0].Score <= ranked[1].Score {
|
||||
t.Fatalf("top score = %d, second = %d", ranked[0].Score, ranked[1].Score)
|
||||
}
|
||||
if !containsReason(ranked[0].Reasons, "policy:fast-path") || !containsReason(ranked[0].Reasons, "verified:fresh") {
|
||||
t.Fatalf("top reasons missing expected hints: %+v", ranked[0].Reasons)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRankPeerEndpointCandidatesUsesDeterministicTieBreak(t *testing.T) {
|
||||
candidates := []PeerEndpointCandidate{
|
||||
{
|
||||
EndpointID: "endpoint-b",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "203.0.113.21:443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 10,
|
||||
},
|
||||
{
|
||||
EndpointID: "endpoint-a",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "203.0.113.20:443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 10,
|
||||
},
|
||||
}
|
||||
|
||||
ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{})
|
||||
if ranked[0].Candidate.EndpointID != "endpoint-a" {
|
||||
t.Fatalf("tie top endpoint = %q, want endpoint-a", ranked[0].Candidate.EndpointID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRankPeerEndpointCandidatesPrefersCorporatePrivateEndpoint(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
candidates := []PeerEndpointCandidate{
|
||||
{
|
||||
EndpointID: "node-b-public",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "203.0.113.20:443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
Region: "corp-eu",
|
||||
Priority: 10,
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-corp-lan",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "10.24.10.20:19001",
|
||||
Reachability: "private",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
Region: "corp-eu",
|
||||
Priority: 1,
|
||||
PolicyTags: []string{"corp-lan", "same-site"},
|
||||
},
|
||||
}
|
||||
|
||||
ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
|
||||
ChannelClass: SyntheticChannelFabricControl,
|
||||
PreferredRegion: "corp-eu",
|
||||
Now: now,
|
||||
})
|
||||
if ranked[0].Candidate.EndpointID != "node-b-corp-lan" {
|
||||
t.Fatalf("top endpoint = %q, want node-b-corp-lan: %+v", ranked[0].Candidate.EndpointID, ranked)
|
||||
}
|
||||
if !containsReason(ranked[0].Reasons, "policy:private-lan") || !containsReason(ranked[0].Reasons, "region:preferred") {
|
||||
t.Fatalf("corp LAN reasons missing: %+v", ranked[0].Reasons)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRankPeerEndpointCandidatesDoesNotDropRelayRequiredFallback(t *testing.T) {
|
||||
candidates := []PeerEndpointCandidate{
|
||||
{
|
||||
EndpointID: "node-b-outbound",
|
||||
NodeID: "node-b",
|
||||
Transport: "outbound_reverse",
|
||||
Address: "node-b.reverse.local",
|
||||
Reachability: "outbound_only",
|
||||
NATType: "symmetric",
|
||||
ConnectivityMode: "outbound_only",
|
||||
Priority: 20,
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-relay",
|
||||
NodeID: "node-b",
|
||||
Transport: "relay",
|
||||
Address: "relay.example.test/node-b",
|
||||
Reachability: "relay",
|
||||
NATType: "blocked",
|
||||
ConnectivityMode: "relay_required",
|
||||
Priority: 30,
|
||||
},
|
||||
}
|
||||
|
||||
ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
|
||||
ChannelClass: SyntheticChannelRouteControl,
|
||||
})
|
||||
if len(ranked) != 2 {
|
||||
t.Fatalf("ranked length = %d, want 2", len(ranked))
|
||||
}
|
||||
for _, item := range ranked {
|
||||
if item.Candidate.EndpointID == "" {
|
||||
t.Fatalf("ranked candidate lost identity: %+v", item)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRankPeerEndpointCandidatesUsesHealthObservationOverlay(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 13, 0, 0, 0, time.UTC)
|
||||
candidates := []PeerEndpointCandidate{
|
||||
{
|
||||
EndpointID: "node-b-direct",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "203.0.113.20:443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 10,
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-wss",
|
||||
NodeID: "node-b",
|
||||
Transport: "wss",
|
||||
Address: "node-b.example.test",
|
||||
Reachability: "public",
|
||||
NATType: "restricted",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 10,
|
||||
},
|
||||
}
|
||||
|
||||
ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
|
||||
Now: now,
|
||||
MaxObservationAge: 5 * time.Minute,
|
||||
Observations: map[string]EndpointCandidateHealthObservation{
|
||||
"node-b-direct": {
|
||||
EndpointID: "node-b-direct",
|
||||
LastLatencyMs: 240,
|
||||
FailureCount: 3,
|
||||
LastFailureReason: "connect_timeout",
|
||||
ReliabilityScore: 50,
|
||||
ObservedAt: now.Add(-time.Minute),
|
||||
},
|
||||
"node-b-wss": {
|
||||
EndpointID: "node-b-wss",
|
||||
LastLatencyMs: 35,
|
||||
SuccessCount: 8,
|
||||
ReliabilityScore: 95,
|
||||
ObservedAt: now.Add(-time.Minute),
|
||||
},
|
||||
},
|
||||
})
|
||||
if ranked[0].Candidate.EndpointID != "node-b-wss" {
|
||||
t.Fatalf("top endpoint = %q, want node-b-wss: %+v", ranked[0].Candidate.EndpointID, ranked)
|
||||
}
|
||||
if !containsReason(ranked[0].Reasons, "latency:low") || !containsReason(ranked[0].Reasons, "reliability:high") {
|
||||
t.Fatalf("top reasons missing health hints: %+v", ranked[0].Reasons)
|
||||
}
|
||||
if !containsReason(ranked[1].Reasons, "history:failure") || !containsReason(ranked[1].Reasons, "failure:recent") {
|
||||
t.Fatalf("failed endpoint reasons missing failure hints: %+v", ranked[1].Reasons)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRankPeerEndpointCandidatesTreatsStaleObservationAsPenalty(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 13, 0, 0, 0, time.UTC)
|
||||
candidates := []PeerEndpointCandidate{
|
||||
{
|
||||
EndpointID: "node-b-direct",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "203.0.113.20:443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 10,
|
||||
},
|
||||
}
|
||||
|
||||
ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
|
||||
Now: now,
|
||||
MaxObservationAge: 5 * time.Minute,
|
||||
Observations: map[string]EndpointCandidateHealthObservation{
|
||||
"node-b-direct": {
|
||||
EndpointID: "node-b-direct",
|
||||
LastLatencyMs: 20,
|
||||
ObservedAt: now.Add(-time.Hour),
|
||||
},
|
||||
},
|
||||
})
|
||||
if !containsReason(ranked[0].Reasons, "observation:stale") {
|
||||
t.Fatalf("reasons missing stale observation: %+v", ranked[0].Reasons)
|
||||
}
|
||||
if containsReason(ranked[0].Reasons, "latency:low") {
|
||||
t.Fatalf("stale observation should not contribute latency: %+v", ranked[0].Reasons)
|
||||
}
|
||||
}
|
||||
|
||||
// containsReason reports whether reason occurs verbatim in reasons.
func containsReason(reasons []string, reason string) bool {
	for i := 0; i < len(reasons); i++ {
		if reasons[i] == reason {
			return true
		}
	}
	return false
}
|
||||
@@ -0,0 +1,42 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// HTTPPeerTransport delivers synthetic mesh envelopes over HTTP to peers whose
// base URLs were configured explicitly. The scope is deliberately small:
// production forwarding stays disabled, and only SyntheticRuntime messages are
// sent through this transport.
//
// PeerURLs maps a peer node ID to that peer's base URL. HTTPClient, when
// non-nil, replaces the HTTP client of the per-call mesh client.
type HTTPPeerTransport struct {
	PeerURLs   map[string]string
	HTTPClient *http.Client
}
|
||||
|
||||
func NewHTTPPeerTransport(peerURLs map[string]string) *HTTPPeerTransport {
|
||||
normalized := make(map[string]string, len(peerURLs))
|
||||
for nodeID, baseURL := range peerURLs {
|
||||
nodeID = strings.TrimSpace(nodeID)
|
||||
baseURL = strings.TrimRight(strings.TrimSpace(baseURL), "/")
|
||||
if nodeID != "" && baseURL != "" {
|
||||
normalized[nodeID] = baseURL
|
||||
}
|
||||
}
|
||||
return &HTTPPeerTransport{PeerURLs: normalized}
|
||||
}
|
||||
|
||||
func (t *HTTPPeerTransport) SendSynthetic(ctx context.Context, nextNodeID string, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
|
||||
if t == nil {
|
||||
return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable
|
||||
}
|
||||
baseURL := strings.TrimRight(strings.TrimSpace(t.PeerURLs[nextNodeID]), "/")
|
||||
if baseURL == "" {
|
||||
return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable
|
||||
}
|
||||
client := NewClient(baseURL)
|
||||
if t.HTTPClient != nil {
|
||||
client.HTTPClient = t.HTTPClient
|
||||
}
|
||||
return client.SendSynthetic(ctx, envelope)
|
||||
}
|
||||
@@ -0,0 +1,130 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestHTTPPeerTransportDirectSyntheticProbe(t *testing.T) {
|
||||
nodeA := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
|
||||
defer nodeA.Close()
|
||||
nodeB := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"})
|
||||
defer nodeB.Close()
|
||||
|
||||
route := liveSyntheticRoute("route-direct", []string{"node-a", "node-b"})
|
||||
routes := []SyntheticRoute{route}
|
||||
nodeA.Runtime = newLiveRuntime(nodeA.Local, routes, map[string]string{"node-b": nodeB.URL})
|
||||
nodeB.Runtime = newLiveRuntime(nodeB.Local, routes, map[string]string{})
|
||||
|
||||
ack, err := nodeA.Runtime.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-live-direct")
|
||||
if err != nil {
|
||||
t.Fatalf("send live direct probe: %v", err)
|
||||
}
|
||||
if ack.MessageType != SyntheticMessageProbeAck {
|
||||
t.Fatalf("MessageType = %q, want %q", ack.MessageType, SyntheticMessageProbeAck)
|
||||
}
|
||||
payload := decodeAckPayload(t, ack)
|
||||
if got, want := payload.Path, []string{"node-a", "node-b"}; !sameStrings(got, want) {
|
||||
t.Fatalf("path = %v, want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHTTPPeerTransportSingleRelaySyntheticProbe(t *testing.T) {
|
||||
nodeA := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
|
||||
defer nodeA.Close()
|
||||
nodeR := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"})
|
||||
defer nodeR.Close()
|
||||
nodeB := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"})
|
||||
defer nodeB.Close()
|
||||
|
||||
route := liveSyntheticRoute("route-relay", []string{"node-a", "node-r", "node-b"})
|
||||
routes := []SyntheticRoute{route}
|
||||
nodeA.Runtime = newLiveRuntime(nodeA.Local, routes, map[string]string{"node-r": nodeR.URL})
|
||||
nodeR.Runtime = newLiveRuntime(nodeR.Local, routes, map[string]string{"node-b": nodeB.URL})
|
||||
nodeB.Runtime = newLiveRuntime(nodeB.Local, routes, map[string]string{})
|
||||
|
||||
ack, err := nodeA.Runtime.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-live-relay")
|
||||
if err != nil {
|
||||
t.Fatalf("send live relay probe: %v", err)
|
||||
}
|
||||
if ack.MessageType != SyntheticMessageProbeAck {
|
||||
t.Fatalf("MessageType = %q, want %q", ack.MessageType, SyntheticMessageProbeAck)
|
||||
}
|
||||
payload := decodeAckPayload(t, ack)
|
||||
if got, want := payload.Path, []string{"node-a", "node-r", "node-b"}; !sameStrings(got, want) {
|
||||
t.Fatalf("path = %v, want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHTTPPeerTransportMissingPeer(t *testing.T) {
|
||||
transport := NewHTTPPeerTransport(map[string]string{})
|
||||
_, err := transport.SendSynthetic(context.Background(), "node-missing", SyntheticEnvelope{})
|
||||
if !errors.Is(err, ErrSyntheticPeerUnavailable) {
|
||||
t.Fatalf("err = %v, want ErrSyntheticPeerUnavailable", err)
|
||||
}
|
||||
}
|
||||
|
||||
type liveSyntheticNode struct {
|
||||
Local PeerIdentity
|
||||
Runtime *SyntheticRuntime
|
||||
URL string
|
||||
server *httptest.Server
|
||||
}
|
||||
|
||||
func newLiveSyntheticNode(t *testing.T, local PeerIdentity) *liveSyntheticNode {
|
||||
t.Helper()
|
||||
node := &liveSyntheticNode{Local: local}
|
||||
node.server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
Server{Local: node.Local, SyntheticRuntime: node.Runtime}.Handler().ServeHTTP(w, r)
|
||||
}))
|
||||
node.URL = node.server.URL
|
||||
return node
|
||||
}
|
||||
|
||||
func (n *liveSyntheticNode) Close() {
|
||||
if n.server != nil {
|
||||
n.server.Close()
|
||||
}
|
||||
}
|
||||
|
||||
func newLiveRuntime(local PeerIdentity, routes []SyntheticRoute, peers map[string]string) *SyntheticRuntime {
|
||||
return NewSyntheticRuntime(SyntheticRuntimeConfig{
|
||||
Enabled: true,
|
||||
Local: local,
|
||||
Routes: routes,
|
||||
Transport: NewHTTPPeerTransport(peers),
|
||||
})
|
||||
}
|
||||
|
||||
func liveSyntheticRoute(routeID string, hops []string) SyntheticRoute {
|
||||
return SyntheticRoute{
|
||||
RouteID: routeID,
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: hops[0],
|
||||
DestinationNodeID: hops[len(hops)-1],
|
||||
Hops: hops,
|
||||
AllowedChannels: []string{SyntheticChannelFabricControl},
|
||||
MaxTTL: 8,
|
||||
MaxHops: 8,
|
||||
ExpiresAt: time.Now().UTC().Add(time.Hour),
|
||||
RouteVersion: "route-v1",
|
||||
PolicyVersion: "policy-v1",
|
||||
PeerDirectoryVersion: "peers-v1",
|
||||
}
|
||||
}
|
||||
|
||||
// sameStrings reports whether left and right have the same length and equal
// elements at every index.
func sameStrings(left, right []string) bool {
	if len(left) != len(right) {
		return false
	}
	for i, value := range left {
		if right[i] != value {
			return false
		}
	}
	return true
}
|
||||
@@ -0,0 +1,374 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// DefaultWarmPeerLimit is the number of peers NewPeerCache marks as warm when
// PeerCacheConfig.WarmPeerLimit is zero or negative.
const DefaultWarmPeerLimit = 8
|
||||
|
||||
type PeerCacheConfig struct {
|
||||
Local PeerIdentity
|
||||
PeerEndpoints map[string]string
|
||||
PeerEndpointCandidates map[string][]PeerEndpointCandidate
|
||||
PeerDirectory []PeerDirectoryEntry
|
||||
RecoverySeeds []PeerRecoverySeed
|
||||
RendezvousLeases []PeerRendezvousLease
|
||||
Routes []SyntheticRoute
|
||||
WarmPeerLimit int
|
||||
PreferredRegion string
|
||||
Now time.Time
|
||||
}
|
||||
|
||||
type PeerCache struct {
|
||||
snapshot PeerCacheSnapshot
|
||||
}
|
||||
|
||||
type PeerCacheSnapshot struct {
|
||||
ClusterID string `json:"cluster_id"`
|
||||
LocalNodeID string `json:"local_node_id"`
|
||||
PeerCount int `json:"peer_count"`
|
||||
WarmPeerCount int `json:"warm_peer_count"`
|
||||
RecoverySeedCount int `json:"recovery_seed_count"`
|
||||
RendezvousLeaseCount int `json:"rendezvous_lease_count"`
|
||||
BuiltAt time.Time `json:"built_at"`
|
||||
Entries []PeerCacheEntry `json:"entries"`
|
||||
}
|
||||
|
||||
// PeerCacheEntry describes one remote peer as seen by the local node.
type PeerCacheEntry struct {
	// Identity and the routes that mention this peer.
	NodeID   string   `json:"node_id"`
	RouteIDs []string `json:"route_ids,omitempty"`

	// Selected endpoint plus how many endpoints/candidates are known.
	Endpoint          string   `json:"endpoint,omitempty"`
	EndpointCount     int      `json:"endpoint_count"`
	CandidateCount    int      `json:"candidate_count"`
	ConnectivityModes []string `json:"connectivity_modes,omitempty"`

	// Recovery-seed flag and warm-set selection result.
	RecoverySeed bool   `json:"recovery_seed"`
	Warm         bool   `json:"warm"`
	WarmReason   string `json:"warm_reason,omitempty"`

	// Attributes of the top-ranked endpoint candidate (or of the rendezvous
	// lease when the lease endpoint was preferred).
	BestCandidateID    string   `json:"best_candidate_id,omitempty"`
	BestCandidateAddr  string   `json:"best_candidate_addr,omitempty"`
	BestTransport      string   `json:"best_transport,omitempty"`
	BestReachability   string   `json:"best_reachability,omitempty"`
	BestConnectivity   string   `json:"best_connectivity,omitempty"`
	BestNATType        string   `json:"best_nat_type,omitempty"`
	BestPolicyTags     []string `json:"best_policy_tags,omitempty"`
	BestCandidateScore int      `json:"best_candidate_score,omitempty"`

	// Rendezvous lease through which this peer is reachable via a relay.
	RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
	RelayNodeID       string `json:"relay_node_id,omitempty"`
	RelayEndpoint     string `json:"relay_endpoint,omitempty"`
	RelayControl      bool   `json:"relay_control"`
}
|
||||
|
||||
type peerCacheBuildEntry struct {
|
||||
PeerCacheEntry
|
||||
adjacentRoutePeer bool
|
||||
bestScore int
|
||||
}
|
||||
|
||||
func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
|
||||
now := cfg.Now.UTC()
|
||||
if now.IsZero() {
|
||||
now = time.Now().UTC()
|
||||
}
|
||||
limit := cfg.WarmPeerLimit
|
||||
if limit <= 0 {
|
||||
limit = DefaultWarmPeerLimit
|
||||
}
|
||||
entries := map[string]*peerCacheBuildEntry{}
|
||||
for _, item := range cfg.PeerDirectory {
|
||||
nodeID := strings.TrimSpace(item.NodeID)
|
||||
if nodeID == "" || nodeID == cfg.Local.NodeID {
|
||||
continue
|
||||
}
|
||||
entry := peerCacheEntry(entries, nodeID)
|
||||
entry.RouteIDs = mergeStrings(entry.RouteIDs, item.RouteIDs)
|
||||
entry.EndpointCount = maxInt(entry.EndpointCount, item.EndpointCount)
|
||||
entry.CandidateCount = maxInt(entry.CandidateCount, item.CandidateCount)
|
||||
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, item.ConnectivityModes)
|
||||
entry.RecoverySeed = entry.RecoverySeed || item.RecoverySeed
|
||||
}
|
||||
for nodeID, endpoint := range cfg.PeerEndpoints {
|
||||
nodeID = strings.TrimSpace(nodeID)
|
||||
endpoint = strings.TrimSpace(endpoint)
|
||||
if nodeID == "" || nodeID == cfg.Local.NodeID || endpoint == "" {
|
||||
continue
|
||||
}
|
||||
entry := peerCacheEntry(entries, nodeID)
|
||||
entry.Endpoint = endpoint
|
||||
entry.EndpointCount = maxInt(entry.EndpointCount, 1)
|
||||
}
|
||||
for nodeID, candidates := range cfg.PeerEndpointCandidates {
|
||||
nodeID = strings.TrimSpace(nodeID)
|
||||
if nodeID == "" || nodeID == cfg.Local.NodeID || len(candidates) == 0 {
|
||||
continue
|
||||
}
|
||||
entry := peerCacheEntry(entries, nodeID)
|
||||
entry.CandidateCount = maxInt(entry.CandidateCount, len(candidates))
|
||||
for _, candidate := range candidates {
|
||||
if strings.TrimSpace(candidate.ConnectivityMode) != "" {
|
||||
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{candidate.ConnectivityMode})
|
||||
}
|
||||
}
|
||||
scored := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
|
||||
ChannelClass: SyntheticChannelFabricControl,
|
||||
PreferredRegion: cfg.PreferredRegion,
|
||||
Now: now,
|
||||
MaxVerificationAge: time.Hour,
|
||||
})
|
||||
if len(scored) > 0 {
|
||||
entry.BestCandidateID = scored[0].Candidate.EndpointID
|
||||
entry.BestCandidateAddr = scored[0].Candidate.Address
|
||||
entry.BestTransport = scored[0].Candidate.Transport
|
||||
entry.BestReachability = scored[0].Candidate.Reachability
|
||||
entry.BestConnectivity = scored[0].Candidate.ConnectivityMode
|
||||
entry.BestNATType = scored[0].Candidate.NATType
|
||||
entry.BestPolicyTags = append([]string{}, scored[0].Candidate.PolicyTags...)
|
||||
entry.BestCandidateScore = scored[0].Score
|
||||
entry.bestScore = scored[0].Score
|
||||
if strings.TrimSpace(scored[0].Candidate.Address) != "" {
|
||||
entry.Endpoint = strings.TrimSpace(scored[0].Candidate.Address)
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, route := range cfg.Routes {
|
||||
path := routePath(route)
|
||||
localIndex := indexOf(path, cfg.Local.NodeID)
|
||||
if localIndex < 0 {
|
||||
continue
|
||||
}
|
||||
for _, nodeID := range path {
|
||||
if nodeID == "" || nodeID == cfg.Local.NodeID {
|
||||
continue
|
||||
}
|
||||
entry := peerCacheEntry(entries, nodeID)
|
||||
entry.RouteIDs = mergeStrings(entry.RouteIDs, []string{route.RouteID})
|
||||
}
|
||||
for _, adjacentIndex := range []int{localIndex - 1, localIndex + 1} {
|
||||
if adjacentIndex < 0 || adjacentIndex >= len(path) {
|
||||
continue
|
||||
}
|
||||
nodeID := path[adjacentIndex]
|
||||
if nodeID == "" || nodeID == cfg.Local.NodeID {
|
||||
continue
|
||||
}
|
||||
peerCacheEntry(entries, nodeID).adjacentRoutePeer = true
|
||||
}
|
||||
}
|
||||
for _, seed := range cfg.RecoverySeeds {
|
||||
nodeID := strings.TrimSpace(seed.NodeID)
|
||||
if nodeID == "" || nodeID == cfg.Local.NodeID {
|
||||
continue
|
||||
}
|
||||
entry := peerCacheEntry(entries, nodeID)
|
||||
entry.RecoverySeed = true
|
||||
if entry.Endpoint == "" {
|
||||
entry.Endpoint = strings.TrimSpace(seed.Endpoint)
|
||||
}
|
||||
if strings.TrimSpace(seed.ConnectivityMode) != "" {
|
||||
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{seed.ConnectivityMode})
|
||||
}
|
||||
}
|
||||
rendezvousLeases := 0
|
||||
for _, lease := range cfg.RendezvousLeases {
|
||||
if !leaseUsableForPeerCache(lease, cfg.Local.NodeID, now) {
|
||||
continue
|
||||
}
|
||||
rendezvousLeases++
|
||||
if lease.PeerNodeID != cfg.Local.NodeID {
|
||||
entry := peerCacheEntry(entries, lease.PeerNodeID)
|
||||
useLeaseEndpoint := shouldUseRendezvousEndpoint(*entry)
|
||||
entry.RendezvousLeaseID = lease.LeaseID
|
||||
entry.RelayNodeID = lease.RelayNodeID
|
||||
entry.RelayEndpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
|
||||
entry.RelayControl = true
|
||||
entry.CandidateCount = maxInt(entry.CandidateCount, 1)
|
||||
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{firstNonEmpty(lease.ConnectivityMode, "relay_required"), "relay_control"})
|
||||
if useLeaseEndpoint {
|
||||
entry.BestTransport = firstNonEmpty(lease.Transport, "relay_control")
|
||||
entry.BestReachability = "relay"
|
||||
entry.BestConnectivity = firstNonEmpty(lease.ConnectivityMode, "relay_required")
|
||||
entry.Endpoint = entry.RelayEndpoint
|
||||
entry.BestCandidateID = lease.LeaseID
|
||||
entry.BestCandidateAddr = entry.RelayEndpoint
|
||||
entry.bestScore = maxInt(entry.bestScore, 500)
|
||||
}
|
||||
}
|
||||
if lease.PeerNodeID == cfg.Local.NodeID && lease.RelayNodeID != "" && lease.RelayNodeID != cfg.Local.NodeID {
|
||||
entry := peerCacheEntry(entries, lease.RelayNodeID)
|
||||
if entry.Endpoint == "" {
|
||||
entry.Endpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
|
||||
}
|
||||
entry.EndpointCount = maxInt(entry.EndpointCount, 1)
|
||||
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{"relay_control"})
|
||||
}
|
||||
}
|
||||
out := make([]peerCacheBuildEntry, 0, len(entries))
|
||||
recoverySeeds := 0
|
||||
for _, entry := range entries {
|
||||
sort.Strings(entry.RouteIDs)
|
||||
sort.Strings(entry.ConnectivityModes)
|
||||
if entry.RecoverySeed {
|
||||
recoverySeeds++
|
||||
}
|
||||
out = append(out, *entry)
|
||||
}
|
||||
sort.SliceStable(out, func(i, j int) bool {
|
||||
left := warmPeerPriority(out[i])
|
||||
right := warmPeerPriority(out[j])
|
||||
if left != right {
|
||||
return left > right
|
||||
}
|
||||
return out[i].NodeID < out[j].NodeID
|
||||
})
|
||||
warm := 0
|
||||
for i := range out {
|
||||
if warm >= limit {
|
||||
break
|
||||
}
|
||||
if warmPeerPriority(out[i]) <= 0 {
|
||||
continue
|
||||
}
|
||||
out[i].Warm = true
|
||||
out[i].WarmReason = warmPeerReason(out[i])
|
||||
warm++
|
||||
}
|
||||
sort.SliceStable(out, func(i, j int) bool {
|
||||
return out[i].NodeID < out[j].NodeID
|
||||
})
|
||||
snapshotEntries := make([]PeerCacheEntry, 0, len(out))
|
||||
for _, entry := range out {
|
||||
snapshotEntries = append(snapshotEntries, entry.PeerCacheEntry)
|
||||
}
|
||||
return &PeerCache{snapshot: PeerCacheSnapshot{
|
||||
ClusterID: cfg.Local.ClusterID,
|
||||
LocalNodeID: cfg.Local.NodeID,
|
||||
PeerCount: len(snapshotEntries),
|
||||
WarmPeerCount: warm,
|
||||
RecoverySeedCount: recoverySeeds,
|
||||
RendezvousLeaseCount: rendezvousLeases,
|
||||
BuiltAt: now,
|
||||
Entries: snapshotEntries,
|
||||
}}
|
||||
}
|
||||
|
||||
func (c *PeerCache) Snapshot() PeerCacheSnapshot {
|
||||
if c == nil {
|
||||
return PeerCacheSnapshot{}
|
||||
}
|
||||
snapshot := c.snapshot
|
||||
snapshot.Entries = append([]PeerCacheEntry{}, c.snapshot.Entries...)
|
||||
return snapshot
|
||||
}
|
||||
|
||||
func (c *PeerCache) WarmPeerIDs() []string {
|
||||
snapshot := c.Snapshot()
|
||||
out := make([]string, 0, snapshot.WarmPeerCount)
|
||||
for _, entry := range snapshot.Entries {
|
||||
if entry.Warm {
|
||||
out = append(out, entry.NodeID)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func peerCacheEntry(entries map[string]*peerCacheBuildEntry, nodeID string) *peerCacheBuildEntry {
|
||||
if entry, ok := entries[nodeID]; ok {
|
||||
return entry
|
||||
}
|
||||
entry := &peerCacheBuildEntry{PeerCacheEntry: PeerCacheEntry{NodeID: nodeID}}
|
||||
entries[nodeID] = entry
|
||||
return entry
|
||||
}
|
||||
|
||||
func warmPeerPriority(entry peerCacheBuildEntry) int {
|
||||
score := 0
|
||||
if entry.adjacentRoutePeer {
|
||||
score += 1000
|
||||
}
|
||||
if entry.RecoverySeed {
|
||||
score += 500
|
||||
}
|
||||
if entry.Endpoint != "" {
|
||||
score += 100
|
||||
}
|
||||
if entry.bestScore > 0 {
|
||||
score += entry.bestScore
|
||||
}
|
||||
if entry.RelayControl {
|
||||
score += 300
|
||||
}
|
||||
score += entry.CandidateCount
|
||||
return score
|
||||
}
|
||||
|
||||
func warmPeerReason(entry peerCacheBuildEntry) string {
|
||||
if entry.adjacentRoutePeer {
|
||||
return "route_adjacent"
|
||||
}
|
||||
if entry.RecoverySeed {
|
||||
return "recovery_seed"
|
||||
}
|
||||
if entry.RelayControl {
|
||||
return "rendezvous_lease"
|
||||
}
|
||||
if entry.BestCandidateID != "" {
|
||||
return "endpoint_candidate"
|
||||
}
|
||||
if entry.Endpoint != "" {
|
||||
return "peer_endpoint"
|
||||
}
|
||||
return "scoped_peer"
|
||||
}
|
||||
|
||||
func leaseUsableForPeerCache(lease PeerRendezvousLease, localNodeID string, now time.Time) bool {
|
||||
if strings.TrimSpace(lease.LeaseID) == "" ||
|
||||
strings.TrimSpace(lease.PeerNodeID) == "" ||
|
||||
strings.TrimSpace(lease.RelayNodeID) == "" ||
|
||||
strings.TrimSpace(lease.RelayEndpoint) == "" ||
|
||||
lease.ExpiresAt.IsZero() ||
|
||||
!lease.ExpiresAt.After(now) ||
|
||||
!lease.ControlPlaneOnly {
|
||||
return false
|
||||
}
|
||||
return lease.PeerNodeID != localNodeID || lease.RelayNodeID != localNodeID
|
||||
}
|
||||
|
||||
func shouldUseRendezvousEndpoint(entry peerCacheBuildEntry) bool {
|
||||
if strings.TrimSpace(entry.Endpoint) == "" {
|
||||
return true
|
||||
}
|
||||
transport := strings.ToLower(strings.TrimSpace(entry.BestTransport))
|
||||
reachability := strings.ToLower(strings.TrimSpace(entry.BestReachability))
|
||||
connectivity := strings.ToLower(strings.TrimSpace(entry.BestConnectivity))
|
||||
return strings.Contains(transport, "relay") ||
|
||||
strings.Contains(transport, "outbound") ||
|
||||
reachability == "relay" ||
|
||||
reachability == "outbound_only" ||
|
||||
connectivity == "relay_required" ||
|
||||
connectivity == "outbound_only"
|
||||
}
|
||||
|
||||
// mergeStrings returns the whitespace-trimmed, de-duplicated union of
// existing and incoming, keeping first-occurrence order (existing first).
// Blank values are dropped. The result is always a fresh, non-nil slice and
// neither input, nor any slice sharing their backing arrays, is modified.
func mergeStrings(existing []string, incoming []string) []string {
	total := len(existing) + len(incoming)
	seen := make(map[string]struct{}, total)
	out := make([]string, 0, total)
	// Walk the inputs one after the other instead of append(existing,
	// incoming...): that append writes into existing's spare capacity and
	// can overwrite elements of another slice over the same array.
	for _, source := range [][]string{existing, incoming} {
		for _, value := range source {
			value = strings.TrimSpace(value)
			if value == "" {
				continue
			}
			if _, ok := seen[value]; ok {
				continue
			}
			seen[value] = struct{}{}
			out = append(out, value)
		}
	}
	return out
}
|
||||
|
||||
// maxInt returns the larger of left and right.
func maxInt(left, right int) int {
	if right >= left {
		return right
	}
	return left
}
|
||||
@@ -0,0 +1,170 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestPeerCacheSelectsAdjacentWarmPeersWithinLimit(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
cache := NewPeerCache(PeerCacheConfig{
|
||||
Local: local,
|
||||
PeerEndpoints: map[string]string{
|
||||
"node-a": "http://node-a:19000",
|
||||
"node-r": "http://node-r:19000",
|
||||
"node-c": "http://node-c:19000",
|
||||
},
|
||||
Routes: []SyntheticRoute{
|
||||
peerCacheRoute("route-1", []string{"node-a", local.NodeID, "node-r", "node-c"}),
|
||||
},
|
||||
RecoverySeeds: []PeerRecoverySeed{
|
||||
{NodeID: "node-seed", Endpoint: "https://seed.example.test", Transport: "direct_tcp_tls", Priority: 10},
|
||||
},
|
||||
WarmPeerLimit: 2,
|
||||
Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
|
||||
})
|
||||
|
||||
warm := cache.WarmPeerIDs()
|
||||
if len(warm) != 2 || warm[0] != "node-a" || warm[1] != "node-r" {
|
||||
t.Fatalf("warm peers = %+v, want adjacent node-a/node-r", warm)
|
||||
}
|
||||
snapshot := cache.Snapshot()
|
||||
if snapshot.PeerCount != 4 || snapshot.RecoverySeedCount != 1 {
|
||||
t.Fatalf("unexpected snapshot counts: %+v", snapshot)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerCachePromotesRecoverySeedAfterRoutePeers(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
cache := NewPeerCache(PeerCacheConfig{
|
||||
Local: local,
|
||||
Routes: []SyntheticRoute{
|
||||
peerCacheRoute("route-1", []string{"node-a", local.NodeID, "node-r"}),
|
||||
},
|
||||
RecoverySeeds: []PeerRecoverySeed{
|
||||
{NodeID: "node-seed", Endpoint: "wss://seed.example.test/mesh", Transport: "wss", ConnectivityMode: "direct", Priority: 1},
|
||||
},
|
||||
WarmPeerLimit: 3,
|
||||
Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
|
||||
})
|
||||
|
||||
warm := cache.WarmPeerIDs()
|
||||
if len(warm) != 3 || warm[0] != "node-a" || warm[1] != "node-r" || warm[2] != "node-seed" {
|
||||
t.Fatalf("warm peers = %+v, want adjacent peers then seed", warm)
|
||||
}
|
||||
seed, ok := peerCacheEntryByID(cache.Snapshot(), "node-seed")
|
||||
if !ok || !seed.RecoverySeed || seed.WarmReason != "recovery_seed" {
|
||||
t.Fatalf("unexpected seed entry: %+v", seed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerCacheUsesBestEndpointCandidate(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
cache := NewPeerCache(PeerCacheConfig{
|
||||
Local: local,
|
||||
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
|
||||
"node-b": {
|
||||
{
|
||||
EndpointID: "node-b-relay",
|
||||
NodeID: "node-b",
|
||||
Transport: "relay",
|
||||
Address: "relay.example.test",
|
||||
Reachability: "relay",
|
||||
ConnectivityMode: "relay_required",
|
||||
Priority: 20,
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-public",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "203.0.113.20:443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 1,
|
||||
LastVerifiedAt: &now,
|
||||
},
|
||||
},
|
||||
},
|
||||
WarmPeerLimit: 1,
|
||||
Now: now,
|
||||
})
|
||||
|
||||
entry, ok := peerCacheEntryByID(cache.Snapshot(), "node-b")
|
||||
if !ok {
|
||||
t.Fatal("node-b missing from cache")
|
||||
}
|
||||
if entry.BestCandidateID != "node-b-public" || !entry.Warm {
|
||||
t.Fatalf("unexpected candidate selection: %+v", entry)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
|
||||
cache := NewPeerCache(PeerCacheConfig{
|
||||
Local: local,
|
||||
PeerEndpoints: map[string]string{
|
||||
"node-b": "https://node-b.public.example.test:443",
|
||||
},
|
||||
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
|
||||
"node-b": {
|
||||
{
|
||||
EndpointID: "node-b-public",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "https://node-b.public.example.test:443",
|
||||
Reachability: "public",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
Region: "corp-eu",
|
||||
Priority: 10,
|
||||
},
|
||||
{
|
||||
EndpointID: "node-b-corp-lan",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "http://10.24.10.20:19001",
|
||||
Reachability: "private",
|
||||
NATType: "none",
|
||||
ConnectivityMode: "direct",
|
||||
Region: "corp-eu",
|
||||
Priority: 1,
|
||||
PolicyTags: []string{"corp-lan"},
|
||||
},
|
||||
},
|
||||
},
|
||||
PreferredRegion: "corp-eu",
|
||||
WarmPeerLimit: 1,
|
||||
Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
|
||||
})
|
||||
|
||||
entry, ok := peerCacheEntryByID(cache.Snapshot(), "node-b")
|
||||
if !ok {
|
||||
t.Fatal("node-b missing from peer cache")
|
||||
}
|
||||
if entry.BestCandidateID != "node-b-corp-lan" || entry.Endpoint != "http://10.24.10.20:19001" {
|
||||
t.Fatalf("peer cache did not choose corp LAN endpoint: %+v", entry)
|
||||
}
|
||||
}
|
||||
|
||||
func peerCacheRoute(routeID string, hops []string) SyntheticRoute {
|
||||
return SyntheticRoute{
|
||||
RouteID: routeID,
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: hops[0],
|
||||
DestinationNodeID: hops[len(hops)-1],
|
||||
Hops: append([]string{}, hops...),
|
||||
AllowedChannels: []string{SyntheticChannelFabricControl},
|
||||
ExpiresAt: time.Now().UTC().Add(time.Hour),
|
||||
}
|
||||
}
|
||||
|
||||
func peerCacheEntryByID(snapshot PeerCacheSnapshot, nodeID string) (PeerCacheEntry, bool) {
|
||||
for _, entry := range snapshot.Entries {
|
||||
if entry.NodeID == nodeID {
|
||||
return entry, true
|
||||
}
|
||||
}
|
||||
return PeerCacheEntry{}, false
|
||||
}
|
||||
@@ -0,0 +1,303 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"net"
|
||||
"net/netip"
|
||||
"net/url"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Values for PeerConnectionIntent.Action.
const (
	PeerConnectionIntentMaintain = "maintain"
	PeerConnectionIntentProbe    = "probe"
	PeerConnectionIntentRecover  = "recover"
)
|
||||
|
||||
// Values for PeerConnectionIntent.TransportMode.
const (
	PeerTransportModeDirect        = "direct"
	PeerTransportModePrivateLAN    = "private_lan"
	PeerTransportModeCorporateLAN  = "corporate_lan"
	PeerTransportModeOutboundOnly  = "outbound_only"
	PeerTransportModeRelayRequired = "relay_required"
	PeerTransportModeRelayControl  = "relay_control"
	PeerTransportModeUnknown       = "unknown"
)
|
||||
|
||||
type PeerConnectionIntentPlanConfig struct {
|
||||
PeerCache PeerCacheSnapshot
|
||||
RecoveryPlan PeerRecoveryPlan
|
||||
RendezvousLeases []PeerRendezvousLease
|
||||
Now time.Time
|
||||
}
|
||||
|
||||
type PeerConnectionIntentPlan struct {
|
||||
Mode string `json:"mode"`
|
||||
IntentCount int `json:"intent_count"`
|
||||
MaintainCount int `json:"maintain_count"`
|
||||
ProbeCount int `json:"probe_count"`
|
||||
RecoverCount int `json:"recover_count"`
|
||||
DirectCount int `json:"direct_count"`
|
||||
PrivateLANCount int `json:"private_lan_count"`
|
||||
CorporateLANCount int `json:"corporate_lan_count"`
|
||||
OutboundOnlyCount int `json:"outbound_only_count"`
|
||||
RelayRequiredCount int `json:"relay_required_count"`
|
||||
RelayControlCount int `json:"relay_control_count"`
|
||||
RendezvousRequiredCount int `json:"rendezvous_required_count"`
|
||||
RendezvousResolvedCount int `json:"rendezvous_resolved_count"`
|
||||
RendezvousLeaseCount int `json:"rendezvous_lease_count"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Intents []PeerConnectionIntent `json:"intents,omitempty"`
|
||||
}
|
||||
|
||||
// PeerConnectionIntent describes what the local node intends to do about one
// peer connection. PlanPeerConnectionIntents fills it from a recovery plan
// candidate (node, reason, endpoint, connection state, priority) combined
// with the peer cache entry for the same node (reachability, connectivity,
// NAT type, policy tags, relay and lease details). TransportMode,
// RequiresRendezvous and DirectCandidate come from classifying the intent.
type PeerConnectionIntent struct {
	NodeID             string    `json:"node_id"`
	Action             string    `json:"action"`
	Reason             string    `json:"reason"`
	Endpoint           string    `json:"endpoint,omitempty"`
	ConnectionState    string    `json:"connection_state"`
	Transport          string    `json:"transport,omitempty"`
	TransportMode      string    `json:"transport_mode"`
	Reachability       string    `json:"reachability,omitempty"`
	ConnectivityMode   string    `json:"connectivity_mode,omitempty"`
	NATType            string    `json:"nat_type,omitempty"`
	PolicyTags         []string  `json:"policy_tags,omitempty"`
	RequiresRendezvous bool      `json:"requires_rendezvous"`
	RendezvousResolved bool      `json:"rendezvous_resolved"`
	DirectCandidate    bool      `json:"direct_candidate"`
	RelayCandidate     bool      `json:"relay_candidate"`
	BestCandidateID    string    `json:"best_candidate_id,omitempty"`
	RendezvousLeaseID  string    `json:"rendezvous_lease_id,omitempty"`
	RelayNodeID        string    `json:"relay_node_id,omitempty"`
	RelayEndpoint      string    `json:"relay_endpoint,omitempty"`
	ControlPlaneOnly   bool      `json:"control_plane_only"`
	RecoverySeed       bool      `json:"recovery_seed"`
	Priority           int       `json:"priority"`
	GeneratedAt        time.Time `json:"generated_at"`
}
|
||||
|
||||
func PlanPeerConnectionIntents(cfg PeerConnectionIntentPlanConfig) PeerConnectionIntentPlan {
|
||||
now := normalizedNow(cfg.Now)
|
||||
entryByNode := map[string]PeerCacheEntry{}
|
||||
for _, entry := range cfg.PeerCache.Entries {
|
||||
if strings.TrimSpace(entry.NodeID) == "" {
|
||||
continue
|
||||
}
|
||||
entryByNode[entry.NodeID] = entry
|
||||
}
|
||||
|
||||
intents := make([]PeerConnectionIntent, 0, len(cfg.RecoveryPlan.Candidates))
|
||||
for _, candidate := range cfg.RecoveryPlan.Candidates {
|
||||
if strings.TrimSpace(candidate.NodeID) == "" {
|
||||
continue
|
||||
}
|
||||
entry := entryByNode[candidate.NodeID]
|
||||
intent := PeerConnectionIntent{
|
||||
NodeID: candidate.NodeID,
|
||||
Action: connectionIntentAction(candidate),
|
||||
Reason: candidate.Reason,
|
||||
Endpoint: candidate.Endpoint,
|
||||
ConnectionState: candidate.ConnectionState,
|
||||
Transport: firstNonEmpty(candidate.BestTransport, entry.BestTransport),
|
||||
Reachability: entry.BestReachability,
|
||||
ConnectivityMode: entry.BestConnectivity,
|
||||
NATType: entry.BestNATType,
|
||||
PolicyTags: append([]string{}, entry.BestPolicyTags...),
|
||||
BestCandidateID: firstNonEmpty(candidate.BestCandidateID, entry.BestCandidateID),
|
||||
RendezvousLeaseID: entry.RendezvousLeaseID,
|
||||
RelayNodeID: entry.RelayNodeID,
|
||||
RelayEndpoint: entry.RelayEndpoint,
|
||||
RelayCandidate: entry.RelayControl,
|
||||
ControlPlaneOnly: entry.RelayControl,
|
||||
RecoverySeed: candidate.RecoverySeed || entry.RecoverySeed,
|
||||
Priority: candidate.Priority,
|
||||
GeneratedAt: now,
|
||||
}
|
||||
mode, requiresRendezvous, directCandidate := classifyPeerTransport(intent)
|
||||
intent.TransportMode = mode
|
||||
intent.RequiresRendezvous = requiresRendezvous
|
||||
intent.DirectCandidate = directCandidate
|
||||
if intent.RequiresRendezvous {
|
||||
if lease, ok := rendezvousLeaseForPeer(cfg.RendezvousLeases, intent.NodeID, now); ok {
|
||||
applyRendezvousLease(&intent, lease)
|
||||
}
|
||||
}
|
||||
intents = append(intents, intent)
|
||||
}
|
||||
sort.SliceStable(intents, func(i, j int) bool {
|
||||
if intents[i].Priority != intents[j].Priority {
|
||||
return intents[i].Priority > intents[j].Priority
|
||||
}
|
||||
return intents[i].NodeID < intents[j].NodeID
|
||||
})
|
||||
|
||||
plan := PeerConnectionIntentPlan{
|
||||
Mode: cfg.RecoveryPlan.Mode,
|
||||
IntentCount: len(intents),
|
||||
GeneratedAt: now,
|
||||
Intents: intents,
|
||||
}
|
||||
for _, intent := range intents {
|
||||
switch intent.Action {
|
||||
case PeerConnectionIntentMaintain:
|
||||
plan.MaintainCount++
|
||||
case PeerConnectionIntentProbe:
|
||||
plan.ProbeCount++
|
||||
case PeerConnectionIntentRecover:
|
||||
plan.RecoverCount++
|
||||
}
|
||||
switch intent.TransportMode {
|
||||
case PeerTransportModeDirect:
|
||||
plan.DirectCount++
|
||||
case PeerTransportModePrivateLAN:
|
||||
plan.PrivateLANCount++
|
||||
case PeerTransportModeCorporateLAN:
|
||||
plan.CorporateLANCount++
|
||||
case PeerTransportModeOutboundOnly:
|
||||
plan.OutboundOnlyCount++
|
||||
case PeerTransportModeRelayRequired:
|
||||
plan.RelayRequiredCount++
|
||||
case PeerTransportModeRelayControl:
|
||||
plan.RelayControlCount++
|
||||
}
|
||||
if intent.RequiresRendezvous {
|
||||
plan.RendezvousRequiredCount++
|
||||
}
|
||||
if intent.RendezvousResolved {
|
||||
plan.RendezvousResolvedCount++
|
||||
}
|
||||
if intent.RendezvousLeaseID != "" {
|
||||
plan.RendezvousLeaseCount++
|
||||
}
|
||||
}
|
||||
return plan
|
||||
}
|
||||
|
||||
func connectionIntentAction(candidate PeerRecoveryCandidate) string {
|
||||
switch candidate.Reason {
|
||||
case "maintain_ready":
|
||||
return PeerConnectionIntentMaintain
|
||||
case "recover_degraded", "recover_seed", "recover_warm", "recover_peer":
|
||||
return PeerConnectionIntentRecover
|
||||
default:
|
||||
return PeerConnectionIntentProbe
|
||||
}
|
||||
}
|
||||
|
||||
func classifyPeerTransport(intent PeerConnectionIntent) (string, bool, bool) {
|
||||
transport := strings.ToLower(strings.TrimSpace(intent.Transport))
|
||||
connectivity := strings.ToLower(strings.TrimSpace(intent.ConnectivityMode))
|
||||
reachability := strings.ToLower(strings.TrimSpace(intent.Reachability))
|
||||
tags := lowerStringSet(intent.PolicyTags)
|
||||
|
||||
if strings.Contains(transport, "relay") || connectivity == "relay_required" || reachability == "relay" {
|
||||
return PeerTransportModeRelayRequired, true, false
|
||||
}
|
||||
if connectivity == "outbound_only" || reachability == "outbound_only" {
|
||||
return PeerTransportModeOutboundOnly, true, false
|
||||
}
|
||||
if tags["corp-lan"] || tags["same-site"] {
|
||||
return PeerTransportModeCorporateLAN, false, true
|
||||
}
|
||||
if tags["private-lan"] || reachability == "private" || endpointHasPrivateHost(intent.Endpoint) {
|
||||
return PeerTransportModePrivateLAN, false, true
|
||||
}
|
||||
if strings.Contains(transport, "direct") || reachability == "public" || connectivity == "direct" {
|
||||
return PeerTransportModeDirect, false, true
|
||||
}
|
||||
return PeerTransportModeUnknown, false, false
|
||||
}
|
||||
|
||||
func rendezvousLeaseForPeer(leases []PeerRendezvousLease, peerNodeID string, now time.Time) (PeerRendezvousLease, bool) {
|
||||
now = normalizedNow(now)
|
||||
candidates := make([]PeerRendezvousLease, 0, len(leases))
|
||||
for _, lease := range leases {
|
||||
if strings.TrimSpace(lease.PeerNodeID) != peerNodeID ||
|
||||
strings.TrimSpace(lease.RelayEndpoint) == "" ||
|
||||
strings.TrimSpace(lease.RelayNodeID) == "" ||
|
||||
!lease.ControlPlaneOnly ||
|
||||
lease.ExpiresAt.IsZero() ||
|
||||
!lease.ExpiresAt.After(now) {
|
||||
continue
|
||||
}
|
||||
candidates = append(candidates, lease)
|
||||
}
|
||||
if len(candidates) == 0 {
|
||||
return PeerRendezvousLease{}, false
|
||||
}
|
||||
sort.SliceStable(candidates, func(i, j int) bool {
|
||||
leftPriority := candidates[i].Priority
|
||||
rightPriority := candidates[j].Priority
|
||||
if leftPriority <= 0 {
|
||||
leftPriority = 100
|
||||
}
|
||||
if rightPriority <= 0 {
|
||||
rightPriority = 100
|
||||
}
|
||||
if leftPriority != rightPriority {
|
||||
return leftPriority < rightPriority
|
||||
}
|
||||
if !candidates[i].ExpiresAt.Equal(candidates[j].ExpiresAt) {
|
||||
return candidates[i].ExpiresAt.After(candidates[j].ExpiresAt)
|
||||
}
|
||||
return candidates[i].LeaseID < candidates[j].LeaseID
|
||||
})
|
||||
return candidates[0], true
|
||||
}
|
||||
|
||||
func applyRendezvousLease(intent *PeerConnectionIntent, lease PeerRendezvousLease) {
|
||||
intent.Endpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
|
||||
intent.Transport = firstNonEmpty(lease.Transport, "relay_control")
|
||||
intent.TransportMode = PeerTransportModeRelayControl
|
||||
intent.RequiresRendezvous = false
|
||||
intent.RendezvousResolved = true
|
||||
intent.DirectCandidate = false
|
||||
intent.RelayCandidate = true
|
||||
intent.RendezvousLeaseID = lease.LeaseID
|
||||
intent.RelayNodeID = lease.RelayNodeID
|
||||
intent.RelayEndpoint = intent.Endpoint
|
||||
intent.ControlPlaneOnly = true
|
||||
if lease.ConnectivityMode != "" {
|
||||
intent.ConnectivityMode = lease.ConnectivityMode
|
||||
}
|
||||
}
|
||||
|
||||
// endpointHasPrivateHost reports whether rawEndpoint names an IP literal that
// is private, loopback or link-local unicast.
//
// The endpoint may be a URL, a host:port pair or a bare address. Host names
// (anything that does not parse as an IP address) and blank input yield false.
func endpointHasPrivateHost(rawEndpoint string) bool {
	endpoint := strings.TrimSpace(rawEndpoint)
	if endpoint == "" {
		return false
	}

	// Prefer the URL authority when the endpoint parses as a URL with a host;
	// otherwise treat the whole string as the host part.
	hostPart := endpoint
	if u, err := url.Parse(endpoint); err == nil && u.Host != "" {
		hostPart = u.Host
	}
	// Drop a port if one is present.
	if h, _, err := net.SplitHostPort(hostPart); err == nil {
		hostPart = h
	}

	// Brackets survive when an IPv6 literal had no port to split off.
	ip, err := netip.ParseAddr(strings.Trim(hostPart, "[]"))
	if err != nil {
		return false
	}
	switch {
	case ip.IsPrivate(), ip.IsLoopback(), ip.IsLinkLocalUnicast():
		return true
	default:
		return false
	}
}
|
||||
|
||||
// lowerStringSet returns the set of distinct values after trimming whitespace
// and lower-casing. Blank values are dropped; the result is never nil.
func lowerStringSet(values []string) map[string]bool {
	set := make(map[string]bool, len(values))
	for _, raw := range values {
		key := strings.ToLower(strings.TrimSpace(raw))
		if key == "" {
			continue
		}
		set[key] = true
	}
	return set
}
|
||||
|
||||
// firstNonEmpty returns the first value that is non-blank after trimming
// whitespace, in its trimmed form, or "" when every value is blank.
func firstNonEmpty(values ...string) string {
	for _, candidate := range values {
		if trimmed := strings.TrimSpace(candidate); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
|
||||
@@ -0,0 +1,234 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestPeerConnectionIntentsClassifyCorporateDirect(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
|
||||
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
|
||||
{
|
||||
NodeID: "node-b",
|
||||
Endpoint: "http://10.24.10.20:19001",
|
||||
BestTransport: "direct_tcp_tls",
|
||||
BestReachability: "private",
|
||||
BestConnectivity: "direct",
|
||||
BestPolicyTags: []string{"corp-lan", "same-site"},
|
||||
},
|
||||
}},
|
||||
RecoveryPlan: PeerRecoveryPlan{
|
||||
Mode: PeerRecoveryModeSteady,
|
||||
Candidates: []PeerRecoveryCandidate{
|
||||
{
|
||||
NodeID: "node-b",
|
||||
Endpoint: "http://10.24.10.20:19001",
|
||||
ConnectionState: PeerConnectionReady,
|
||||
Reason: "maintain_ready",
|
||||
Priority: 100,
|
||||
},
|
||||
},
|
||||
},
|
||||
Now: now,
|
||||
})
|
||||
|
||||
if plan.IntentCount != 1 || plan.MaintainCount != 1 || plan.CorporateLANCount != 1 {
|
||||
t.Fatalf("unexpected plan counts: %+v", plan)
|
||||
}
|
||||
intent := plan.Intents[0]
|
||||
if intent.Action != PeerConnectionIntentMaintain || intent.TransportMode != PeerTransportModeCorporateLAN || intent.RequiresRendezvous {
|
||||
t.Fatalf("unexpected corporate intent: %+v", intent)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerConnectionIntentsClassifyOutboundAndRelayAsRendezvousRequired(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
|
||||
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
|
||||
{
|
||||
NodeID: "node-b",
|
||||
Endpoint: "https://node-b.example.test:443",
|
||||
BestTransport: "direct_tcp_tls",
|
||||
BestReachability: "outbound_only",
|
||||
BestConnectivity: "outbound_only",
|
||||
},
|
||||
{
|
||||
NodeID: "node-c",
|
||||
Endpoint: "relay://fabric-relay/node-c",
|
||||
BestTransport: "relay",
|
||||
BestReachability: "relay",
|
||||
BestConnectivity: "relay_required",
|
||||
},
|
||||
}},
|
||||
RecoveryPlan: PeerRecoveryPlan{
|
||||
Mode: PeerRecoveryModeRecovery,
|
||||
Candidates: []PeerRecoveryCandidate{
|
||||
{
|
||||
NodeID: "node-b",
|
||||
Endpoint: "https://node-b.example.test:443",
|
||||
ConnectionState: PeerConnectionDisconnected,
|
||||
Reason: "recover_warm",
|
||||
Priority: 90,
|
||||
},
|
||||
{
|
||||
NodeID: "node-c",
|
||||
Endpoint: "relay://fabric-relay/node-c",
|
||||
ConnectionState: PeerConnectionDisconnected,
|
||||
Reason: "recover_seed",
|
||||
Priority: 80,
|
||||
},
|
||||
},
|
||||
},
|
||||
Now: now,
|
||||
})
|
||||
|
||||
if plan.RecoverCount != 2 || plan.OutboundOnlyCount != 1 || plan.RelayRequiredCount != 1 || plan.RendezvousRequiredCount != 2 {
|
||||
t.Fatalf("unexpected rendezvous counts: %+v", plan)
|
||||
}
|
||||
if plan.Intents[0].Action != PeerConnectionIntentRecover || plan.Intents[1].Action != PeerConnectionIntentRecover {
|
||||
t.Fatalf("unexpected actions: %+v", plan.Intents)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerConnectionIntentsResolveRendezvousWithRelayLease(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
|
||||
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
|
||||
{
|
||||
NodeID: "node-b",
|
||||
Endpoint: "relay://fabric/node-b",
|
||||
BestTransport: "relay",
|
||||
BestReachability: "relay",
|
||||
BestConnectivity: "relay_required",
|
||||
},
|
||||
}},
|
||||
RecoveryPlan: PeerRecoveryPlan{
|
||||
Mode: PeerRecoveryModeRecovery,
|
||||
Candidates: []PeerRecoveryCandidate{
|
||||
{
|
||||
NodeID: "node-b",
|
||||
Endpoint: "relay://fabric/node-b",
|
||||
ConnectionState: PeerConnectionDisconnected,
|
||||
Reason: "recover_warm",
|
||||
Priority: 100,
|
||||
},
|
||||
},
|
||||
},
|
||||
RendezvousLeases: []PeerRendezvousLease{
|
||||
{
|
||||
LeaseID: "lease-node-b-via-node-r",
|
||||
PeerNodeID: "node-b",
|
||||
RelayNodeID: "node-r",
|
||||
RelayEndpoint: "http://node-r:19000",
|
||||
Transport: "relay_control",
|
||||
ConnectivityMode: "relay_required",
|
||||
Priority: 10,
|
||||
ControlPlaneOnly: true,
|
||||
IssuedAt: now.Add(-time.Minute),
|
||||
ExpiresAt: now.Add(time.Minute),
|
||||
},
|
||||
},
|
||||
Now: now,
|
||||
})
|
||||
|
||||
if plan.IntentCount != 1 || plan.RelayControlCount != 1 || plan.RendezvousResolvedCount != 1 || plan.RendezvousRequiredCount != 0 {
|
||||
t.Fatalf("unexpected relay-control plan counts: %+v", plan)
|
||||
}
|
||||
intent := plan.Intents[0]
|
||||
if intent.TransportMode != PeerTransportModeRelayControl ||
|
||||
intent.Endpoint != "http://node-r:19000" ||
|
||||
intent.RelayNodeID != "node-r" ||
|
||||
intent.RendezvousLeaseID != "lease-node-b-via-node-r" ||
|
||||
!intent.RelayCandidate ||
|
||||
!intent.RendezvousResolved ||
|
||||
intent.RequiresRendezvous {
|
||||
t.Fatalf("unexpected resolved rendezvous intent: %+v", intent)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
|
||||
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
|
||||
{
|
||||
NodeID: "node-b",
|
||||
Endpoint: "relay://fabric/node-b",
|
||||
BestTransport: "relay",
|
||||
BestReachability: "relay",
|
||||
BestConnectivity: "relay_required",
|
||||
},
|
||||
}},
|
||||
RecoveryPlan: PeerRecoveryPlan{
|
||||
Mode: PeerRecoveryModeRecovery,
|
||||
Candidates: []PeerRecoveryCandidate{
|
||||
{
|
||||
NodeID: "node-b",
|
||||
Endpoint: "relay://fabric/node-b",
|
||||
ConnectionState: PeerConnectionWaiting,
|
||||
Reason: "recover_warm",
|
||||
Priority: 100,
|
||||
},
|
||||
},
|
||||
},
|
||||
RendezvousLeases: []PeerRendezvousLease{
|
||||
{
|
||||
LeaseID: "lease-expired-preferred",
|
||||
PeerNodeID: "node-b",
|
||||
RelayNodeID: "node-r-old",
|
||||
RelayEndpoint: "http://node-r-old:19000",
|
||||
Transport: "relay_control",
|
||||
ConnectivityMode: "relay_required",
|
||||
Priority: 1,
|
||||
ControlPlaneOnly: true,
|
||||
IssuedAt: now.Add(-10 * time.Minute),
|
||||
ExpiresAt: now.Add(-time.Second),
|
||||
},
|
||||
{
|
||||
LeaseID: "lease-active-reselected",
|
||||
PeerNodeID: "node-b",
|
||||
RelayNodeID: "node-r-new",
|
||||
RelayEndpoint: "http://node-r-new:19000",
|
||||
Transport: "relay_control",
|
||||
ConnectivityMode: "relay_required",
|
||||
Priority: 20,
|
||||
ControlPlaneOnly: true,
|
||||
IssuedAt: now.Add(-time.Minute),
|
||||
ExpiresAt: now.Add(time.Minute),
|
||||
},
|
||||
},
|
||||
Now: now,
|
||||
})
|
||||
|
||||
if plan.RendezvousResolvedCount != 1 || plan.RelayControlCount != 1 || plan.RendezvousRequiredCount != 0 {
|
||||
t.Fatalf("unexpected reselected plan counts: %+v", plan)
|
||||
}
|
||||
intent := plan.Intents[0]
|
||||
if intent.RendezvousLeaseID != "lease-active-reselected" ||
|
||||
intent.RelayNodeID != "node-r-new" ||
|
||||
intent.Endpoint != "http://node-r-new:19000" {
|
||||
t.Fatalf("expired lease was not skipped: %+v", intent)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerConnectionIntentsClassifyPrivateEndpointWithoutCandidateHints(t *testing.T) {
|
||||
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
|
||||
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
|
||||
{NodeID: "node-b", Endpoint: "http://192.168.10.20:19001"},
|
||||
}},
|
||||
RecoveryPlan: PeerRecoveryPlan{Candidates: []PeerRecoveryCandidate{
|
||||
{
|
||||
NodeID: "node-b",
|
||||
Endpoint: "http://192.168.10.20:19001",
|
||||
ConnectionState: PeerConnectionDisconnected,
|
||||
Reason: "recover_peer",
|
||||
Priority: 10,
|
||||
},
|
||||
}},
|
||||
Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
|
||||
})
|
||||
|
||||
if plan.PrivateLANCount != 1 || plan.Intents[0].TransportMode != PeerTransportModePrivateLAN || !plan.Intents[0].DirectCandidate {
|
||||
t.Fatalf("unexpected private endpoint classification: %+v", plan)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,304 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Link statuses reported in PeerConnectionProbeResult.LinkStatus.
const (
	// PeerConnectionProbeReachable marks a probe whose health request succeeded.
	PeerConnectionProbeReachable = "reachable"

	// PeerConnectionProbeUnreachable marks a probe whose health request failed.
	PeerConnectionProbeUnreachable = "unreachable"

	// PeerConnectionProbeDeferred marks an intent that was not probed because
	// rendezvous is still required or no usable candidate endpoint exists.
	PeerConnectionProbeDeferred = "deferred"

	// PeerConnectionProbeSkipped marks an intent that was not probed because
	// the tracker reported that the peer should not be probed yet.
	PeerConnectionProbeSkipped = "skipped"
)
|
||||
|
||||
// DefaultPeerConnectionProbeTimeout is the per-probe timeout used when the
// manager configuration does not supply a positive ProbeTimeout.
const DefaultPeerConnectionProbeTimeout = 2 * time.Second
|
||||
|
||||
type PeerConnectionManagerConfig struct {
|
||||
Local PeerIdentity
|
||||
PeerCache *PeerCache
|
||||
Tracker *PeerConnectionTracker
|
||||
RendezvousLeases []PeerRendezvousLease
|
||||
HTTPClient *http.Client
|
||||
ProbeTimeout time.Duration
|
||||
Now func() time.Time
|
||||
}
|
||||
|
||||
type PeerConnectionManager struct {
|
||||
local PeerIdentity
|
||||
peerCache *PeerCache
|
||||
tracker *PeerConnectionTracker
|
||||
rendezvousLeases []PeerRendezvousLease
|
||||
httpClient *http.Client
|
||||
probeTimeout time.Duration
|
||||
now func() time.Time
|
||||
|
||||
mu sync.Mutex
|
||||
lastCycle PeerConnectionManagerCycle
|
||||
}
|
||||
|
||||
type PeerConnectionManagerCycle struct {
|
||||
Mode string `json:"mode"`
|
||||
StartedAt time.Time `json:"started_at"`
|
||||
CompletedAt time.Time `json:"completed_at"`
|
||||
ProbeTimeoutMs int `json:"probe_timeout_ms"`
|
||||
IntentCount int `json:"intent_count"`
|
||||
Attempted int `json:"attempted"`
|
||||
Succeeded int `json:"succeeded"`
|
||||
Failed int `json:"failed"`
|
||||
Deferred int `json:"deferred"`
|
||||
Skipped int `json:"skipped"`
|
||||
RendezvousRequiredCount int `json:"rendezvous_required_count"`
|
||||
RendezvousResolvedCount int `json:"rendezvous_resolved_count"`
|
||||
RelayControlCount int `json:"relay_control_count"`
|
||||
RecoveryPlan PeerRecoveryPlan `json:"recovery_plan"`
|
||||
IntentPlan PeerConnectionIntentPlan `json:"intent_plan"`
|
||||
Results []PeerConnectionProbeResult `json:"results,omitempty"`
|
||||
}
|
||||
|
||||
type PeerConnectionManagerSnapshot struct {
|
||||
LastCycle PeerConnectionManagerCycle `json:"last_cycle"`
|
||||
}
|
||||
|
||||
type PeerConnectionProbeResult struct {
|
||||
NodeID string `json:"node_id"`
|
||||
LinkStatus string `json:"link_status"`
|
||||
Action string `json:"action"`
|
||||
Reason string `json:"reason"`
|
||||
Endpoint string `json:"endpoint,omitempty"`
|
||||
ConnectionState PeerConnectionState `json:"connection_state"`
|
||||
TransportMode string `json:"transport_mode"`
|
||||
RequiresRendezvous bool `json:"requires_rendezvous"`
|
||||
RendezvousResolved bool `json:"rendezvous_resolved"`
|
||||
DirectCandidate bool `json:"direct_candidate"`
|
||||
RelayCandidate bool `json:"relay_candidate"`
|
||||
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
|
||||
RelayNodeID string `json:"relay_node_id,omitempty"`
|
||||
RelayEndpoint string `json:"relay_endpoint,omitempty"`
|
||||
LatencyMs int `json:"latency_ms,omitempty"`
|
||||
FailureReason string `json:"failure_reason,omitempty"`
|
||||
StartedAt time.Time `json:"started_at"`
|
||||
CompletedAt time.Time `json:"completed_at"`
|
||||
}
|
||||
|
||||
func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionManager {
|
||||
probeTimeout := cfg.ProbeTimeout
|
||||
if probeTimeout <= 0 {
|
||||
probeTimeout = DefaultPeerConnectionProbeTimeout
|
||||
}
|
||||
httpClient := cfg.HTTPClient
|
||||
if httpClient == nil {
|
||||
httpClient = &http.Client{
|
||||
Transport: &http.Transport{
|
||||
MaxIdleConns: 64,
|
||||
MaxIdleConnsPerHost: 8,
|
||||
IdleConnTimeout: 90 * time.Second,
|
||||
},
|
||||
Timeout: probeTimeout + time.Second,
|
||||
}
|
||||
}
|
||||
now := cfg.Now
|
||||
if now == nil {
|
||||
now = func() time.Time { return time.Now().UTC() }
|
||||
}
|
||||
return &PeerConnectionManager{
|
||||
local: cfg.Local,
|
||||
peerCache: cfg.PeerCache,
|
||||
tracker: cfg.Tracker,
|
||||
rendezvousLeases: append([]PeerRendezvousLease{}, cfg.RendezvousLeases...),
|
||||
httpClient: httpClient,
|
||||
probeTimeout: probeTimeout,
|
||||
now: now,
|
||||
}
|
||||
}
|
||||
|
||||
func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionManagerCycle {
|
||||
peerCache, rendezvousLeases := m.peerConfigSnapshot()
|
||||
if m == nil || peerCache == nil || m.tracker == nil {
|
||||
return PeerConnectionManagerCycle{}
|
||||
}
|
||||
startedAt := normalizedNow(m.now())
|
||||
peerSnapshot := peerCache.Snapshot()
|
||||
recoveryPlan := PlanPeerRecovery(PeerRecoveryPlanConfig{
|
||||
PeerCache: peerSnapshot,
|
||||
Connections: m.tracker.Snapshot(),
|
||||
TargetReadyPeers: DefaultStablePeerTarget,
|
||||
MaxProbeCandidates: DefaultRecoveryProbeLimit,
|
||||
Now: startedAt,
|
||||
})
|
||||
intentPlan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
|
||||
PeerCache: peerSnapshot,
|
||||
RecoveryPlan: recoveryPlan,
|
||||
RendezvousLeases: rendezvousLeases,
|
||||
Now: startedAt,
|
||||
})
|
||||
cycle := PeerConnectionManagerCycle{
|
||||
Mode: recoveryPlan.Mode,
|
||||
StartedAt: startedAt,
|
||||
ProbeTimeoutMs: int(m.probeTimeout.Milliseconds()),
|
||||
IntentCount: intentPlan.IntentCount,
|
||||
RendezvousRequiredCount: intentPlan.RendezvousRequiredCount,
|
||||
RendezvousResolvedCount: intentPlan.RendezvousResolvedCount,
|
||||
RelayControlCount: intentPlan.RelayControlCount,
|
||||
RecoveryPlan: recoveryPlan,
|
||||
IntentPlan: intentPlan,
|
||||
Results: make([]PeerConnectionProbeResult, 0, len(intentPlan.Intents)),
|
||||
}
|
||||
for _, intent := range intentPlan.Intents {
|
||||
result := m.probeIntent(ctx, intent)
|
||||
cycle.Results = append(cycle.Results, result)
|
||||
switch result.LinkStatus {
|
||||
case PeerConnectionProbeReachable:
|
||||
cycle.Attempted++
|
||||
cycle.Succeeded++
|
||||
case PeerConnectionProbeUnreachable:
|
||||
cycle.Attempted++
|
||||
cycle.Failed++
|
||||
case PeerConnectionProbeDeferred:
|
||||
cycle.Deferred++
|
||||
case PeerConnectionProbeSkipped:
|
||||
cycle.Skipped++
|
||||
}
|
||||
}
|
||||
cycle.CompletedAt = normalizedNow(m.now())
|
||||
m.mu.Lock()
|
||||
m.lastCycle = cycle
|
||||
m.mu.Unlock()
|
||||
return cycle
|
||||
}
|
||||
|
||||
func (m *PeerConnectionManager) Snapshot() PeerConnectionManagerSnapshot {
|
||||
if m == nil {
|
||||
return PeerConnectionManagerSnapshot{}
|
||||
}
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
return PeerConnectionManagerSnapshot{LastCycle: m.lastCycle}
|
||||
}
|
||||
|
||||
func (m *PeerConnectionManager) UpdatePeerConfig(peerCache *PeerCache, rendezvousLeases []PeerRendezvousLease) {
|
||||
if m == nil {
|
||||
return
|
||||
}
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
m.peerCache = peerCache
|
||||
m.rendezvousLeases = append([]PeerRendezvousLease{}, rendezvousLeases...)
|
||||
}
|
||||
|
||||
func (m *PeerConnectionManager) peerConfigSnapshot() (*PeerCache, []PeerRendezvousLease) {
|
||||
if m == nil {
|
||||
return nil, nil
|
||||
}
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
return m.peerCache, append([]PeerRendezvousLease{}, m.rendezvousLeases...)
|
||||
}
|
||||
|
||||
func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConnectionIntent) PeerConnectionProbeResult {
|
||||
startedAt := normalizedNow(m.now())
|
||||
result := PeerConnectionProbeResult{
|
||||
NodeID: intent.NodeID,
|
||||
Action: intent.Action,
|
||||
Reason: intent.Reason,
|
||||
Endpoint: intent.Endpoint,
|
||||
TransportMode: intent.TransportMode,
|
||||
RequiresRendezvous: intent.RequiresRendezvous,
|
||||
RendezvousResolved: intent.RendezvousResolved,
|
||||
DirectCandidate: intent.DirectCandidate,
|
||||
RelayCandidate: intent.RelayCandidate,
|
||||
RendezvousLeaseID: intent.RendezvousLeaseID,
|
||||
RelayNodeID: intent.RelayNodeID,
|
||||
RelayEndpoint: intent.RelayEndpoint,
|
||||
StartedAt: startedAt,
|
||||
}
|
||||
peer := PeerCacheEntry{
|
||||
NodeID: intent.NodeID,
|
||||
Endpoint: intent.Endpoint,
|
||||
Warm: true,
|
||||
WarmReason: intent.Reason,
|
||||
RecoverySeed: intent.RecoverySeed,
|
||||
BestCandidateID: intent.BestCandidateID,
|
||||
BestTransport: intent.Transport,
|
||||
RendezvousLeaseID: intent.RendezvousLeaseID,
|
||||
RelayNodeID: intent.RelayNodeID,
|
||||
RelayEndpoint: intent.RelayEndpoint,
|
||||
RelayControl: intent.RelayCandidate,
|
||||
}
|
||||
if intent.RequiresRendezvous {
|
||||
result.LinkStatus = PeerConnectionProbeDeferred
|
||||
result.FailureReason = "rendezvous_required"
|
||||
result.ConnectionState = m.tracker.RecordDeferred(peer, result.FailureReason, startedAt)
|
||||
result.CompletedAt = normalizedNow(m.now())
|
||||
return result
|
||||
}
|
||||
if strings.TrimSpace(intent.Endpoint) == "" || (!intent.DirectCandidate && !intent.RelayCandidate) {
|
||||
result.LinkStatus = PeerConnectionProbeDeferred
|
||||
result.FailureReason = "direct_candidate_unavailable"
|
||||
if intent.RelayCandidate {
|
||||
result.FailureReason = "relay_candidate_unavailable"
|
||||
}
|
||||
result.ConnectionState = m.tracker.RecordDeferred(peer, result.FailureReason, startedAt)
|
||||
result.CompletedAt = normalizedNow(m.now())
|
||||
return result
|
||||
}
|
||||
if !m.tracker.ShouldProbe(intent.NodeID, startedAt) {
|
||||
result.LinkStatus = PeerConnectionProbeSkipped
|
||||
result.FailureReason = "backoff_active"
|
||||
result.ConnectionState = m.connectionState(intent.NodeID)
|
||||
result.CompletedAt = normalizedNow(m.now())
|
||||
return result
|
||||
}
|
||||
m.tracker.BeginProbe(peer, startedAt)
|
||||
probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout)
|
||||
defer cancel()
|
||||
target := PeerIdentity{
|
||||
ClusterID: m.local.ClusterID,
|
||||
NodeID: intent.NodeID,
|
||||
}
|
||||
if intent.RelayCandidate && intent.RelayNodeID != "" {
|
||||
target.NodeID = intent.RelayNodeID
|
||||
}
|
||||
_, err := NewClient(strings.TrimRight(intent.Endpoint, "/")).withHTTPClient(m.httpClient).SendHealth(probeCtx, NewHealthMessage(m.local, target))
|
||||
completedAt := normalizedNow(m.now())
|
||||
if err != nil {
|
||||
result.LinkStatus = PeerConnectionProbeUnreachable
|
||||
result.FailureReason = err.Error()
|
||||
result.ConnectionState = m.tracker.RecordFailure(intent.NodeID, err.Error(), completedAt)
|
||||
result.CompletedAt = completedAt
|
||||
return result
|
||||
}
|
||||
latency := int(completedAt.Sub(startedAt).Milliseconds())
|
||||
if latency < 0 {
|
||||
latency = 0
|
||||
}
|
||||
result.LinkStatus = PeerConnectionProbeReachable
|
||||
result.LatencyMs = latency
|
||||
if intent.RelayCandidate {
|
||||
result.ConnectionState = m.tracker.RecordRelayReady(peer, latency, completedAt)
|
||||
} else {
|
||||
result.ConnectionState = m.tracker.RecordSuccess(intent.NodeID, latency, completedAt)
|
||||
}
|
||||
result.CompletedAt = completedAt
|
||||
return result
|
||||
}
|
||||
|
||||
func (m *PeerConnectionManager) connectionState(nodeID string) PeerConnectionState {
|
||||
snapshot := m.tracker.Snapshot()
|
||||
for _, entry := range snapshot.Entries {
|
||||
if entry.NodeID == nodeID {
|
||||
return entry
|
||||
}
|
||||
}
|
||||
return PeerConnectionState{NodeID: nodeID, State: PeerConnectionDisconnected}
|
||||
}
|
||||
|
||||
func (c Client) withHTTPClient(httpClient *http.Client) Client {
|
||||
c.HTTPClient = httpClient
|
||||
return c
|
||||
}
|
||||
@@ -0,0 +1,190 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestPeerConnectionManagerProbesDirectAndDefersRendezvous(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
current := now
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
|
||||
cache := NewPeerCache(PeerCacheConfig{
|
||||
Local: local,
|
||||
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
|
||||
"node-b": {
|
||||
{
|
||||
EndpointID: "node-b-direct",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: server.URL,
|
||||
Reachability: "private",
|
||||
ConnectivityMode: "direct",
|
||||
PolicyTags: []string{"corp-lan", "same-site"},
|
||||
Priority: 1,
|
||||
},
|
||||
},
|
||||
"node-c": {
|
||||
{
|
||||
EndpointID: "node-c-relay",
|
||||
NodeID: "node-c",
|
||||
Transport: "relay",
|
||||
Address: "relay://fabric/node-c",
|
||||
Reachability: "relay",
|
||||
ConnectivityMode: "relay_required",
|
||||
Priority: 1,
|
||||
},
|
||||
},
|
||||
},
|
||||
WarmPeerLimit: 2,
|
||||
Now: now,
|
||||
})
|
||||
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
|
||||
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
|
||||
Local: local,
|
||||
PeerCache: cache,
|
||||
Tracker: tracker,
|
||||
ProbeTimeout: time.Second,
|
||||
Now: func() time.Time {
|
||||
current = current.Add(10 * time.Millisecond)
|
||||
return current
|
||||
},
|
||||
})
|
||||
|
||||
cycle := manager.ProbeOnce(context.Background())
|
||||
if cycle.Attempted != 1 || cycle.Succeeded != 1 || cycle.Deferred != 1 || cycle.RendezvousRequiredCount != 1 {
|
||||
t.Fatalf("unexpected cycle: %+v", cycle)
|
||||
}
|
||||
snapshot := tracker.Snapshot()
|
||||
if snapshot.Ready != 1 || snapshot.Waiting != 1 {
|
||||
t.Fatalf("unexpected tracker snapshot: %+v", snapshot)
|
||||
}
|
||||
if cycle.Results[0].NodeID != "node-b" || cycle.Results[0].LinkStatus != PeerConnectionProbeReachable {
|
||||
t.Fatalf("direct peer was not probed first: %+v", cycle.Results)
|
||||
}
|
||||
if cycle.Results[1].NodeID != "node-c" || cycle.Results[1].LinkStatus != PeerConnectionProbeDeferred {
|
||||
t.Fatalf("relay peer was not deferred: %+v", cycle.Results)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
current := now
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
|
||||
cache := NewPeerCache(PeerCacheConfig{
|
||||
Local: local,
|
||||
PeerEndpoints: map[string]string{
|
||||
"node-b": "http://127.0.0.1:1",
|
||||
},
|
||||
WarmPeerLimit: 1,
|
||||
Now: now,
|
||||
})
|
||||
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
|
||||
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
|
||||
Local: local,
|
||||
PeerCache: cache,
|
||||
Tracker: tracker,
|
||||
HTTPClient: &http.Client{Timeout: 20 * time.Millisecond},
|
||||
ProbeTimeout: 20 * time.Millisecond,
|
||||
Now: func() time.Time {
|
||||
current = current.Add(10 * time.Millisecond)
|
||||
return current
|
||||
},
|
||||
})
|
||||
|
||||
for i := 0; i < 3; i++ {
|
||||
manager.ProbeOnce(context.Background())
|
||||
}
|
||||
backoff := tracker.Snapshot()
|
||||
if backoff.Backoff != 1 {
|
||||
t.Fatalf("expected backoff after repeated failures: %+v", backoff)
|
||||
}
|
||||
cycle := manager.ProbeOnce(context.Background())
|
||||
if cycle.Attempted != 0 || len(cycle.Results) != 0 {
|
||||
t.Fatalf("active backoff peer should not be attempted: %+v", cycle)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
current := now
|
||||
server := httptest.NewServer(Server{
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
|
||||
leases := []PeerRendezvousLease{
|
||||
{
|
||||
LeaseID: "lease-node-b-via-node-r",
|
||||
PeerNodeID: "node-b",
|
||||
RelayNodeID: "node-r",
|
||||
RelayEndpoint: server.URL,
|
||||
Transport: "relay_control",
|
||||
ConnectivityMode: "relay_required",
|
||||
Priority: 10,
|
||||
ControlPlaneOnly: true,
|
||||
IssuedAt: now.Add(-time.Minute),
|
||||
ExpiresAt: now.Add(time.Minute),
|
||||
},
|
||||
}
|
||||
cache := NewPeerCache(PeerCacheConfig{
|
||||
Local: local,
|
||||
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
|
||||
"node-b": {
|
||||
{
|
||||
EndpointID: "node-b-relay",
|
||||
NodeID: "node-b",
|
||||
Transport: "relay",
|
||||
Address: "relay://fabric/node-b",
|
||||
Reachability: "relay",
|
||||
ConnectivityMode: "relay_required",
|
||||
Priority: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
RendezvousLeases: leases,
|
||||
WarmPeerLimit: 1,
|
||||
Now: now,
|
||||
})
|
||||
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
|
||||
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
|
||||
Local: local,
|
||||
PeerCache: cache,
|
||||
Tracker: tracker,
|
||||
RendezvousLeases: leases,
|
||||
ProbeTimeout: time.Second,
|
||||
Now: func() time.Time {
|
||||
current = current.Add(10 * time.Millisecond)
|
||||
return current
|
||||
},
|
||||
})
|
||||
|
||||
cycle := manager.ProbeOnce(context.Background())
|
||||
if cycle.Attempted != 1 ||
|
||||
cycle.Succeeded != 1 ||
|
||||
cycle.Deferred != 0 ||
|
||||
cycle.RelayControlCount != 1 ||
|
||||
cycle.RendezvousResolvedCount != 1 ||
|
||||
cycle.RendezvousRequiredCount != 0 {
|
||||
t.Fatalf("unexpected relay-control cycle: %+v", cycle)
|
||||
}
|
||||
if len(cycle.Results) != 1 ||
|
||||
cycle.Results[0].NodeID != "node-b" ||
|
||||
cycle.Results[0].RelayNodeID != "node-r" ||
|
||||
cycle.Results[0].ConnectionState.State != PeerConnectionRelayReady {
|
||||
t.Fatalf("unexpected relay-control result: %+v", cycle.Results)
|
||||
}
|
||||
snapshot := tracker.Snapshot()
|
||||
if snapshot.RelayReady != 1 || snapshot.Waiting != 0 {
|
||||
t.Fatalf("unexpected tracker snapshot: %+v", snapshot)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,284 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Connection states stored in PeerConnectionState.State. The string values are
// also the keys of PeerConnectionSnapshot.StateCounts and appear in JSON output.
const (
	// PeerConnectionDisconnected is the state given to a freshly seeded entry.
	PeerConnectionDisconnected = "disconnected"
	// PeerConnectionConnecting is set by BeginProbe while a probe is in flight.
	PeerConnectionConnecting = "connecting"
	// PeerConnectionReady is set by RecordSuccess for a low-latency probe.
	PeerConnectionReady = "ready"
	// PeerConnectionRelayReady is set by RecordRelayReady.
	PeerConnectionRelayReady = "relay_ready"
	// PeerConnectionDegraded is set by RecordSuccess for a high-latency probe
	// and by RecordFailure before the backoff threshold is reached.
	PeerConnectionDegraded = "degraded"
	// PeerConnectionBackoff is set by RecordFailure once the peer has failed
	// three or more times in a row.
	PeerConnectionBackoff = "backoff"
	// PeerConnectionWaiting is set by RecordDeferred.
	PeerConnectionWaiting = "waiting_rendezvous"
)
|
||||
|
||||
// Backoff tuning used by peerConnectionBackoffDuration.
const (
	// peerConnectionBackoffBase is the backoff applied at the third
	// consecutive failure and the increment added for each failure after it.
	peerConnectionBackoffBase = 5 * time.Second
	// peerConnectionBackoffMax caps the backoff regardless of failure count.
	peerConnectionBackoffMax = time.Minute
)
|
||||
|
||||
type PeerConnectionTracker struct {
|
||||
mu sync.Mutex
|
||||
entries map[string]PeerConnectionState
|
||||
}
|
||||
|
||||
// PeerConnectionState is the tracker's view of one peer. Values are stored and
// returned by value, so callers receive independent copies.
//
// NOTE: LastProbeAt and BackoffUntil carry `omitempty`, but encoding/json never
// treats a struct value such as a zero time.Time as empty, so both timestamps
// are always present in the JSON output.
type PeerConnectionState struct {
	NodeID string `json:"node_id"`
	// State is one of the PeerConnection* state constants.
	State string `json:"state"`
	// Warm, WarmReason, Endpoint and BestCandidateID mirror the peer cache entry.
	Warm            bool   `json:"warm"`
	WarmReason      string `json:"warm_reason,omitempty"`
	Endpoint        string `json:"endpoint,omitempty"`
	BestCandidateID string `json:"best_candidate_id,omitempty"`
	// Rendezvous and relay metadata, likewise mirrored from the peer cache entry.
	RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
	RelayNodeID       string `json:"relay_node_id,omitempty"`
	RelayEndpoint     string `json:"relay_endpoint,omitempty"`
	RelayControl      bool   `json:"relay_control"`
	// ConsecutiveSuccesses is incremented by the success recorders and reset
	// by RecordFailure; ConsecutiveFailures is the mirror image.
	ConsecutiveSuccesses int `json:"consecutive_successes"`
	ConsecutiveFailures  int `json:"consecutive_failures"`
	// LastLatencyMs is the latency reported with the most recent success.
	LastLatencyMs int `json:"last_latency_ms,omitempty"`
	// LastFailureReason holds the reason passed to RecordFailure or
	// RecordDeferred and is cleared by a success.
	LastFailureReason string `json:"last_failure_reason,omitempty"`
	// LastTransitionAt is when State last changed.
	LastTransitionAt time.Time `json:"last_transition_at"`
	LastProbeAt      time.Time `json:"last_probe_at,omitempty"`
	// BackoffUntil is set by RecordFailure on entering backoff and zeroed by
	// a success or a deferral.
	BackoffUntil time.Time `json:"backoff_until,omitempty"`
}
|
||||
|
||||
type PeerConnectionSnapshot struct {
|
||||
Total int `json:"total"`
|
||||
Ready int `json:"ready"`
|
||||
RelayReady int `json:"relay_ready"`
|
||||
Degraded int `json:"degraded"`
|
||||
Backoff int `json:"backoff"`
|
||||
Waiting int `json:"waiting_rendezvous"`
|
||||
Connecting int `json:"connecting"`
|
||||
Disconnected int `json:"disconnected"`
|
||||
StateCounts map[string]int `json:"state_counts"`
|
||||
Entries []PeerConnectionState `json:"entries"`
|
||||
LastTransitionAt time.Time `json:"last_transition_at,omitempty"`
|
||||
}
|
||||
|
||||
func NewPeerConnectionTracker(peerSnapshot PeerCacheSnapshot, now time.Time) *PeerConnectionTracker {
|
||||
now = normalizedNow(now)
|
||||
tracker := &PeerConnectionTracker{entries: map[string]PeerConnectionState{}}
|
||||
for _, peer := range peerSnapshot.Entries {
|
||||
if !peer.Warm || peer.NodeID == "" {
|
||||
continue
|
||||
}
|
||||
tracker.entries[peer.NodeID] = PeerConnectionState{
|
||||
NodeID: peer.NodeID,
|
||||
State: PeerConnectionDisconnected,
|
||||
Warm: peer.Warm,
|
||||
WarmReason: peer.WarmReason,
|
||||
Endpoint: peer.Endpoint,
|
||||
BestCandidateID: peer.BestCandidateID,
|
||||
LastTransitionAt: now,
|
||||
}
|
||||
}
|
||||
return tracker
|
||||
}
|
||||
|
||||
func (t *PeerConnectionTracker) ShouldProbe(nodeID string, now time.Time) bool {
|
||||
if t == nil {
|
||||
return true
|
||||
}
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
entry, ok := t.entries[nodeID]
|
||||
if !ok {
|
||||
return true
|
||||
}
|
||||
now = normalizedNow(now)
|
||||
return entry.State != PeerConnectionBackoff || entry.BackoffUntil.IsZero() || !entry.BackoffUntil.After(now)
|
||||
}
|
||||
|
||||
func (t *PeerConnectionTracker) BeginProbe(peer PeerCacheEntry, now time.Time) PeerConnectionState {
|
||||
if t == nil {
|
||||
return PeerConnectionState{}
|
||||
}
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
now = normalizedNow(now)
|
||||
entry := t.entry(peer, now)
|
||||
if entry.State != PeerConnectionReady && entry.State != PeerConnectionDegraded {
|
||||
entry.State = PeerConnectionConnecting
|
||||
entry.LastTransitionAt = now
|
||||
}
|
||||
entry.LastProbeAt = now
|
||||
t.entries[peer.NodeID] = entry
|
||||
return entry
|
||||
}
|
||||
|
||||
func (t *PeerConnectionTracker) RecordSuccess(nodeID string, latencyMs int, now time.Time) PeerConnectionState {
|
||||
if t == nil {
|
||||
return PeerConnectionState{}
|
||||
}
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
now = normalizedNow(now)
|
||||
entry := t.entries[nodeID]
|
||||
entry.NodeID = nodeID
|
||||
entry.ConsecutiveSuccesses++
|
||||
entry.ConsecutiveFailures = 0
|
||||
entry.LastLatencyMs = latencyMs
|
||||
entry.LastFailureReason = ""
|
||||
entry.LastProbeAt = now
|
||||
entry.BackoffUntil = time.Time{}
|
||||
nextState := PeerConnectionReady
|
||||
if latencyMs >= 500 {
|
||||
nextState = PeerConnectionDegraded
|
||||
}
|
||||
if entry.State != nextState {
|
||||
entry.State = nextState
|
||||
entry.LastTransitionAt = now
|
||||
}
|
||||
t.entries[nodeID] = entry
|
||||
return entry
|
||||
}
|
||||
|
||||
func (t *PeerConnectionTracker) RecordRelayReady(peer PeerCacheEntry, latencyMs int, now time.Time) PeerConnectionState {
|
||||
if t == nil {
|
||||
return PeerConnectionState{}
|
||||
}
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
now = normalizedNow(now)
|
||||
entry := t.entry(peer, now)
|
||||
entry.ConsecutiveSuccesses++
|
||||
entry.ConsecutiveFailures = 0
|
||||
entry.LastLatencyMs = latencyMs
|
||||
entry.LastFailureReason = ""
|
||||
entry.LastProbeAt = now
|
||||
entry.BackoffUntil = time.Time{}
|
||||
if entry.State != PeerConnectionRelayReady {
|
||||
entry.State = PeerConnectionRelayReady
|
||||
entry.LastTransitionAt = now
|
||||
}
|
||||
t.entries[peer.NodeID] = entry
|
||||
return entry
|
||||
}
|
||||
|
||||
func (t *PeerConnectionTracker) RecordFailure(nodeID string, reason string, now time.Time) PeerConnectionState {
|
||||
if t == nil {
|
||||
return PeerConnectionState{}
|
||||
}
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
now = normalizedNow(now)
|
||||
entry := t.entries[nodeID]
|
||||
entry.NodeID = nodeID
|
||||
entry.ConsecutiveFailures++
|
||||
entry.ConsecutiveSuccesses = 0
|
||||
entry.LastFailureReason = reason
|
||||
entry.LastProbeAt = now
|
||||
nextState := PeerConnectionDegraded
|
||||
if entry.ConsecutiveFailures >= 3 {
|
||||
nextState = PeerConnectionBackoff
|
||||
entry.BackoffUntil = now.Add(peerConnectionBackoffDuration(entry.ConsecutiveFailures))
|
||||
}
|
||||
if entry.State != nextState {
|
||||
entry.State = nextState
|
||||
entry.LastTransitionAt = now
|
||||
}
|
||||
t.entries[nodeID] = entry
|
||||
return entry
|
||||
}
|
||||
|
||||
func (t *PeerConnectionTracker) RecordDeferred(peer PeerCacheEntry, reason string, now time.Time) PeerConnectionState {
|
||||
if t == nil {
|
||||
return PeerConnectionState{}
|
||||
}
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
now = normalizedNow(now)
|
||||
entry := t.entry(peer, now)
|
||||
entry.State = PeerConnectionWaiting
|
||||
entry.LastFailureReason = reason
|
||||
entry.LastProbeAt = time.Time{}
|
||||
entry.LastTransitionAt = now
|
||||
entry.BackoffUntil = time.Time{}
|
||||
t.entries[peer.NodeID] = entry
|
||||
return entry
|
||||
}
|
||||
|
||||
func (t *PeerConnectionTracker) Snapshot() PeerConnectionSnapshot {
|
||||
if t == nil {
|
||||
return PeerConnectionSnapshot{StateCounts: map[string]int{}}
|
||||
}
|
||||
t.mu.Lock()
|
||||
defer t.mu.Unlock()
|
||||
entries := make([]PeerConnectionState, 0, len(t.entries))
|
||||
counts := map[string]int{
|
||||
PeerConnectionDisconnected: 0,
|
||||
PeerConnectionConnecting: 0,
|
||||
PeerConnectionReady: 0,
|
||||
PeerConnectionRelayReady: 0,
|
||||
PeerConnectionDegraded: 0,
|
||||
PeerConnectionBackoff: 0,
|
||||
PeerConnectionWaiting: 0,
|
||||
}
|
||||
var lastTransition time.Time
|
||||
for _, entry := range t.entries {
|
||||
entries = append(entries, entry)
|
||||
counts[entry.State]++
|
||||
if entry.LastTransitionAt.After(lastTransition) {
|
||||
lastTransition = entry.LastTransitionAt
|
||||
}
|
||||
}
|
||||
sort.SliceStable(entries, func(i, j int) bool {
|
||||
return entries[i].NodeID < entries[j].NodeID
|
||||
})
|
||||
return PeerConnectionSnapshot{
|
||||
Total: len(entries),
|
||||
Ready: counts[PeerConnectionReady],
|
||||
RelayReady: counts[PeerConnectionRelayReady],
|
||||
Degraded: counts[PeerConnectionDegraded],
|
||||
Backoff: counts[PeerConnectionBackoff],
|
||||
Waiting: counts[PeerConnectionWaiting],
|
||||
Connecting: counts[PeerConnectionConnecting],
|
||||
Disconnected: counts[PeerConnectionDisconnected],
|
||||
StateCounts: counts,
|
||||
Entries: entries,
|
||||
LastTransitionAt: lastTransition,
|
||||
}
|
||||
}
|
||||
|
||||
func (t *PeerConnectionTracker) entry(peer PeerCacheEntry, now time.Time) PeerConnectionState {
|
||||
entry, ok := t.entries[peer.NodeID]
|
||||
if !ok {
|
||||
entry = PeerConnectionState{
|
||||
NodeID: peer.NodeID,
|
||||
State: PeerConnectionDisconnected,
|
||||
LastTransitionAt: now,
|
||||
}
|
||||
}
|
||||
entry.Warm = peer.Warm
|
||||
entry.WarmReason = peer.WarmReason
|
||||
entry.Endpoint = peer.Endpoint
|
||||
entry.BestCandidateID = peer.BestCandidateID
|
||||
entry.RendezvousLeaseID = peer.RendezvousLeaseID
|
||||
entry.RelayNodeID = peer.RelayNodeID
|
||||
entry.RelayEndpoint = peer.RelayEndpoint
|
||||
entry.RelayControl = peer.RelayControl
|
||||
return entry
|
||||
}
|
||||
|
||||
func peerConnectionBackoffDuration(failures int) time.Duration {
|
||||
if failures < 3 {
|
||||
return 0
|
||||
}
|
||||
backoff := peerConnectionBackoffBase * time.Duration(failures-2)
|
||||
if backoff > peerConnectionBackoffMax {
|
||||
return peerConnectionBackoffMax
|
||||
}
|
||||
return backoff
|
||||
}
|
||||
|
||||
// normalizedNow returns now converted to UTC. A zero now is replaced by the
// current wall-clock time in UTC, so callers may pass time.Time{} to mean
// "right now".
func normalizedNow(now time.Time) time.Time {
	if now.IsZero() {
		return time.Now().UTC()
	}
	return now.UTC()
}
|
||||
@@ -0,0 +1,76 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestPeerConnectionTrackerTransitionsReadyAndDegraded(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
tracker := NewPeerConnectionTracker(PeerCacheSnapshot{
|
||||
Entries: []PeerCacheEntry{
|
||||
{NodeID: "node-b", Warm: true, WarmReason: "route_adjacent", Endpoint: "http://node-b:19000"},
|
||||
},
|
||||
}, now)
|
||||
|
||||
begin := tracker.BeginProbe(PeerCacheEntry{NodeID: "node-b", Warm: true}, now.Add(time.Second))
|
||||
if begin.State != PeerConnectionConnecting {
|
||||
t.Fatalf("begin state = %q, want connecting", begin.State)
|
||||
}
|
||||
ready := tracker.RecordSuccess("node-b", 42, now.Add(2*time.Second))
|
||||
if ready.State != PeerConnectionReady || ready.ConsecutiveSuccesses != 1 || ready.ConsecutiveFailures != 0 {
|
||||
t.Fatalf("ready state unexpected: %+v", ready)
|
||||
}
|
||||
degraded := tracker.RecordSuccess("node-b", 800, now.Add(3*time.Second))
|
||||
if degraded.State != PeerConnectionDegraded || degraded.LastLatencyMs != 800 {
|
||||
t.Fatalf("degraded state unexpected: %+v", degraded)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerConnectionTrackerBackoffAfterRepeatedFailures(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
tracker := NewPeerConnectionTracker(PeerCacheSnapshot{
|
||||
Entries: []PeerCacheEntry{{NodeID: "node-b", Warm: true}},
|
||||
}, now)
|
||||
|
||||
first := tracker.RecordFailure("node-b", "timeout", now.Add(time.Second))
|
||||
if first.State != PeerConnectionDegraded {
|
||||
t.Fatalf("first failure state = %q, want degraded", first.State)
|
||||
}
|
||||
_ = tracker.RecordFailure("node-b", "timeout", now.Add(2*time.Second))
|
||||
third := tracker.RecordFailure("node-b", "timeout", now.Add(3*time.Second))
|
||||
if third.State != PeerConnectionBackoff || third.BackoffUntil.IsZero() {
|
||||
t.Fatalf("third failure did not enter backoff: %+v", third)
|
||||
}
|
||||
if tracker.ShouldProbe("node-b", now.Add(4*time.Second)) {
|
||||
t.Fatal("ShouldProbe returned true during backoff")
|
||||
}
|
||||
if !tracker.ShouldProbe("node-b", third.BackoffUntil.Add(time.Millisecond)) {
|
||||
t.Fatal("ShouldProbe returned false after backoff")
|
||||
}
|
||||
recovered := tracker.RecordSuccess("node-b", 12, third.BackoffUntil.Add(time.Second))
|
||||
if recovered.State != PeerConnectionReady || recovered.ConsecutiveFailures != 0 || !recovered.BackoffUntil.IsZero() {
|
||||
t.Fatalf("success did not recover from backoff: %+v", recovered)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerConnectionTrackerSnapshotCountsStates(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
tracker := NewPeerConnectionTracker(PeerCacheSnapshot{
|
||||
Entries: []PeerCacheEntry{
|
||||
{NodeID: "node-a", Warm: true},
|
||||
{NodeID: "node-b", Warm: true},
|
||||
{NodeID: "node-c", Warm: true},
|
||||
},
|
||||
}, now)
|
||||
tracker.RecordSuccess("node-a", 25, now.Add(time.Second))
|
||||
tracker.RecordFailure("node-b", "timeout", now.Add(time.Second))
|
||||
tracker.RecordFailure("node-c", "timeout", now.Add(time.Second))
|
||||
tracker.RecordFailure("node-c", "timeout", now.Add(2*time.Second))
|
||||
tracker.RecordFailure("node-c", "timeout", now.Add(3*time.Second))
|
||||
|
||||
snapshot := tracker.Snapshot()
|
||||
if snapshot.Total != 3 || snapshot.Ready != 1 || snapshot.Degraded != 1 || snapshot.Backoff != 1 {
|
||||
t.Fatalf("unexpected snapshot: %+v", snapshot)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,276 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Modes reported in PeerRecoveryPlan.Mode.
const (
	// PeerRecoveryModeSteady means the ready-peer target is met.
	PeerRecoveryModeSteady = "steady"
	// PeerRecoveryModeRecovery means fewer peers are ready than targeted.
	PeerRecoveryModeRecovery = "recovery"
)
|
||||
|
||||
// Defaults PlanPeerRecovery applies when the matching config field is not
// positive.
const (
	// DefaultStablePeerTarget is the default for TargetReadyPeers.
	DefaultStablePeerTarget = 3
	// DefaultRecoveryProbeLimit is the default for MaxProbeCandidates.
	DefaultRecoveryProbeLimit = 6
)
|
||||
|
||||
type PeerRecoveryPlanConfig struct {
|
||||
PeerCache PeerCacheSnapshot
|
||||
Connections PeerConnectionSnapshot
|
||||
TargetReadyPeers int
|
||||
MaxProbeCandidates int
|
||||
Now time.Time
|
||||
}
|
||||
|
||||
type PeerRecoveryPlan struct {
|
||||
Mode string `json:"mode"`
|
||||
Healthy bool `json:"healthy"`
|
||||
TargetReadyPeers int `json:"target_ready_peers"`
|
||||
ReadyPeerCount int `json:"ready_peer_count"`
|
||||
DegradedPeerCount int `json:"degraded_peer_count"`
|
||||
BackoffPeerCount int `json:"backoff_peer_count"`
|
||||
ConnectablePeerCount int `json:"connectable_peer_count"`
|
||||
Deficit int `json:"deficit"`
|
||||
ProbeCandidateCount int `json:"probe_candidate_count"`
|
||||
RecoverySeedCandidateCount int `json:"recovery_seed_candidate_count"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
Candidates []PeerRecoveryCandidate `json:"candidates,omitempty"`
|
||||
}
|
||||
|
||||
// PeerRecoveryCandidate describes one peer selected for probing by
// PlanPeerRecovery, combining its peer-cache data with its connection state.
//
// NOTE: BackoffUntil carries `omitempty`, but encoding/json does not omit a
// zero time.Time, so the field is always serialized.
type PeerRecoveryCandidate struct {
	NodeID string `json:"node_id"`
	// Endpoint is the cache entry's endpoint with surrounding whitespace removed.
	Endpoint        string `json:"endpoint,omitempty"`
	Warm            bool   `json:"warm"`
	WarmReason      string `json:"warm_reason,omitempty"`
	RecoverySeed    bool   `json:"recovery_seed"`
	BestCandidateID string `json:"best_candidate_id,omitempty"`
	BestTransport   string `json:"best_transport,omitempty"`
	// ConnectionState is the peer's tracked state, or "disconnected" when the
	// peer has no tracked connection.
	ConnectionState     string    `json:"connection_state"`
	ConsecutiveFailures int       `json:"consecutive_failures,omitempty"`
	LastLatencyMs       int       `json:"last_latency_ms,omitempty"`
	BackoffUntil        time.Time `json:"backoff_until,omitempty"`
	// Reason is why the peer was selected, e.g. "maintain_ready" or
	// "recover_seed".
	Reason string `json:"reason"`
	// Priority is the ranking score; higher is probed first.
	Priority int `json:"priority"`
}
|
||||
|
||||
type peerRecoveryCandidateBuild struct {
|
||||
PeerRecoveryCandidate
|
||||
}
|
||||
|
||||
func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
|
||||
now := normalizedNow(cfg.Now)
|
||||
target := cfg.TargetReadyPeers
|
||||
if target <= 0 {
|
||||
target = DefaultStablePeerTarget
|
||||
}
|
||||
limit := cfg.MaxProbeCandidates
|
||||
if limit <= 0 {
|
||||
limit = DefaultRecoveryProbeLimit
|
||||
}
|
||||
connectable := connectablePeerCount(cfg.PeerCache)
|
||||
if target > connectable {
|
||||
target = connectable
|
||||
}
|
||||
if limit < target {
|
||||
limit = target
|
||||
}
|
||||
|
||||
connectionByNode := map[string]PeerConnectionState{}
|
||||
for _, connection := range cfg.Connections.Entries {
|
||||
if strings.TrimSpace(connection.NodeID) == "" {
|
||||
continue
|
||||
}
|
||||
connectionByNode[connection.NodeID] = connection
|
||||
}
|
||||
|
||||
entryByNode := map[string]PeerCacheEntry{}
|
||||
for _, entry := range cfg.PeerCache.Entries {
|
||||
if strings.TrimSpace(entry.NodeID) == "" {
|
||||
continue
|
||||
}
|
||||
entryByNode[entry.NodeID] = entry
|
||||
}
|
||||
|
||||
ready := 0
|
||||
degraded := 0
|
||||
backoff := 0
|
||||
for nodeID, connection := range connectionByNode {
|
||||
entry, ok := entryByNode[nodeID]
|
||||
if !ok || strings.TrimSpace(entry.Endpoint) == "" {
|
||||
continue
|
||||
}
|
||||
switch connection.State {
|
||||
case PeerConnectionReady, PeerConnectionRelayReady:
|
||||
ready++
|
||||
case PeerConnectionDegraded:
|
||||
degraded++
|
||||
case PeerConnectionBackoff:
|
||||
backoff++
|
||||
}
|
||||
}
|
||||
|
||||
deficit := target - ready
|
||||
if deficit < 0 {
|
||||
deficit = 0
|
||||
}
|
||||
mode := PeerRecoveryModeSteady
|
||||
if deficit > 0 {
|
||||
mode = PeerRecoveryModeRecovery
|
||||
}
|
||||
if mode == PeerRecoveryModeSteady {
|
||||
limit = target
|
||||
}
|
||||
|
||||
candidates := make([]peerRecoveryCandidateBuild, 0, len(cfg.PeerCache.Entries))
|
||||
for _, entry := range cfg.PeerCache.Entries {
|
||||
if strings.TrimSpace(entry.NodeID) == "" || strings.TrimSpace(entry.Endpoint) == "" {
|
||||
continue
|
||||
}
|
||||
connection := connectionByNode[entry.NodeID]
|
||||
if connection.State == "" {
|
||||
connection.State = PeerConnectionDisconnected
|
||||
}
|
||||
if connection.State == PeerConnectionBackoff && connection.BackoffUntil.After(now) {
|
||||
continue
|
||||
}
|
||||
reason, ok := peerRecoveryCandidateReason(mode, entry, connection)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
candidate := PeerRecoveryCandidate{
|
||||
NodeID: entry.NodeID,
|
||||
Endpoint: strings.TrimSpace(entry.Endpoint),
|
||||
Warm: entry.Warm,
|
||||
WarmReason: entry.WarmReason,
|
||||
RecoverySeed: entry.RecoverySeed,
|
||||
BestCandidateID: entry.BestCandidateID,
|
||||
BestTransport: entry.BestTransport,
|
||||
ConnectionState: connection.State,
|
||||
ConsecutiveFailures: connection.ConsecutiveFailures,
|
||||
LastLatencyMs: connection.LastLatencyMs,
|
||||
BackoffUntil: connection.BackoffUntil,
|
||||
Reason: reason,
|
||||
Priority: peerRecoveryCandidatePriority(entry, connection, reason),
|
||||
}
|
||||
candidates = append(candidates, peerRecoveryCandidateBuild{PeerRecoveryCandidate: candidate})
|
||||
}
|
||||
sort.SliceStable(candidates, func(i, j int) bool {
|
||||
if candidates[i].Priority != candidates[j].Priority {
|
||||
return candidates[i].Priority > candidates[j].Priority
|
||||
}
|
||||
return candidates[i].NodeID < candidates[j].NodeID
|
||||
})
|
||||
if len(candidates) > limit {
|
||||
candidates = candidates[:limit]
|
||||
}
|
||||
|
||||
outCandidates := make([]PeerRecoveryCandidate, 0, len(candidates))
|
||||
recoverySeedCandidates := 0
|
||||
for _, candidate := range candidates {
|
||||
outCandidates = append(outCandidates, candidate.PeerRecoveryCandidate)
|
||||
if candidate.RecoverySeed {
|
||||
recoverySeedCandidates++
|
||||
}
|
||||
}
|
||||
|
||||
return PeerRecoveryPlan{
|
||||
Mode: mode,
|
||||
Healthy: deficit == 0,
|
||||
TargetReadyPeers: target,
|
||||
ReadyPeerCount: ready,
|
||||
DegradedPeerCount: degraded,
|
||||
BackoffPeerCount: backoff,
|
||||
ConnectablePeerCount: connectable,
|
||||
Deficit: deficit,
|
||||
ProbeCandidateCount: len(outCandidates),
|
||||
RecoverySeedCandidateCount: recoverySeedCandidates,
|
||||
GeneratedAt: now,
|
||||
Candidates: outCandidates,
|
||||
}
|
||||
}
|
||||
|
||||
func peerRecoveryCandidateReason(mode string, entry PeerCacheEntry, connection PeerConnectionState) (string, bool) {
|
||||
if mode == PeerRecoveryModeSteady {
|
||||
if connection.State == PeerConnectionReady || connection.State == PeerConnectionRelayReady {
|
||||
return "maintain_ready", true
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
if connection.State == PeerConnectionReady || connection.State == PeerConnectionRelayReady {
|
||||
return "maintain_ready", true
|
||||
}
|
||||
if connection.State == PeerConnectionDegraded {
|
||||
return "recover_degraded", true
|
||||
}
|
||||
if entry.Warm {
|
||||
return "recover_warm", true
|
||||
}
|
||||
if entry.RecoverySeed {
|
||||
return "recover_seed", true
|
||||
}
|
||||
return "recover_peer", true
|
||||
}
|
||||
|
||||
func peerRecoveryCandidatePriority(entry PeerCacheEntry, connection PeerConnectionState, reason string) int {
|
||||
score := 0
|
||||
if entry.Warm {
|
||||
score += 1000
|
||||
}
|
||||
switch entry.WarmReason {
|
||||
case "route_adjacent":
|
||||
score += 500
|
||||
case "recovery_seed":
|
||||
score += 350
|
||||
case "endpoint_candidate":
|
||||
score += 200
|
||||
case "peer_endpoint":
|
||||
score += 100
|
||||
}
|
||||
if entry.RecoverySeed {
|
||||
score += 250
|
||||
}
|
||||
if entry.BestCandidateID != "" {
|
||||
score += 150
|
||||
}
|
||||
score += entry.BestCandidateScore / 10
|
||||
switch connection.State {
|
||||
case PeerConnectionReady, PeerConnectionRelayReady:
|
||||
score += 600
|
||||
case PeerConnectionDegraded:
|
||||
score += 350
|
||||
case PeerConnectionConnecting:
|
||||
score += 200
|
||||
case PeerConnectionDisconnected:
|
||||
score += 100
|
||||
}
|
||||
switch reason {
|
||||
case "maintain_ready":
|
||||
score += 500
|
||||
case "recover_degraded":
|
||||
score += 300
|
||||
case "recover_seed":
|
||||
score += 250
|
||||
case "recover_warm":
|
||||
score += 150
|
||||
}
|
||||
if connection.LastLatencyMs > 0 {
|
||||
score -= connection.LastLatencyMs / 10
|
||||
}
|
||||
if score < 0 {
|
||||
return 0
|
||||
}
|
||||
return score
|
||||
}
|
||||
|
||||
func connectablePeerCount(snapshot PeerCacheSnapshot) int {
|
||||
count := 0
|
||||
for _, entry := range snapshot.Entries {
|
||||
if strings.TrimSpace(entry.NodeID) == "" || strings.TrimSpace(entry.Endpoint) == "" {
|
||||
continue
|
||||
}
|
||||
count++
|
||||
}
|
||||
return count
|
||||
}
|
||||
@@ -0,0 +1,139 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestPeerRecoveryPlanMaintainsBoundedReadyPeers(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
|
||||
PeerCache: PeerCacheSnapshot{
|
||||
Entries: []PeerCacheEntry{
|
||||
recoveryPlanPeer("node-a", true, false, "route_adjacent"),
|
||||
recoveryPlanPeer("node-b", true, false, "route_adjacent"),
|
||||
recoveryPlanPeer("node-c", true, false, "peer_endpoint"),
|
||||
recoveryPlanPeer("node-d", true, false, "peer_endpoint"),
|
||||
},
|
||||
},
|
||||
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
|
||||
{NodeID: "node-a", State: PeerConnectionReady, LastLatencyMs: 40},
|
||||
{NodeID: "node-b", State: PeerConnectionReady, LastLatencyMs: 20},
|
||||
{NodeID: "node-c", State: PeerConnectionReady, LastLatencyMs: 30},
|
||||
{NodeID: "node-d", State: PeerConnectionReady, LastLatencyMs: 10},
|
||||
}},
|
||||
Now: now,
|
||||
})
|
||||
|
||||
if plan.Mode != PeerRecoveryModeSteady || !plan.Healthy {
|
||||
t.Fatalf("unexpected plan health: %+v", plan)
|
||||
}
|
||||
if plan.TargetReadyPeers != DefaultStablePeerTarget || len(plan.Candidates) != DefaultStablePeerTarget {
|
||||
t.Fatalf("unexpected bounded candidates: %+v", plan)
|
||||
}
|
||||
for _, candidate := range plan.Candidates {
|
||||
if candidate.Reason != "maintain_ready" {
|
||||
t.Fatalf("unexpected candidate reason: %+v", candidate)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerRecoveryPlanAddsRecoverySeedWhenReadyDeficit(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
|
||||
PeerCache: PeerCacheSnapshot{
|
||||
Entries: []PeerCacheEntry{
|
||||
recoveryPlanPeer("node-a", true, false, "route_adjacent"),
|
||||
recoveryPlanPeer("node-b", true, false, "route_adjacent"),
|
||||
recoveryPlanPeer("node-seed", false, true, ""),
|
||||
},
|
||||
},
|
||||
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
|
||||
{NodeID: "node-a", State: PeerConnectionReady, LastLatencyMs: 20},
|
||||
{NodeID: "node-b", State: PeerConnectionBackoff, BackoffUntil: now.Add(time.Minute)},
|
||||
}},
|
||||
Now: now,
|
||||
})
|
||||
|
||||
if plan.Mode != PeerRecoveryModeRecovery || plan.Healthy {
|
||||
t.Fatalf("unexpected recovery mode: %+v", plan)
|
||||
}
|
||||
if plan.Deficit != 2 || plan.RecoverySeedCandidateCount != 1 {
|
||||
t.Fatalf("unexpected deficit/seed count: %+v", plan)
|
||||
}
|
||||
if !recoveryPlanHasCandidate(plan, "node-seed", "recover_seed") {
|
||||
t.Fatalf("recovery seed was not selected: %+v", plan.Candidates)
|
||||
}
|
||||
if recoveryPlanHasCandidate(plan, "node-b", "") {
|
||||
t.Fatalf("active backoff peer should not be selected: %+v", plan.Candidates)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerRecoveryPlanMaintainsRelayReadyPeersInSteadyMode(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
|
||||
PeerCache: PeerCacheSnapshot{
|
||||
Entries: []PeerCacheEntry{
|
||||
{
|
||||
NodeID: "node-c",
|
||||
Endpoint: "http://relay:19001",
|
||||
Warm: true,
|
||||
WarmReason: "rendezvous_lease",
|
||||
RendezvousLeaseID: "lease-1",
|
||||
RelayNodeID: "node-r",
|
||||
RelayEndpoint: "http://relay:19001",
|
||||
RelayControl: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
|
||||
{NodeID: "node-c", State: PeerConnectionRelayReady, LastLatencyMs: 15},
|
||||
}},
|
||||
Now: now,
|
||||
})
|
||||
|
||||
if plan.Mode != PeerRecoveryModeSteady || !plan.Healthy {
|
||||
t.Fatalf("unexpected steady plan: %+v", plan)
|
||||
}
|
||||
if !recoveryPlanHasCandidate(plan, "node-c", "maintain_ready") {
|
||||
t.Fatalf("relay-ready peer was not maintained: %+v", plan.Candidates)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPeerRecoveryPlanCapsTargetByConnectablePeers(t *testing.T) {
|
||||
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
|
||||
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
|
||||
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
|
||||
{NodeID: "node-a", Warm: true, WarmReason: "route_adjacent"},
|
||||
recoveryPlanPeer("node-b", true, false, "route_adjacent"),
|
||||
}},
|
||||
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
|
||||
{NodeID: "node-b", State: PeerConnectionReady},
|
||||
}},
|
||||
Now: now,
|
||||
})
|
||||
|
||||
if plan.TargetReadyPeers != 1 || !plan.Healthy {
|
||||
t.Fatalf("target should be capped by connectable peers: %+v", plan)
|
||||
}
|
||||
}
|
||||
|
||||
func recoveryPlanPeer(nodeID string, warm bool, recoverySeed bool, warmReason string) PeerCacheEntry {
|
||||
return PeerCacheEntry{
|
||||
NodeID: nodeID,
|
||||
Endpoint: "http://" + nodeID + ":19001",
|
||||
Warm: warm,
|
||||
WarmReason: warmReason,
|
||||
RecoverySeed: recoverySeed,
|
||||
}
|
||||
}
|
||||
|
||||
func recoveryPlanHasCandidate(plan PeerRecoveryPlan, nodeID string, reason string) bool {
|
||||
for _, candidate := range plan.Candidates {
|
||||
if candidate.NodeID != nodeID {
|
||||
continue
|
||||
}
|
||||
return reason == "" || candidate.Reason == reason
|
||||
}
|
||||
return false
|
||||
}
|
||||
@@ -0,0 +1,149 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
func ValidateProductionEnvelope(local PeerIdentity, envelope ProductionEnvelope, now time.Time) error {
|
||||
if envelope.FabricProtocolVersion != ProtocolVersion {
|
||||
return fmt.Errorf("%w: unsupported fabric_protocol_version", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
if envelope.MessageID == "" {
|
||||
return fmt.Errorf("%w: message_id is required", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
if envelope.RouteID == "" {
|
||||
return fmt.Errorf("%w: route_id is required", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
if envelope.ClusterID == "" || envelope.ClusterID != local.ClusterID {
|
||||
return ErrClusterMismatch
|
||||
}
|
||||
if envelope.SourceNodeID == "" || envelope.DestinationNodeID == "" {
|
||||
return fmt.Errorf("%w: source_node_id and destination_node_id are required", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
if envelope.CurrentHopNodeID != local.NodeID {
|
||||
return ErrNodeMismatch
|
||||
}
|
||||
if envelope.NextHopNodeID == "" {
|
||||
return fmt.Errorf("%w: next_hop_node_id is required", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
if len(envelope.RoutePath) > 0 {
|
||||
if err := validateProductionRoutePath(local, envelope); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if envelope.ChannelClass != ProductionChannelFabricControl {
|
||||
return ErrUnauthorizedChannel
|
||||
}
|
||||
if envelope.MessageType != ProductionMessageFabricControl {
|
||||
return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
if envelope.TTL <= 0 {
|
||||
return ErrTTLExhausted
|
||||
}
|
||||
if envelope.HopCount < 0 {
|
||||
return fmt.Errorf("%w: hop_count must not be negative", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
if envelope.CreatedAt.IsZero() || envelope.ExpiresAt.IsZero() {
|
||||
return fmt.Errorf("%w: created_at and expires_at are required", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
if envelope.CreatedAt.After(now.UTC().Add(MaxProductionEnvelopeFutureSkew)) {
|
||||
return fmt.Errorf("%w: created_at exceeds allowed future skew", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
if !envelope.ExpiresAt.After(now.UTC()) {
|
||||
return ErrRouteExpired
|
||||
}
|
||||
if envelope.PayloadLength != len(envelope.Payload) {
|
||||
return fmt.Errorf("%w: payload_length mismatch", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
if envelope.PayloadLength > MaxProductionEnvelopePayloadBytes {
|
||||
return fmt.Errorf("%w: payload exceeds fabric-control limit", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
if envelope.PayloadHash == "" {
|
||||
return fmt.Errorf("%w: payload_hash is required", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
sum := sha256.Sum256(envelope.Payload)
|
||||
if envelope.PayloadHash != hex.EncodeToString(sum[:]) {
|
||||
return fmt.Errorf("%w: payload_hash mismatch", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func validateProductionRoutePath(local PeerIdentity, envelope ProductionEnvelope) error {
|
||||
if len(envelope.RoutePath) < 2 {
|
||||
return ErrInvalidRoutePath
|
||||
}
|
||||
if envelope.RoutePath[0] != envelope.SourceNodeID || envelope.RoutePath[len(envelope.RoutePath)-1] != envelope.DestinationNodeID {
|
||||
return ErrInvalidRoutePath
|
||||
}
|
||||
currentIndex := -1
|
||||
seen := map[string]struct{}{}
|
||||
for index, nodeID := range envelope.RoutePath {
|
||||
if nodeID == "" {
|
||||
return ErrInvalidRoutePath
|
||||
}
|
||||
if _, duplicate := seen[nodeID]; duplicate {
|
||||
return ErrLoopDetected
|
||||
}
|
||||
seen[nodeID] = struct{}{}
|
||||
if nodeID == local.NodeID {
|
||||
currentIndex = index
|
||||
}
|
||||
}
|
||||
if currentIndex < 0 || envelope.CurrentHopNodeID != local.NodeID {
|
||||
return ErrNodeMismatch
|
||||
}
|
||||
if containsProductionNodeID(envelope.VisitedNodeIDs, local.NodeID) {
|
||||
return ErrLoopDetected
|
||||
}
|
||||
for _, visitedNodeID := range envelope.VisitedNodeIDs {
|
||||
if visitedNodeID == "" || !containsProductionNodeID(envelope.RoutePath, visitedNodeID) {
|
||||
return ErrInvalidRoutePath
|
||||
}
|
||||
}
|
||||
if envelope.DestinationNodeID == local.NodeID {
|
||||
if envelope.NextHopNodeID != local.NodeID {
|
||||
return ErrInvalidRoutePath
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if currentIndex >= len(envelope.RoutePath)-1 {
|
||||
return ErrInvalidRoutePath
|
||||
}
|
||||
if envelope.NextHopNodeID != envelope.RoutePath[currentIndex+1] {
|
||||
return ErrInvalidRoutePath
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// containsProductionNodeID reports whether needle is an element of values.
// A nil or empty slice never contains anything.
func containsProductionNodeID(values []string, needle string) bool {
	for i := 0; i < len(values); i++ {
		if values[i] == needle {
			return true
		}
	}
	return false
}
|
||||
|
||||
func NewProductionEnvelopeObservation(envelope ProductionEnvelope, observedAt time.Time) ProductionEnvelopeObservation {
|
||||
return ProductionEnvelopeObservation{
|
||||
MessageID: envelope.MessageID,
|
||||
RouteID: envelope.RouteID,
|
||||
ClusterID: envelope.ClusterID,
|
||||
SourceNodeID: envelope.SourceNodeID,
|
||||
DestinationNodeID: envelope.DestinationNodeID,
|
||||
CurrentHopNodeID: envelope.CurrentHopNodeID,
|
||||
NextHopNodeID: envelope.NextHopNodeID,
|
||||
RoutePath: append([]string{}, envelope.RoutePath...),
|
||||
VisitedNodeIDs: append([]string{}, envelope.VisitedNodeIDs...),
|
||||
ChannelClass: envelope.ChannelClass,
|
||||
MessageType: envelope.MessageType,
|
||||
TTL: envelope.TTL,
|
||||
HopCount: envelope.HopCount,
|
||||
PayloadLength: envelope.PayloadLength,
|
||||
PayloadHash: envelope.PayloadHash,
|
||||
ObservedAt: observedAt.UTC(),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,81 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
)
|
||||
|
||||
type ProductionEnvelopeObservationSink struct {
|
||||
mu sync.Mutex
|
||||
capacity int
|
||||
items []ProductionEnvelopeObservation
|
||||
accepted uint64
|
||||
dropped uint64
|
||||
}
|
||||
|
||||
// ProductionEnvelopeObservationSinkMetrics is a point-in-time, JSON-encodable
// summary of a ProductionEnvelopeObservationSink.
type ProductionEnvelopeObservationSinkMetrics struct {
	// Capacity is the sink's configured maximum depth.
	Capacity int `json:"capacity"`
	// CurrentDepth is the number of observations currently retained.
	CurrentDepth int `json:"current_depth"`
	// AcceptedTotal is the number of observations ever handed to the sink.
	AcceptedTotal uint64 `json:"accepted_total"`
	// DroppedOldest is the number of observations evicted because the sink
	// was full.
	DroppedOldest uint64 `json:"dropped_oldest"`
}
|
||||
|
||||
func NewProductionEnvelopeObservationSink(capacity int) *ProductionEnvelopeObservationSink {
|
||||
if capacity < 1 {
|
||||
capacity = 1
|
||||
}
|
||||
return &ProductionEnvelopeObservationSink{
|
||||
capacity: capacity,
|
||||
items: make([]ProductionEnvelopeObservation, 0, capacity),
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ProductionEnvelopeObservationSink) Observe(_ context.Context, observation ProductionEnvelopeObservation) error {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
s.accepted++
|
||||
if len(s.items) == s.capacity {
|
||||
copy(s.items, s.items[1:])
|
||||
s.items[len(s.items)-1] = observation
|
||||
s.dropped++
|
||||
return nil
|
||||
}
|
||||
s.items = append(s.items, observation)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *ProductionEnvelopeObservationSink) Snapshot() []ProductionEnvelopeObservation {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
out := make([]ProductionEnvelopeObservation, len(s.items))
|
||||
copy(out, s.items)
|
||||
return out
|
||||
}
|
||||
|
||||
func (s *ProductionEnvelopeObservationSink) Len() int {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
return len(s.items)
|
||||
}
|
||||
|
||||
func (s *ProductionEnvelopeObservationSink) Capacity() int {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
return s.capacity
|
||||
}
|
||||
|
||||
func (s *ProductionEnvelopeObservationSink) Metrics() ProductionEnvelopeObservationSinkMetrics {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
|
||||
return ProductionEnvelopeObservationSinkMetrics{
|
||||
Capacity: s.capacity,
|
||||
CurrentDepth: len(s.items),
|
||||
AcceptedTotal: s.accepted,
|
||||
DroppedOldest: s.dropped,
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,80 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
func ValidateProductionEnvelopeRouteConfig(local PeerIdentity, envelope ProductionEnvelope, routes []SyntheticRoute, now time.Time) error {
|
||||
if len(routes) == 0 {
|
||||
return nil
|
||||
}
|
||||
route, ok := productionRouteByID(routes, envelope.RouteID)
|
||||
if !ok {
|
||||
return ErrRouteNotFound
|
||||
}
|
||||
if route.ClusterID != envelope.ClusterID || route.ClusterID != local.ClusterID {
|
||||
return ErrClusterMismatch
|
||||
}
|
||||
if route.SourceNodeID != envelope.SourceNodeID || route.DestinationNodeID != envelope.DestinationNodeID {
|
||||
return ErrInvalidRoutePath
|
||||
}
|
||||
if route.ExpiresAt.IsZero() || !route.ExpiresAt.After(now.UTC()) || envelope.ExpiresAt.After(route.ExpiresAt) {
|
||||
return ErrRouteExpired
|
||||
}
|
||||
if !contains(route.AllowedChannels, ProductionChannelFabricControl) {
|
||||
return ErrUnauthorizedChannel
|
||||
}
|
||||
path := routePath(route)
|
||||
if len(path) < 2 || path[0] != route.SourceNodeID || path[len(path)-1] != route.DestinationNodeID {
|
||||
return ErrInvalidRoutePath
|
||||
}
|
||||
if len(envelope.RoutePath) > 0 && !sameNodePath(envelope.RoutePath, path) {
|
||||
return ErrInvalidRoutePath
|
||||
}
|
||||
if len(path) > 2 && len(envelope.RoutePath) == 0 {
|
||||
return ErrInvalidRoutePath
|
||||
}
|
||||
currentIndex := indexOf(path, local.NodeID)
|
||||
if currentIndex < 0 || envelope.CurrentHopNodeID != local.NodeID {
|
||||
return ErrNodeMismatch
|
||||
}
|
||||
expectedNextHop := local.NodeID
|
||||
if local.NodeID != envelope.DestinationNodeID {
|
||||
if currentIndex >= len(path)-1 {
|
||||
return ErrInvalidRoutePath
|
||||
}
|
||||
expectedNextHop = path[currentIndex+1]
|
||||
}
|
||||
if envelope.NextHopNodeID != expectedNextHop {
|
||||
return ErrInvalidRoutePath
|
||||
}
|
||||
if route.MaxTTL > 0 && envelope.TTL > route.MaxTTL {
|
||||
return fmt.Errorf("%w: ttl exceeds configured route max_ttl", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
if route.MaxHops > 0 && envelope.HopCount > route.MaxHops {
|
||||
return fmt.Errorf("%w: hop_count exceeds configured route max_hops", ErrForwardEnvelopeInvalid)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func productionRouteByID(routes []SyntheticRoute, routeID string) (SyntheticRoute, bool) {
|
||||
for _, route := range routes {
|
||||
if route.RouteID == routeID {
|
||||
return route, true
|
||||
}
|
||||
}
|
||||
return SyntheticRoute{}, false
|
||||
}
|
||||
|
||||
// sameNodePath reports whether a and b list the same node IDs in the same
// order. Two empty (or nil) paths are considered equal.
func sameNodePath(a []string, b []string) bool {
	if len(a) != len(b) {
		return false
	}
	for i, node := range b {
		if a[i] != node {
			return false
		}
	}
	return true
}
|
||||
@@ -0,0 +1,43 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type ProductionForwardTransport interface {
|
||||
SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error)
|
||||
}
|
||||
|
||||
// HTTPProductionForwardTransport is a ProductionForwardTransport that reaches
// peers over HTTP using a static node-ID-to-base-URL table.
type HTTPProductionForwardTransport struct {
	// PeerURLs maps a peer node ID to that peer's base URL.
	PeerURLs map[string]string
	// HTTPClient, when non-nil, replaces the HTTP client of the per-request
	// mesh client.
	HTTPClient *http.Client
}
|
||||
|
||||
func NewHTTPProductionForwardTransport(peerURLs map[string]string) *HTTPProductionForwardTransport {
|
||||
normalized := make(map[string]string, len(peerURLs))
|
||||
for nodeID, baseURL := range peerURLs {
|
||||
nodeID = strings.TrimSpace(nodeID)
|
||||
baseURL = strings.TrimRight(strings.TrimSpace(baseURL), "/")
|
||||
if nodeID != "" && baseURL != "" {
|
||||
normalized[nodeID] = baseURL
|
||||
}
|
||||
}
|
||||
return &HTTPProductionForwardTransport{PeerURLs: normalized}
|
||||
}
|
||||
|
||||
func (t *HTTPProductionForwardTransport) SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error) {
|
||||
if t == nil {
|
||||
return ProductionForwardResult{}, ErrForwardPeerUnavailable
|
||||
}
|
||||
baseURL := strings.TrimRight(strings.TrimSpace(t.PeerURLs[nextNodeID]), "/")
|
||||
if baseURL == "" {
|
||||
return ProductionForwardResult{}, ErrForwardPeerUnavailable
|
||||
}
|
||||
client := NewClient(baseURL)
|
||||
if t.HTTPClient != nil {
|
||||
client.HTTPClient = t.HTTPClient
|
||||
}
|
||||
return client.SendProduction(ctx, envelope)
|
||||
}
|
||||
@@ -0,0 +1,241 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type ScopedSyntheticConfig struct {
|
||||
SchemaVersion string `json:"schema_version"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
LocalNodeID string `json:"local_node_id"`
|
||||
ConfigVersion string `json:"config_version,omitempty"`
|
||||
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
|
||||
PolicyVersion string `json:"policy_version,omitempty"`
|
||||
PeerEndpoints map[string]string `json:"peer_endpoints"`
|
||||
PeerEndpointCandidates map[string][]PeerEndpointCandidate `json:"peer_endpoint_candidates,omitempty"`
|
||||
PeerDirectory []PeerDirectoryEntry `json:"peer_directory,omitempty"`
|
||||
RecoverySeeds []PeerRecoverySeed `json:"recovery_seeds,omitempty"`
|
||||
RendezvousLeases []PeerRendezvousLease `json:"rendezvous_leases,omitempty"`
|
||||
Routes []SyntheticRoute `json:"routes"`
|
||||
}
|
||||
|
||||
// PeerDirectoryEntry summarises one remote peer in a scoped config's peer
// directory.
type PeerDirectoryEntry struct {
	// NodeID identifies the peer. Config validation requires it to be
	// non-blank, different from the local node and unique in the directory.
	NodeID string `json:"node_id"`
	// RouteIDs optionally lists route IDs associated with the peer.
	RouteIDs []string `json:"route_ids,omitempty"`
	// EndpointCount and CandidateCount must not be negative.
	EndpointCount  int `json:"endpoint_count"`
	CandidateCount int `json:"candidate_count"`
	// ConnectivityModes optionally lists connectivity mode labels.
	ConnectivityModes []string `json:"connectivity_modes,omitempty"`
	// RecoverySeed flags the peer as a recovery seed.
	RecoverySeed bool `json:"recovery_seed"`
}
|
||||
|
||||
// PeerRecoverySeed describes one recovery endpoint of a peer. Config
// validation requires NodeID, Endpoint and Transport to be non-blank and the
// (NodeID, Endpoint) pair to be unique.
type PeerRecoverySeed struct {
	// NodeID identifies the peer the seed belongs to.
	NodeID string `json:"node_id"`
	// Endpoint is the seed's address.
	Endpoint string `json:"endpoint"`
	// Transport names the transport used to reach Endpoint.
	Transport string `json:"transport"`
	// ConnectivityMode and Region are optional labels.
	ConnectivityMode string `json:"connectivity_mode,omitempty"`
	Region           string `json:"region,omitempty"`
	// Priority is always serialised, even when zero.
	Priority int `json:"priority"`
	// LastVerifiedAt is omitted from JSON when nil.
	LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"`
	// Metadata is carried as raw JSON without being decoded.
	Metadata json.RawMessage `json:"metadata,omitempty"`
}
|
||||
|
||||
// PeerRendezvousLease describes a lease under which a relay node provides a
// rendezvous point for a peer. Config validation requires the identifying
// fields to be non-blank, the peer and relay to differ, ControlPlaneOnly to be
// true, ExpiresAt to be set and in the future, and Metadata (when present) to
// be valid JSON.
type PeerRendezvousLease struct {
	// LeaseID must be unique within a config.
	LeaseID string `json:"lease_id"`
	// PeerNodeID is the peer reached through the relay.
	PeerNodeID string `json:"peer_node_id"`
	// RelayNodeID is the relaying node; it must differ from PeerNodeID.
	RelayNodeID string `json:"relay_node_id"`
	// RelayEndpoint is the relay's address.
	RelayEndpoint string `json:"relay_endpoint"`
	// Transport names the transport used to reach the relay.
	Transport string `json:"transport"`
	// ConnectivityMode is an optional label.
	ConnectivityMode string `json:"connectivity_mode,omitempty"`
	// RouteIDs optionally ties the lease to configured routes; every listed
	// ID must name a known route.
	RouteIDs []string `json:"route_ids,omitempty"`
	// AllowedChannels optionally lists channel names.
	AllowedChannels []string `json:"allowed_channels,omitempty"`
	// Priority is always serialised, even when zero.
	Priority int `json:"priority"`
	// ControlPlaneOnly must be true for the lease to validate.
	ControlPlaneOnly bool `json:"control_plane_only"`
	// IssuedAt is not checked by config validation.
	IssuedAt time.Time `json:"issued_at"`
	// ExpiresAt must be set and in the future.
	ExpiresAt time.Time `json:"expires_at"`
	// Reason is an optional free-text note.
	Reason string `json:"reason,omitempty"`
	// Metadata is carried as raw JSON.
	Metadata json.RawMessage `json:"metadata,omitempty"`
}
|
||||
|
||||
// PeerEndpointCandidate describes one candidate endpoint through which a peer
// may be reached. Config validation requires EndpointID, NodeID, Transport,
// Address, Reachability and ConnectivityMode to be non-blank, and NodeID to
// equal the key the candidate is filed under.
type PeerEndpointCandidate struct {
	// EndpointID identifies the candidate.
	EndpointID string `json:"endpoint_id"`
	// NodeID is the peer that owns the endpoint.
	NodeID string `json:"node_id"`
	// Transport names the transport used to reach Address.
	Transport string `json:"transport"`
	// Address is the candidate's network address.
	Address string `json:"address"`
	// AddressFamily is an optional label.
	AddressFamily string `json:"address_family,omitempty"`
	// Reachability is a required label.
	Reachability string `json:"reachability"`
	// NATType is an optional label.
	NATType string `json:"nat_type,omitempty"`
	// ConnectivityMode is a required label.
	ConnectivityMode string `json:"connectivity_mode"`
	// Region is an optional label.
	Region string `json:"region,omitempty"`
	// Priority is always serialised, even when zero.
	Priority int `json:"priority"`
	// PolicyTags optionally lists policy tag names.
	PolicyTags []string `json:"policy_tags,omitempty"`
	// LastVerifiedAt is omitted from JSON when nil.
	LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"`
	// Metadata is carried as raw JSON without being decoded.
	Metadata json.RawMessage `json:"metadata,omitempty"`
}
|
||||
|
||||
func LoadScopedSyntheticConfig(path string, local PeerIdentity) (ScopedSyntheticConfig, error) {
|
||||
payload, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return ScopedSyntheticConfig{}, err
|
||||
}
|
||||
var cfg ScopedSyntheticConfig
|
||||
if err := json.Unmarshal(payload, &cfg); err != nil {
|
||||
return ScopedSyntheticConfig{}, fmt.Errorf("parse scoped synthetic mesh config: %w", err)
|
||||
}
|
||||
if err := cfg.Validate(local); err != nil {
|
||||
return ScopedSyntheticConfig{}, err
|
||||
}
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
func (cfg ScopedSyntheticConfig) Validate(local PeerIdentity) error {
|
||||
if cfg.SchemaVersion == "" {
|
||||
return fmt.Errorf("scoped synthetic mesh config schema_version is required")
|
||||
}
|
||||
if cfg.ClusterID == "" || cfg.ClusterID != local.ClusterID {
|
||||
return ErrClusterMismatch
|
||||
}
|
||||
if cfg.LocalNodeID == "" || cfg.LocalNodeID != local.NodeID {
|
||||
return ErrNodeMismatch
|
||||
}
|
||||
for nodeID, endpoint := range cfg.PeerEndpoints {
|
||||
if strings.TrimSpace(nodeID) == "" || strings.TrimSpace(endpoint) == "" {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains empty peer endpoint")
|
||||
}
|
||||
}
|
||||
for nodeID, candidates := range cfg.PeerEndpointCandidates {
|
||||
if strings.TrimSpace(nodeID) == "" {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains empty peer endpoint candidate node")
|
||||
}
|
||||
for _, candidate := range candidates {
|
||||
if strings.TrimSpace(candidate.EndpointID) == "" ||
|
||||
strings.TrimSpace(candidate.NodeID) == "" ||
|
||||
candidate.NodeID != nodeID ||
|
||||
strings.TrimSpace(candidate.Transport) == "" ||
|
||||
strings.TrimSpace(candidate.Address) == "" ||
|
||||
strings.TrimSpace(candidate.Reachability) == "" ||
|
||||
strings.TrimSpace(candidate.ConnectivityMode) == "" {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains invalid peer endpoint candidate")
|
||||
}
|
||||
}
|
||||
}
|
||||
if err := validatePeerDirectory(cfg.PeerDirectory, cfg.LocalNodeID); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := validateRecoverySeeds(cfg.RecoverySeeds); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := validateRendezvousLeases(cfg.RendezvousLeases, cfg.Routes, cfg.LocalNodeID); err != nil {
|
||||
return err
|
||||
}
|
||||
for _, route := range cfg.Routes {
|
||||
if route.ClusterID != cfg.ClusterID {
|
||||
return ErrClusterMismatch
|
||||
}
|
||||
path := routePath(route)
|
||||
if len(path) < 2 {
|
||||
return ErrInvalidRoutePath
|
||||
}
|
||||
if !contains(path, cfg.LocalNodeID) {
|
||||
return ErrNodeMismatch
|
||||
}
|
||||
if route.ExpiresAt.IsZero() {
|
||||
return fmt.Errorf("scoped synthetic route %q expires_at is required", route.RouteID)
|
||||
}
|
||||
if !route.ExpiresAt.After(time.Now().UTC()) {
|
||||
return ErrRouteExpired
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func validatePeerDirectory(entries []PeerDirectoryEntry, localNodeID string) error {
|
||||
seen := map[string]struct{}{}
|
||||
for _, entry := range entries {
|
||||
nodeID := strings.TrimSpace(entry.NodeID)
|
||||
if nodeID == "" || nodeID == localNodeID {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains invalid peer directory entry")
|
||||
}
|
||||
if _, duplicate := seen[nodeID]; duplicate {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains duplicate peer directory entry")
|
||||
}
|
||||
seen[nodeID] = struct{}{}
|
||||
if entry.EndpointCount < 0 || entry.CandidateCount < 0 {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains invalid peer directory count")
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func validateRecoverySeeds(seeds []PeerRecoverySeed) error {
|
||||
if len(seeds) > 20 {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains too many recovery seeds")
|
||||
}
|
||||
seen := map[string]struct{}{}
|
||||
for _, seed := range seeds {
|
||||
key := strings.TrimSpace(seed.NodeID) + "\x00" + strings.TrimSpace(seed.Endpoint)
|
||||
if strings.TrimSpace(seed.NodeID) == "" ||
|
||||
strings.TrimSpace(seed.Endpoint) == "" ||
|
||||
strings.TrimSpace(seed.Transport) == "" {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains invalid recovery seed")
|
||||
}
|
||||
if _, duplicate := seen[key]; duplicate {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains duplicate recovery seed")
|
||||
}
|
||||
seen[key] = struct{}{}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func validateRendezvousLeases(leases []PeerRendezvousLease, routes []SyntheticRoute, localNodeID string) error {
|
||||
if len(leases) > 20 {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains too many rendezvous leases")
|
||||
}
|
||||
routesByID := map[string]SyntheticRoute{}
|
||||
for _, route := range routes {
|
||||
if strings.TrimSpace(route.RouteID) != "" {
|
||||
routesByID[route.RouteID] = route
|
||||
}
|
||||
}
|
||||
seen := map[string]struct{}{}
|
||||
now := time.Now().UTC()
|
||||
for _, lease := range leases {
|
||||
if strings.TrimSpace(lease.LeaseID) == "" ||
|
||||
strings.TrimSpace(lease.PeerNodeID) == "" ||
|
||||
strings.TrimSpace(lease.RelayNodeID) == "" ||
|
||||
strings.TrimSpace(lease.RelayEndpoint) == "" ||
|
||||
strings.TrimSpace(lease.Transport) == "" ||
|
||||
lease.PeerNodeID == lease.RelayNodeID ||
|
||||
!lease.ControlPlaneOnly ||
|
||||
lease.ExpiresAt.IsZero() ||
|
||||
!lease.ExpiresAt.After(now) ||
|
||||
(len(lease.Metadata) > 0 && !json.Valid(lease.Metadata)) {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains invalid rendezvous lease")
|
||||
}
|
||||
if _, duplicate := seen[lease.LeaseID]; duplicate {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains duplicate rendezvous lease")
|
||||
}
|
||||
seen[lease.LeaseID] = struct{}{}
|
||||
if len(lease.RouteIDs) == 0 {
|
||||
continue
|
||||
}
|
||||
visible := false
|
||||
for _, routeID := range lease.RouteIDs {
|
||||
route, ok := routesByID[routeID]
|
||||
if !ok {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains rendezvous lease for unknown route")
|
||||
}
|
||||
path := routePath(route)
|
||||
if contains(path, localNodeID) && contains(path, lease.PeerNodeID) && contains(path, lease.RelayNodeID) {
|
||||
visible = true
|
||||
}
|
||||
}
|
||||
if !visible {
|
||||
return fmt.Errorf("scoped synthetic mesh config contains out-of-scope rendezvous lease")
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,235 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestLoadScopedSyntheticConfig(t *testing.T) {
|
||||
expiresAt := time.Now().UTC().Add(time.Hour)
|
||||
path := writeScopedConfig(t, ScopedSyntheticConfig{
|
||||
SchemaVersion: "c17f.synthetic.v1",
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
ConfigVersion: "config-v1",
|
||||
PeerDirectoryVersion: "peers-v1",
|
||||
PolicyVersion: "policy-v1",
|
||||
PeerEndpoints: map[string]string{"node-b": "http://127.0.0.1:19002"},
|
||||
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
|
||||
"node-b": {
|
||||
{
|
||||
EndpointID: "node-b-public",
|
||||
NodeID: "node-b",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "203.0.113.20:443",
|
||||
Reachability: "public",
|
||||
NATType: "restricted",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
PeerDirectory: []PeerDirectoryEntry{
|
||||
{
|
||||
NodeID: "node-b",
|
||||
RouteIDs: []string{"route-a-b"},
|
||||
EndpointCount: 1,
|
||||
CandidateCount: 1,
|
||||
ConnectivityModes: []string{"direct"},
|
||||
RecoverySeed: true,
|
||||
},
|
||||
},
|
||||
RecoverySeeds: []PeerRecoverySeed{
|
||||
{
|
||||
NodeID: "node-b",
|
||||
Endpoint: "https://node-b.example.test:443",
|
||||
Transport: "direct_tcp_tls",
|
||||
ConnectivityMode: "direct",
|
||||
Priority: 10,
|
||||
},
|
||||
},
|
||||
RendezvousLeases: []PeerRendezvousLease{
|
||||
{
|
||||
LeaseID: "lease-node-b-via-node-r",
|
||||
PeerNodeID: "node-b",
|
||||
RelayNodeID: "node-r",
|
||||
RelayEndpoint: "http://node-r:19000",
|
||||
Transport: "relay_control",
|
||||
ConnectivityMode: "relay_required",
|
||||
RouteIDs: []string{"route-a-b"},
|
||||
AllowedChannels: []string{"fabric_control", "route_control"},
|
||||
Priority: 10,
|
||||
ControlPlaneOnly: true,
|
||||
IssuedAt: expiresAt.Add(-time.Minute),
|
||||
ExpiresAt: expiresAt,
|
||||
},
|
||||
},
|
||||
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-r", "node-b"})},
|
||||
})
|
||||
|
||||
cfg, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
|
||||
if err != nil {
|
||||
t.Fatalf("load scoped config: %v", err)
|
||||
}
|
||||
if cfg.ConfigVersion != "config-v1" || cfg.PeerEndpoints["node-b"] == "" || len(cfg.Routes) != 1 {
|
||||
t.Fatalf("unexpected config: %+v", cfg)
|
||||
}
|
||||
if got := cfg.PeerEndpointCandidates["node-b"]; len(got) != 1 || got[0].EndpointID != "node-b-public" {
|
||||
t.Fatalf("unexpected endpoint candidates: %+v", cfg.PeerEndpointCandidates)
|
||||
}
|
||||
if len(cfg.PeerDirectory) != 1 || cfg.PeerDirectory[0].NodeID != "node-b" || !cfg.PeerDirectory[0].RecoverySeed {
|
||||
t.Fatalf("unexpected peer directory: %+v", cfg.PeerDirectory)
|
||||
}
|
||||
if len(cfg.RecoverySeeds) != 1 || cfg.RecoverySeeds[0].NodeID != "node-b" {
|
||||
t.Fatalf("unexpected recovery seeds: %+v", cfg.RecoverySeeds)
|
||||
}
|
||||
if len(cfg.RendezvousLeases) != 1 || cfg.RendezvousLeases[0].RelayNodeID != "node-r" {
|
||||
t.Fatalf("unexpected rendezvous leases: %+v", cfg.RendezvousLeases)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadScopedSyntheticConfigRejectsWrongCluster(t *testing.T) {
|
||||
path := writeScopedConfig(t, ScopedSyntheticConfig{
|
||||
SchemaVersion: "c17f.synthetic.v1",
|
||||
ClusterID: "cluster-2",
|
||||
LocalNodeID: "node-a",
|
||||
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
|
||||
})
|
||||
|
||||
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
|
||||
if !errors.Is(err, ErrClusterMismatch) {
|
||||
t.Fatalf("err = %v, want ErrClusterMismatch", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadScopedSyntheticConfigRejectsWrongNode(t *testing.T) {
|
||||
path := writeScopedConfig(t, ScopedSyntheticConfig{
|
||||
SchemaVersion: "c17f.synthetic.v1",
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-x",
|
||||
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
|
||||
})
|
||||
|
||||
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
|
||||
if !errors.Is(err, ErrNodeMismatch) {
|
||||
t.Fatalf("err = %v, want ErrNodeMismatch", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadScopedSyntheticConfigRejectsExpiredRoute(t *testing.T) {
|
||||
route := liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})
|
||||
route.ExpiresAt = time.Now().UTC().Add(-time.Minute)
|
||||
path := writeScopedConfig(t, ScopedSyntheticConfig{
|
||||
SchemaVersion: "c17f.synthetic.v1",
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
Routes: []SyntheticRoute{route},
|
||||
})
|
||||
|
||||
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
|
||||
if !errors.Is(err, ErrRouteExpired) {
|
||||
t.Fatalf("err = %v, want ErrRouteExpired", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadScopedSyntheticConfigRejectsInvalidPeerEndpointCandidate(t *testing.T) {
|
||||
path := writeScopedConfig(t, ScopedSyntheticConfig{
|
||||
SchemaVersion: "c17f.synthetic.v1",
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
|
||||
"node-b": {
|
||||
{
|
||||
EndpointID: "node-b-public",
|
||||
NodeID: "node-c",
|
||||
Transport: "direct_tcp_tls",
|
||||
Address: "203.0.113.20:443",
|
||||
Reachability: "public",
|
||||
ConnectivityMode: "direct",
|
||||
},
|
||||
},
|
||||
},
|
||||
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
|
||||
})
|
||||
|
||||
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
|
||||
if err == nil {
|
||||
t.Fatal("expected invalid peer endpoint candidate error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadScopedSyntheticConfigRejectsInvalidPeerDirectory(t *testing.T) {
|
||||
path := writeScopedConfig(t, ScopedSyntheticConfig{
|
||||
SchemaVersion: "c17f.synthetic.v1",
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
PeerDirectory: []PeerDirectoryEntry{
|
||||
{NodeID: "node-a"},
|
||||
},
|
||||
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
|
||||
})
|
||||
|
||||
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
|
||||
if err == nil {
|
||||
t.Fatal("expected invalid peer directory error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadScopedSyntheticConfigRejectsInvalidRecoverySeed(t *testing.T) {
|
||||
path := writeScopedConfig(t, ScopedSyntheticConfig{
|
||||
SchemaVersion: "c17f.synthetic.v1",
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
RecoverySeeds: []PeerRecoverySeed{
|
||||
{NodeID: "node-b", Endpoint: "", Transport: "direct_tcp_tls"},
|
||||
},
|
||||
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
|
||||
})
|
||||
|
||||
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
|
||||
if err == nil {
|
||||
t.Fatal("expected invalid recovery seed error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadScopedSyntheticConfigRejectsInvalidRendezvousLease(t *testing.T) {
|
||||
path := writeScopedConfig(t, ScopedSyntheticConfig{
|
||||
SchemaVersion: "c17z12.synthetic.v1",
|
||||
ClusterID: "cluster-1",
|
||||
LocalNodeID: "node-a",
|
||||
RendezvousLeases: []PeerRendezvousLease{
|
||||
{
|
||||
LeaseID: "lease-node-b-via-node-r",
|
||||
PeerNodeID: "node-b",
|
||||
RelayNodeID: "node-r",
|
||||
RelayEndpoint: "http://node-r:19000",
|
||||
Transport: "relay_control",
|
||||
RouteIDs: []string{"route-a-b"},
|
||||
ExpiresAt: time.Now().UTC().Add(time.Hour),
|
||||
},
|
||||
},
|
||||
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-r", "node-b"})},
|
||||
})
|
||||
|
||||
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
|
||||
if err == nil {
|
||||
t.Fatal("expected invalid rendezvous lease error")
|
||||
}
|
||||
}
|
||||
|
||||
func writeScopedConfig(t *testing.T, cfg ScopedSyntheticConfig) string {
|
||||
t.Helper()
|
||||
payload, err := json.Marshal(cfg)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal config: %v", err)
|
||||
}
|
||||
path := filepath.Join(t.TempDir(), "mesh-config.json")
|
||||
if err := os.WriteFile(path, payload, 0o600); err != nil {
|
||||
t.Fatalf("write config: %v", err)
|
||||
}
|
||||
return path
|
||||
}
|
||||
@@ -0,0 +1,291 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"time"
|
||||
)
|
||||
|
||||
type ProductionEnvelopeObserver func(context.Context, ProductionEnvelopeObservation) error
|
||||
type ProductionForwardLogger func(ProductionForwardLogEntry)
|
||||
|
||||
type Server struct {
|
||||
Local PeerIdentity
|
||||
SyntheticRuntime *SyntheticRuntime
|
||||
ProductionForwardingEnabled bool
|
||||
ProductionEnvelopeObserver ProductionEnvelopeObserver
|
||||
ProductionForwardTransport ProductionForwardTransport
|
||||
ProductionForwardLogger ProductionForwardLogger
|
||||
ProductionRoutes []SyntheticRoute
|
||||
}
|
||||
|
||||
func (s Server) Handler() http.Handler {
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/mesh/v1/health", s.handleHealth)
|
||||
mux.HandleFunc("/mesh/v1/forward", s.handleForward)
|
||||
mux.HandleFunc("/mesh/v1/synthetic/probe", s.handleSyntheticProbe)
|
||||
return mux
|
||||
}
|
||||
|
||||
func (s Server) handleHealth(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
w.WriteHeader(http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
var message HealthMessage
|
||||
if err := json.NewDecoder(r.Body).Decode(&message); err != nil {
|
||||
http.Error(w, "invalid health message", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
if message.ProtocolVersion != ProtocolVersion {
|
||||
http.Error(w, "unsupported mesh protocol version", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
if err := ValidatePeer(s.Local, message.From); err != nil {
|
||||
http.Error(w, err.Error(), http.StatusForbidden)
|
||||
return
|
||||
}
|
||||
if message.To.NodeID != "" && message.To.NodeID != s.Local.NodeID {
|
||||
http.Error(w, ErrNodeMismatch.Error(), http.StatusForbidden)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(HealthAck{
|
||||
ProtocolVersion: ProtocolVersion,
|
||||
Accepted: true,
|
||||
By: s.Local,
|
||||
})
|
||||
}
|
||||
|
||||
func (s Server) handleForward(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
w.WriteHeader(http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
if !s.ProductionForwardingEnabled {
|
||||
s.logProductionForward(ProductionForwardLogEntry{
|
||||
Event: "production_forward_rejected",
|
||||
ClusterID: s.Local.ClusterID,
|
||||
LocalNodeID: s.Local.NodeID,
|
||||
Reason: ErrForwardDisabled.Error(),
|
||||
StatusCode: http.StatusNotImplemented,
|
||||
OccurredAt: time.Now().UTC(),
|
||||
})
|
||||
http.Error(w, ErrForwardDisabled.Error(), http.StatusNotImplemented)
|
||||
return
|
||||
}
|
||||
var envelope ProductionEnvelope
|
||||
if err := json.NewDecoder(r.Body).Decode(&envelope); err != nil {
|
||||
s.logProductionForward(ProductionForwardLogEntry{
|
||||
Event: "production_forward_rejected",
|
||||
ClusterID: s.Local.ClusterID,
|
||||
LocalNodeID: s.Local.NodeID,
|
||||
Reason: "invalid production mesh envelope",
|
||||
StatusCode: http.StatusBadRequest,
|
||||
OccurredAt: time.Now().UTC(),
|
||||
})
|
||||
http.Error(w, "invalid production mesh envelope", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
if err := ValidateProductionEnvelope(s.Local, envelope, time.Now().UTC()); err != nil {
|
||||
s.rejectProductionForward(w, envelope, err, forwardStatusCode(err))
|
||||
return
|
||||
}
|
||||
if err := ValidateProductionEnvelopeRouteConfig(s.Local, envelope, s.ProductionRoutes, time.Now().UTC()); err != nil {
|
||||
s.rejectProductionForward(w, envelope, err, forwardStatusCode(err))
|
||||
return
|
||||
}
|
||||
s.logProductionForward(productionForwardLogEntry("production_forward_accepted", s.Local, envelope, "", 0))
|
||||
if s.ProductionEnvelopeObserver != nil {
|
||||
observation := NewProductionEnvelopeObservation(envelope, time.Now().UTC())
|
||||
if err := observeProductionEnvelope(r.Context(), s.ProductionEnvelopeObserver, observation); err != nil {
|
||||
s.logProductionForward(productionForwardLogEntry("production_forward_rejected", s.Local, envelope, ErrForwardObservationFailed.Error(), http.StatusInternalServerError))
|
||||
http.Error(w, ErrForwardObservationFailed.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
}
|
||||
if envelope.DestinationNodeID == s.Local.NodeID {
|
||||
s.logProductionForward(productionForwardLogEntry("production_forward_delivered", s.Local, envelope, "", http.StatusOK))
|
||||
writeProductionForwardResult(w, ProductionForwardResult{
|
||||
Accepted: true,
|
||||
Delivered: true,
|
||||
By: s.Local,
|
||||
MessageID: envelope.MessageID,
|
||||
RouteID: envelope.RouteID,
|
||||
})
|
||||
return
|
||||
}
|
||||
if envelope.NextHopNodeID == s.Local.NodeID {
|
||||
s.rejectProductionForward(w, envelope, ErrLoopDetected, forwardStatusCode(ErrLoopDetected))
|
||||
return
|
||||
}
|
||||
if len(envelope.RoutePath) == 0 && envelope.NextHopNodeID != envelope.DestinationNodeID {
|
||||
s.rejectProductionForward(w, envelope, ErrForwardRuntimeUnavailable, http.StatusNotImplemented)
|
||||
return
|
||||
}
|
||||
if s.ProductionForwardTransport == nil {
|
||||
s.rejectProductionForward(w, envelope, ErrForwardRuntimeUnavailable, http.StatusNotImplemented)
|
||||
return
|
||||
}
|
||||
if envelope.TTL <= 1 {
|
||||
s.rejectProductionForward(w, envelope, ErrTTLExhausted, forwardStatusCode(ErrTTLExhausted))
|
||||
return
|
||||
}
|
||||
forwarded := envelope
|
||||
forwarded.CurrentHopNodeID = envelope.NextHopNodeID
|
||||
forwarded.NextHopNodeID = nextProductionHopAfter(envelope.RoutePath, envelope.NextHopNodeID, envelope.DestinationNodeID)
|
||||
forwarded.TTL = envelope.TTL - 1
|
||||
forwarded.HopCount = envelope.HopCount + 1
|
||||
forwarded.VisitedNodeIDs = append(append([]string{}, envelope.VisitedNodeIDs...), s.Local.NodeID)
|
||||
result, err := s.ProductionForwardTransport.SendProduction(r.Context(), envelope.NextHopNodeID, forwarded)
|
||||
if err != nil {
|
||||
s.rejectProductionForward(w, envelope, err, forwardStatusCode(err))
|
||||
return
|
||||
}
|
||||
s.logProductionForward(productionForwardLogEntry("production_forward_forwarded", s.Local, envelope, "", http.StatusOK))
|
||||
result.Accepted = true
|
||||
result.Forwarded = true
|
||||
result.By = s.Local
|
||||
result.MessageID = envelope.MessageID
|
||||
result.RouteID = envelope.RouteID
|
||||
result.NextNodeID = envelope.NextHopNodeID
|
||||
writeProductionForwardResult(w, result)
|
||||
}
|
||||
|
||||
func (s Server) rejectProductionForward(w http.ResponseWriter, envelope ProductionEnvelope, err error, statusCode int) {
|
||||
s.logProductionForward(productionForwardLogEntry("production_forward_rejected", s.Local, envelope, err.Error(), statusCode))
|
||||
http.Error(w, err.Error(), statusCode)
|
||||
}
|
||||
|
||||
func (s Server) logProductionForward(entry ProductionForwardLogEntry) {
|
||||
if s.ProductionForwardLogger == nil {
|
||||
return
|
||||
}
|
||||
if entry.OccurredAt.IsZero() {
|
||||
entry.OccurredAt = time.Now().UTC()
|
||||
}
|
||||
s.ProductionForwardLogger(entry)
|
||||
}
|
||||
|
||||
func productionForwardLogEntry(event string, local PeerIdentity, envelope ProductionEnvelope, reason string, statusCode int) ProductionForwardLogEntry {
|
||||
return ProductionForwardLogEntry{
|
||||
Event: event,
|
||||
RouteID: envelope.RouteID,
|
||||
MessageID: envelope.MessageID,
|
||||
ClusterID: envelope.ClusterID,
|
||||
LocalNodeID: local.NodeID,
|
||||
SourceNodeID: envelope.SourceNodeID,
|
||||
DestinationNodeID: envelope.DestinationNodeID,
|
||||
CurrentHopNodeID: envelope.CurrentHopNodeID,
|
||||
NextHopNodeID: envelope.NextHopNodeID,
|
||||
ChannelClass: envelope.ChannelClass,
|
||||
MessageType: envelope.MessageType,
|
||||
Reason: reason,
|
||||
StatusCode: statusCode,
|
||||
TTL: envelope.TTL,
|
||||
HopCount: envelope.HopCount,
|
||||
RoutePathLength: len(envelope.RoutePath),
|
||||
VisitedCount: len(envelope.VisitedNodeIDs),
|
||||
PayloadLength: envelope.PayloadLength,
|
||||
OccurredAt: time.Now().UTC(),
|
||||
}
|
||||
}
|
||||
|
||||
// nextProductionHopAfter returns the node that follows currentNodeID in
// routePath.
//
// The first occurrence of currentNodeID decides the answer: if it is the
// last element of the path, currentNodeID itself is returned; otherwise the
// element right after it. When the path is empty or does not contain
// currentNodeID, destinationNodeID is returned.
func nextProductionHopAfter(routePath []string, currentNodeID string, destinationNodeID string) string {
	last := len(routePath) - 1
	for position := 0; position <= last; position++ {
		if routePath[position] != currentNodeID {
			continue
		}
		if position == last {
			return currentNodeID
		}
		return routePath[position+1]
	}
	return destinationNodeID
}
|
||||
|
||||
func writeProductionForwardResult(w http.ResponseWriter, result ProductionForwardResult) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(result)
|
||||
}
|
||||
|
||||
func observeProductionEnvelope(ctx context.Context, observer ProductionEnvelopeObserver, observation ProductionEnvelopeObservation) (err error) {
|
||||
if observer == nil {
|
||||
return nil
|
||||
}
|
||||
defer func() {
|
||||
if recover() != nil {
|
||||
err = ErrForwardObservationFailed
|
||||
}
|
||||
}()
|
||||
return observer(ctx, observation)
|
||||
}
|
||||
|
||||
func (s Server) handleSyntheticProbe(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
w.WriteHeader(http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
if s.SyntheticRuntime == nil {
|
||||
http.Error(w, ErrMeshRuntimeDisabled.Error(), http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
var envelope SyntheticEnvelope
|
||||
if err := json.NewDecoder(r.Body).Decode(&envelope); err != nil {
|
||||
http.Error(w, "invalid synthetic mesh envelope", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
ack, err := s.SyntheticRuntime.Receive(r.Context(), envelope)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), syntheticStatusCode(err))
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(ack)
|
||||
}
|
||||
|
||||
func NewHealthMessage(from, to PeerIdentity) HealthMessage {
|
||||
status := "reachable"
|
||||
return HealthMessage{
|
||||
ProtocolVersion: ProtocolVersion,
|
||||
From: from,
|
||||
To: to,
|
||||
ObservedAt: time.Now().UTC(),
|
||||
LinkStatus: status,
|
||||
}
|
||||
}
|
||||
|
||||
func syntheticStatusCode(err error) int {
|
||||
switch err {
|
||||
case ErrClusterMismatch, ErrNodeMismatch, ErrUnauthorizedChannel, ErrLoopDetected:
|
||||
return http.StatusForbidden
|
||||
case ErrMeshRuntimeDisabled:
|
||||
return http.StatusServiceUnavailable
|
||||
case ErrRouteExpired, ErrTTLExhausted, ErrInvalidRoutePath, ErrUnsupportedSyntheticMessage, ErrRouteIDRequired:
|
||||
return http.StatusBadRequest
|
||||
case ErrRouteNotFound, ErrSyntheticPeerUnavailable:
|
||||
return http.StatusNotFound
|
||||
default:
|
||||
return http.StatusBadRequest
|
||||
}
|
||||
}
|
||||
|
||||
func forwardStatusCode(err error) int {
|
||||
switch err {
|
||||
case ErrClusterMismatch, ErrNodeMismatch, ErrUnauthorizedChannel, ErrLoopDetected:
|
||||
return http.StatusForbidden
|
||||
case ErrRouteExpired, ErrTTLExhausted, ErrInvalidRoutePath, ErrRouteIDRequired:
|
||||
return http.StatusBadRequest
|
||||
case ErrForwardRuntimeUnavailable:
|
||||
return http.StatusNotImplemented
|
||||
case ErrRouteNotFound:
|
||||
return http.StatusNotFound
|
||||
case ErrForwardPeerUnavailable:
|
||||
return http.StatusBadGateway
|
||||
default:
|
||||
return http.StatusBadRequest
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,802 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestMeshHealthAcceptsSameCluster(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
server := httptest.NewServer(Server{Local: local}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
client := NewClient(server.URL)
|
||||
ack, err := client.SendHealth(context.Background(), NewHealthMessage(
|
||||
PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
local,
|
||||
))
|
||||
if err != nil {
|
||||
t.Fatalf("send health: %v", err)
|
||||
}
|
||||
if !ack.Accepted || ack.By.NodeID != "node-b" {
|
||||
t.Fatalf("unexpected ack: %+v", ack)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshHealthRejectsClusterMismatch(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
server := httptest.NewServer(Server{Local: local}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
message := NewHealthMessage(PeerIdentity{ClusterID: "cluster-2", NodeID: "node-a"}, local)
|
||||
payload, err := json.Marshal(message)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal message: %v", err)
|
||||
}
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/health", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post health: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusForbidden {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusForbidden)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingDisabled(t *testing.T) {
|
||||
server := httptest.NewServer(Server{Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/octet-stream", bytes.NewReader([]byte("payload")))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusNotImplemented {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusNotImplemented)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingGateEnabledStillHasNoProductionRuntime(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
server := httptest.NewServer(Server{
|
||||
Local: local,
|
||||
ProductionForwardingEnabled: true,
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
payload, err := json.Marshal(validProductionEnvelope(local))
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusNotImplemented {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusNotImplemented)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingGateDeliversFabricControlAtDestination(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}
|
||||
var events []ProductionForwardLogEntry
|
||||
server := httptest.NewServer(Server{
|
||||
Local: local,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionForwardLogger: func(entry ProductionForwardLogEntry) {
|
||||
events = append(events, entry)
|
||||
},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
envelope := validProductionEnvelope(local)
|
||||
envelope.SourceNodeID = "node-a"
|
||||
envelope.DestinationNodeID = local.NodeID
|
||||
envelope.CurrentHopNodeID = local.NodeID
|
||||
envelope.NextHopNodeID = local.NodeID
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusOK)
|
||||
}
|
||||
var result ProductionForwardResult
|
||||
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
||||
t.Fatalf("decode result: %v", err)
|
||||
}
|
||||
if !result.Accepted || !result.Delivered || result.Forwarded || result.By.NodeID != local.NodeID {
|
||||
t.Fatalf("unexpected result: %+v", result)
|
||||
}
|
||||
if !hasProductionForwardEvent(events, "production_forward_accepted") || !hasProductionForwardEvent(events, "production_forward_delivered") {
|
||||
t.Fatalf("missing production forward events: %+v", events)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingGateForwardsDirectFabricControlToNextHop(t *testing.T) {
|
||||
nodeC := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}
|
||||
var deliveredObservation ProductionEnvelopeObservation
|
||||
serverC := httptest.NewServer(Server{
|
||||
Local: nodeC,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error {
|
||||
deliveredObservation = observation
|
||||
return nil
|
||||
},
|
||||
}.Handler())
|
||||
defer serverC.Close()
|
||||
|
||||
nodeB := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
serverB := httptest.NewServer(Server{
|
||||
Local: nodeB,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
|
||||
nodeC.NodeID: serverC.URL,
|
||||
}),
|
||||
}.Handler())
|
||||
defer serverB.Close()
|
||||
|
||||
envelope := validProductionEnvelope(nodeB)
|
||||
envelope.SourceNodeID = "node-a"
|
||||
envelope.DestinationNodeID = nodeC.NodeID
|
||||
envelope.CurrentHopNodeID = nodeB.NodeID
|
||||
envelope.NextHopNodeID = nodeC.NodeID
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
resp, err := http.Post(serverB.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusOK)
|
||||
}
|
||||
var result ProductionForwardResult
|
||||
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
||||
t.Fatalf("decode result: %v", err)
|
||||
}
|
||||
if !result.Accepted || !result.Forwarded || !result.Delivered || result.NextNodeID != nodeC.NodeID || result.By.NodeID != nodeB.NodeID {
|
||||
t.Fatalf("unexpected forward result: %+v", result)
|
||||
}
|
||||
if deliveredObservation.CurrentHopNodeID != nodeC.NodeID || deliveredObservation.MessageID != envelope.MessageID {
|
||||
t.Fatalf("destination did not observe forwarded envelope: %+v", deliveredObservation)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingGateForwardsMultiHopFabricControlByRoutePath(t *testing.T) {
|
||||
nodeC := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}
|
||||
var deliveredObservation ProductionEnvelopeObservation
|
||||
var nodeREvents []ProductionForwardLogEntry
|
||||
var nodeBEvents []ProductionForwardLogEntry
|
||||
serverC := httptest.NewServer(Server{
|
||||
Local: nodeC,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error {
|
||||
deliveredObservation = observation
|
||||
return nil
|
||||
},
|
||||
}.Handler())
|
||||
defer serverC.Close()
|
||||
|
||||
nodeR := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"}
|
||||
serverR := httptest.NewServer(Server{
|
||||
Local: nodeR,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
|
||||
nodeC.NodeID: serverC.URL,
|
||||
}),
|
||||
ProductionForwardLogger: func(entry ProductionForwardLogEntry) {
|
||||
nodeREvents = append(nodeREvents, entry)
|
||||
},
|
||||
}.Handler())
|
||||
defer serverR.Close()
|
||||
|
||||
nodeB := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
serverB := httptest.NewServer(Server{
|
||||
Local: nodeB,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
|
||||
nodeR.NodeID: serverR.URL,
|
||||
}),
|
||||
ProductionForwardLogger: func(entry ProductionForwardLogEntry) {
|
||||
nodeBEvents = append(nodeBEvents, entry)
|
||||
},
|
||||
}.Handler())
|
||||
defer serverB.Close()
|
||||
|
||||
envelope := validProductionEnvelope(nodeB)
|
||||
envelope.SourceNodeID = "node-a"
|
||||
envelope.DestinationNodeID = nodeC.NodeID
|
||||
envelope.CurrentHopNodeID = nodeB.NodeID
|
||||
envelope.NextHopNodeID = nodeR.NodeID
|
||||
envelope.RoutePath = []string{"node-a", nodeB.NodeID, nodeR.NodeID, nodeC.NodeID}
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
resp, err := http.Post(serverB.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusOK)
|
||||
}
|
||||
var result ProductionForwardResult
|
||||
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
||||
t.Fatalf("decode result: %v", err)
|
||||
}
|
||||
if !result.Accepted || !result.Forwarded || !result.Delivered || result.NextNodeID != nodeR.NodeID || result.By.NodeID != nodeB.NodeID {
|
||||
t.Fatalf("unexpected multi-hop result: %+v", result)
|
||||
}
|
||||
if deliveredObservation.CurrentHopNodeID != nodeC.NodeID || deliveredObservation.NextHopNodeID != nodeC.NodeID {
|
||||
t.Fatalf("destination did not observe final hop: %+v", deliveredObservation)
|
||||
}
|
||||
if len(deliveredObservation.VisitedNodeIDs) != 2 || deliveredObservation.VisitedNodeIDs[0] != nodeB.NodeID || deliveredObservation.VisitedNodeIDs[1] != nodeR.NodeID {
|
||||
t.Fatalf("visited path not propagated: %+v", deliveredObservation.VisitedNodeIDs)
|
||||
}
|
||||
if !hasProductionForwardEvent(nodeBEvents, "production_forward_forwarded") || !hasProductionForwardEvent(nodeREvents, "production_forward_forwarded") {
|
||||
t.Fatalf("missing relay forward events: nodeB=%+v nodeR=%+v", nodeBEvents, nodeREvents)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingGateForwardsConfiguredProductionRoute(t *testing.T) {
|
||||
nodeC := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}
|
||||
route := configuredProductionRoute("route-1", []string{"node-a", "node-b", "node-r", nodeC.NodeID})
|
||||
var deliveredObservation ProductionEnvelopeObservation
|
||||
serverC := httptest.NewServer(Server{
|
||||
Local: nodeC,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionRoutes: []SyntheticRoute{route},
|
||||
ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error {
|
||||
deliveredObservation = observation
|
||||
return nil
|
||||
},
|
||||
}.Handler())
|
||||
defer serverC.Close()
|
||||
|
||||
nodeR := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"}
|
||||
serverR := httptest.NewServer(Server{
|
||||
Local: nodeR,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionRoutes: []SyntheticRoute{route},
|
||||
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
|
||||
nodeC.NodeID: serverC.URL,
|
||||
}),
|
||||
}.Handler())
|
||||
defer serverR.Close()
|
||||
|
||||
nodeB := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
serverB := httptest.NewServer(Server{
|
||||
Local: nodeB,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionRoutes: []SyntheticRoute{route},
|
||||
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
|
||||
nodeR.NodeID: serverR.URL,
|
||||
}),
|
||||
}.Handler())
|
||||
defer serverB.Close()
|
||||
|
||||
envelope := validProductionEnvelope(nodeB)
|
||||
envelope.SourceNodeID = "node-a"
|
||||
envelope.DestinationNodeID = nodeC.NodeID
|
||||
envelope.CurrentHopNodeID = nodeB.NodeID
|
||||
envelope.NextHopNodeID = nodeR.NodeID
|
||||
envelope.RoutePath = route.Hops
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
resp, err := http.Post(serverB.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusOK)
|
||||
}
|
||||
if deliveredObservation.RouteID != route.RouteID || deliveredObservation.CurrentHopNodeID != nodeC.NodeID {
|
||||
t.Fatalf("configured route was not delivered: %+v", deliveredObservation)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingGateRejectsUnknownConfiguredProductionRoute(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
server := httptest.NewServer(Server{
|
||||
Local: local,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionRoutes: []SyntheticRoute{
|
||||
configuredProductionRoute("route-other", []string{"node-a", local.NodeID, "node-c"}),
|
||||
},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
envelope := validProductionEnvelope(local)
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusNotFound {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusNotFound)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingGateRejectsConfiguredProductionRouteWrongNextHop(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
route := configuredProductionRoute("route-1", []string{"node-a", local.NodeID, "node-r", "node-c"})
|
||||
server := httptest.NewServer(Server{
|
||||
Local: local,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionRoutes: []SyntheticRoute{route},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
envelope := validProductionEnvelope(local)
|
||||
envelope.SourceNodeID = "node-a"
|
||||
envelope.DestinationNodeID = "node-c"
|
||||
envelope.CurrentHopNodeID = local.NodeID
|
||||
envelope.NextHopNodeID = "node-c"
|
||||
envelope.RoutePath = route.Hops
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusBadRequest {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusBadRequest)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingGateRejectsRoutePathWrongNextHop(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
var events []ProductionForwardLogEntry
|
||||
server := httptest.NewServer(Server{
|
||||
Local: local,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionForwardLogger: func(entry ProductionForwardLogEntry) {
|
||||
events = append(events, entry)
|
||||
},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
envelope := validProductionEnvelope(local)
|
||||
envelope.SourceNodeID = "node-a"
|
||||
envelope.DestinationNodeID = "node-c"
|
||||
envelope.CurrentHopNodeID = local.NodeID
|
||||
envelope.NextHopNodeID = "node-x"
|
||||
envelope.RoutePath = []string{"node-a", local.NodeID, "node-r", "node-c"}
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusBadRequest {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusBadRequest)
|
||||
}
|
||||
if !hasProductionForwardEvent(events, "production_forward_rejected") {
|
||||
t.Fatalf("missing reject event: %+v", events)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingGateRejectsRoutePathLoop(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
server := httptest.NewServer(Server{
|
||||
Local: local,
|
||||
ProductionForwardingEnabled: true,
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
envelope := validProductionEnvelope(local)
|
||||
envelope.SourceNodeID = "node-a"
|
||||
envelope.DestinationNodeID = "node-c"
|
||||
envelope.CurrentHopNodeID = local.NodeID
|
||||
envelope.NextHopNodeID = "node-r"
|
||||
envelope.RoutePath = []string{"node-a", local.NodeID, "node-r", local.NodeID, "node-c"}
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusForbidden {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusForbidden)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingGateRejectsInvalidProductionEnvelope(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
server := httptest.NewServer(Server{
|
||||
Local: local,
|
||||
ProductionForwardingEnabled: true,
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
envelope := validProductionEnvelope(local)
|
||||
envelope.PayloadHash = "bad-hash"
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusBadRequest {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusBadRequest)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingGateRejectsOversizedProductionEnvelopePayload(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
observed := false
|
||||
server := httptest.NewServer(Server{
|
||||
Local: local,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionEnvelopeObserver: func(context.Context, ProductionEnvelopeObservation) error {
|
||||
observed = true
|
||||
return nil
|
||||
},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
envelope := validProductionEnvelope(local)
|
||||
envelope.Payload = json.RawMessage(`"` + string(bytes.Repeat([]byte("a"), MaxProductionEnvelopePayloadBytes+1)) + `"`)
|
||||
sum := sha256.Sum256(envelope.Payload)
|
||||
envelope.PayloadLength = len(envelope.Payload)
|
||||
envelope.PayloadHash = hex.EncodeToString(sum[:])
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusBadRequest {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusBadRequest)
|
||||
}
|
||||
if observed {
|
||||
t.Fatal("observer called for oversized envelope")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingGateRejectsFutureCreatedAt(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
observed := false
|
||||
server := httptest.NewServer(Server{
|
||||
Local: local,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionEnvelopeObserver: func(context.Context, ProductionEnvelopeObservation) error {
|
||||
observed = true
|
||||
return nil
|
||||
},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
envelope := validProductionEnvelope(local)
|
||||
envelope.CreatedAt = time.Now().UTC().Add(MaxProductionEnvelopeFutureSkew + time.Second)
|
||||
envelope.ExpiresAt = envelope.CreatedAt.Add(time.Minute)
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusBadRequest {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusBadRequest)
|
||||
}
|
||||
if observed {
|
||||
t.Fatal("observer called for future-created envelope")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingGateObservesValidEnvelopeWithoutPayload(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
var observed ProductionEnvelopeObservation
|
||||
server := httptest.NewServer(Server{
|
||||
Local: local,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error {
|
||||
observed = observation
|
||||
return nil
|
||||
},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
envelope := validProductionEnvelope(local)
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusNotImplemented {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusNotImplemented)
|
||||
}
|
||||
if observed.MessageID != envelope.MessageID || observed.RouteID != envelope.RouteID {
|
||||
t.Fatalf("unexpected observation: %+v", observed)
|
||||
}
|
||||
if observed.PayloadHash != envelope.PayloadHash || observed.PayloadLength != envelope.PayloadLength {
|
||||
t.Fatalf("payload metadata missing from observation: %+v", observed)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingGateDoesNotObserveRejectedEnvelope(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
observed := false
|
||||
server := httptest.NewServer(Server{
|
||||
Local: local,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionEnvelopeObserver: func(context.Context, ProductionEnvelopeObservation) error {
|
||||
observed = true
|
||||
return nil
|
||||
},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
envelope := validProductionEnvelope(local)
|
||||
envelope.ClusterID = "wrong-cluster"
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusForbidden {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusForbidden)
|
||||
}
|
||||
if observed {
|
||||
t.Fatal("observer called for rejected envelope")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingGateFailsClosedWhenObservationFails(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
server := httptest.NewServer(Server{
|
||||
Local: local,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionEnvelopeObserver: func(context.Context, ProductionEnvelopeObservation) error {
|
||||
return errors.New("observer down")
|
||||
},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
payload, err := json.Marshal(validProductionEnvelope(local))
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusInternalServerError {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusInternalServerError)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingGateFailsClosedWhenObservationPanics(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
server := httptest.NewServer(Server{
|
||||
Local: local,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionEnvelopeObserver: func(context.Context, ProductionEnvelopeObservation) error {
|
||||
panic("observer panic")
|
||||
},
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
payload, err := json.Marshal(validProductionEnvelope(local))
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusInternalServerError {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusInternalServerError)
|
||||
}
|
||||
}
|
||||
|
||||
func TestObserveProductionEnvelopeAllowsNilObserver(t *testing.T) {
|
||||
if err := observeProductionEnvelope(context.Background(), nil, ProductionEnvelopeObservation{}); err != nil {
|
||||
t.Fatalf("observeProductionEnvelope nil observer err = %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestProductionEnvelopeObservationSinkKeepsBoundedMetadata(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
sink := NewProductionEnvelopeObservationSink(2)
|
||||
server := httptest.NewServer(Server{
|
||||
Local: local,
|
||||
ProductionForwardingEnabled: true,
|
||||
ProductionEnvelopeObserver: sink.Observe,
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
for i := 1; i <= 3; i++ {
|
||||
envelope := validProductionEnvelope(local)
|
||||
envelope.MessageID = "message-" + string(rune('0'+i))
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusNotImplemented {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusNotImplemented)
|
||||
}
|
||||
}
|
||||
|
||||
observations := sink.Snapshot()
|
||||
if len(observations) != 2 {
|
||||
t.Fatalf("observation count = %d, want 2", len(observations))
|
||||
}
|
||||
if observations[0].MessageID != "message-2" || observations[1].MessageID != "message-3" {
|
||||
t.Fatalf("unexpected bounded observations: %+v", observations)
|
||||
}
|
||||
if observations[0].PayloadHash == "" || observations[0].PayloadLength == 0 {
|
||||
t.Fatalf("payload metadata missing from bounded observation: %+v", observations[0])
|
||||
}
|
||||
metrics := sink.Metrics()
|
||||
if metrics.Capacity != 2 || metrics.CurrentDepth != 2 || metrics.AcceptedTotal != 3 || metrics.DroppedOldest != 1 {
|
||||
t.Fatalf("unexpected sink metrics: %+v", metrics)
|
||||
}
|
||||
}
|
||||
|
||||
func TestProductionEnvelopeObservationSinkMetricsStartEmpty(t *testing.T) {
|
||||
sink := NewProductionEnvelopeObservationSink(3)
|
||||
metrics := sink.Metrics()
|
||||
if metrics.Capacity != 3 || metrics.CurrentDepth != 0 || metrics.AcceptedTotal != 0 || metrics.DroppedOldest != 0 {
|
||||
t.Fatalf("unexpected empty metrics: %+v", metrics)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingGateRejectsServiceChannel(t *testing.T) {
|
||||
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
|
||||
server := httptest.NewServer(Server{
|
||||
Local: local,
|
||||
ProductionForwardingEnabled: true,
|
||||
}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
envelope := validProductionEnvelope(local)
|
||||
envelope.ChannelClass = "render"
|
||||
payload, err := json.Marshal(envelope)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal envelope: %v", err)
|
||||
}
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
|
||||
if err != nil {
|
||||
t.Fatalf("post forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusForbidden {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusForbidden)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshForwardingRequiresPost(t *testing.T) {
|
||||
server := httptest.NewServer(Server{Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
resp, err := http.Get(server.URL + "/mesh/v1/forward")
|
||||
if err != nil {
|
||||
t.Fatalf("get forward: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusMethodNotAllowed {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusMethodNotAllowed)
|
||||
}
|
||||
}
|
||||
|
||||
func validProductionEnvelope(local PeerIdentity) ProductionEnvelope {
|
||||
payload := json.RawMessage(`{"kind":"control"}`)
|
||||
sum := sha256.Sum256(payload)
|
||||
now := time.Now().UTC()
|
||||
return ProductionEnvelope{
|
||||
FabricProtocolVersion: ProtocolVersion,
|
||||
MessageID: "message-1",
|
||||
RouteID: "route-1",
|
||||
ClusterID: local.ClusterID,
|
||||
SourceNodeID: "node-a",
|
||||
DestinationNodeID: "node-c",
|
||||
CurrentHopNodeID: local.NodeID,
|
||||
NextHopNodeID: "node-c",
|
||||
ChannelClass: ProductionChannelFabricControl,
|
||||
MessageType: ProductionMessageFabricControl,
|
||||
TTL: 4,
|
||||
HopCount: 1,
|
||||
CreatedAt: now,
|
||||
ExpiresAt: now.Add(time.Minute),
|
||||
PayloadLength: len(payload),
|
||||
PayloadHash: hex.EncodeToString(sum[:]),
|
||||
Payload: payload,
|
||||
}
|
||||
}
|
||||
|
||||
func configuredProductionRoute(routeID string, hops []string) SyntheticRoute {
|
||||
return SyntheticRoute{
|
||||
RouteID: routeID,
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: hops[0],
|
||||
DestinationNodeID: hops[len(hops)-1],
|
||||
Hops: append([]string{}, hops...),
|
||||
AllowedChannels: []string{ProductionChannelFabricControl},
|
||||
ExpiresAt: time.Now().UTC().Add(time.Hour),
|
||||
MaxTTL: 8,
|
||||
MaxHops: 8,
|
||||
}
|
||||
}
|
||||
|
||||
func hasProductionForwardEvent(events []ProductionForwardLogEntry, event string) bool {
|
||||
for _, item := range events {
|
||||
if item.Event == event {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func TestSyntheticEndpointDisabledByDefault(t *testing.T) {
|
||||
server := httptest.NewServer(Server{Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}}.Handler())
|
||||
defer server.Close()
|
||||
|
||||
resp, err := http.Post(server.URL+"/mesh/v1/synthetic/probe", "application/json", bytes.NewReader([]byte(`{}`)))
|
||||
if err != nil {
|
||||
t.Fatalf("post synthetic probe: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusServiceUnavailable {
|
||||
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusServiceUnavailable)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,280 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
type SyntheticRelaySchedulerConfig struct {
|
||||
Enabled bool
|
||||
Local PeerIdentity
|
||||
QueuePolicies []SyntheticRelayQueuePolicy
|
||||
AllowedChannels []string
|
||||
AllowedMessageTypes []string
|
||||
Now func() time.Time
|
||||
Logger func(SyntheticLogEntry)
|
||||
}
|
||||
|
||||
type SyntheticRelayScheduler struct {
|
||||
enabled bool
|
||||
local PeerIdentity
|
||||
policies map[string]SyntheticRelayQueuePolicy
|
||||
allowedChannels map[string]struct{}
|
||||
allowedMessageTypes map[string]struct{}
|
||||
priorityOrder []string
|
||||
now func() time.Time
|
||||
logger func(SyntheticLogEntry)
|
||||
|
||||
mu sync.Mutex
|
||||
queues map[string][]SyntheticEnvelope
|
||||
metrics SyntheticRelayQueueMetrics
|
||||
}
|
||||
|
||||
func NewSyntheticRelayScheduler(cfg SyntheticRelaySchedulerConfig) *SyntheticRelayScheduler {
|
||||
policies := cfg.QueuePolicies
|
||||
if len(policies) == 0 {
|
||||
policies = []SyntheticRelayQueuePolicy{
|
||||
{Channel: SyntheticChannelFabricControl, Capacity: 64, Droppable: false},
|
||||
{Channel: SyntheticChannelRouteControl, Capacity: 64, Droppable: false},
|
||||
{Channel: SyntheticChannelTelemetry, Capacity: 16, Droppable: true},
|
||||
}
|
||||
}
|
||||
policyMap := map[string]SyntheticRelayQueuePolicy{}
|
||||
allowedChannels := map[string]struct{}{}
|
||||
priorityOrder := make([]string, 0, len(policies))
|
||||
for _, policy := range policies {
|
||||
if policy.Channel == "" {
|
||||
continue
|
||||
}
|
||||
if policy.Capacity <= 0 {
|
||||
policy.Capacity = 1
|
||||
}
|
||||
policyMap[policy.Channel] = policy
|
||||
allowedChannels[policy.Channel] = struct{}{}
|
||||
priorityOrder = append(priorityOrder, policy.Channel)
|
||||
}
|
||||
for _, channel := range cfg.AllowedChannels {
|
||||
if channel != "" {
|
||||
allowedChannels[channel] = struct{}{}
|
||||
}
|
||||
}
|
||||
messageTypes := cfg.AllowedMessageTypes
|
||||
if len(messageTypes) == 0 {
|
||||
messageTypes = []string{
|
||||
SyntheticMessageProbe,
|
||||
SyntheticMessageProbeAck,
|
||||
SyntheticMessageRouteHealth,
|
||||
SyntheticMessageRouteHealthAck,
|
||||
SyntheticMessageTelemetry,
|
||||
SyntheticMessageTestService,
|
||||
SyntheticMessageTestServiceAck,
|
||||
}
|
||||
}
|
||||
allowedMessageTypes := map[string]struct{}{}
|
||||
for _, messageType := range messageTypes {
|
||||
if messageType != "" {
|
||||
allowedMessageTypes[messageType] = struct{}{}
|
||||
}
|
||||
}
|
||||
now := cfg.Now
|
||||
if now == nil {
|
||||
now = func() time.Time { return time.Now().UTC() }
|
||||
}
|
||||
return &SyntheticRelayScheduler{
|
||||
enabled: cfg.Enabled,
|
||||
local: cfg.Local,
|
||||
policies: policyMap,
|
||||
allowedChannels: allowedChannels,
|
||||
allowedMessageTypes: allowedMessageTypes,
|
||||
priorityOrder: priorityOrder,
|
||||
now: now,
|
||||
logger: cfg.Logger,
|
||||
queues: map[string][]SyntheticEnvelope{},
|
||||
metrics: SyntheticRelayQueueMetrics{
|
||||
QueueDepths: map[string]int{},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SyntheticRelayScheduler) Enqueue(envelope SyntheticEnvelope) (SyntheticRelayEnqueueResult, error) {
|
||||
if err := s.validateEnvelope(envelope); err != nil {
|
||||
s.reject(envelope, err)
|
||||
return SyntheticRelayEnqueueResult{}, err
|
||||
}
|
||||
policy := s.policies[envelope.Channel]
|
||||
result := SyntheticRelayEnqueueResult{
|
||||
Channel: envelope.Channel,
|
||||
QueueCapacity: policy.Capacity,
|
||||
AcceptedSequence: envelope.Sequence,
|
||||
}
|
||||
s.mu.Lock()
|
||||
queue := s.queues[envelope.Channel]
|
||||
if len(queue) >= policy.Capacity {
|
||||
if !policy.Droppable {
|
||||
s.metrics.Rejected++
|
||||
s.metrics.LastRejectReason = ErrSyntheticRelayQueueFull.Error()
|
||||
s.mu.Unlock()
|
||||
s.log(SyntheticLogEntry{
|
||||
Event: "fabric_relay_rejected",
|
||||
RouteID: envelope.RouteID,
|
||||
ClusterID: envelope.ClusterID,
|
||||
LocalNodeID: s.local.NodeID,
|
||||
Channel: envelope.Channel,
|
||||
MessageType: envelope.MessageType,
|
||||
Reason: ErrSyntheticRelayQueueFull.Error(),
|
||||
QueueDepth: len(queue),
|
||||
QueueCapacity: policy.Capacity,
|
||||
OccurredAt: s.now(),
|
||||
})
|
||||
return SyntheticRelayEnqueueResult{}, ErrSyntheticRelayQueueFull
|
||||
}
|
||||
result.Dropped = true
|
||||
result.DroppedSequence = queue[0].Sequence
|
||||
queue = queue[1:]
|
||||
s.metrics.Dropped++
|
||||
}
|
||||
queue = append(queue, envelope)
|
||||
s.queues[envelope.Channel] = queue
|
||||
result.QueueDepth = len(queue)
|
||||
s.metrics.Enqueued++
|
||||
s.metrics.QueueDepths[envelope.Channel] = len(queue)
|
||||
s.mu.Unlock()
|
||||
s.log(SyntheticLogEntry{
|
||||
Event: "fabric_relay_enqueued",
|
||||
RouteID: envelope.RouteID,
|
||||
ClusterID: envelope.ClusterID,
|
||||
LocalNodeID: s.local.NodeID,
|
||||
Channel: envelope.Channel,
|
||||
MessageType: envelope.MessageType,
|
||||
QueueDepth: result.QueueDepth,
|
||||
QueueCapacity: result.QueueCapacity,
|
||||
Dropped: result.Dropped,
|
||||
DroppedSequence: result.DroppedSequence,
|
||||
OccurredAt: s.now(),
|
||||
})
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (s *SyntheticRelayScheduler) Dequeue() (SyntheticEnvelope, error) {
|
||||
if !s.enabled {
|
||||
return SyntheticEnvelope{}, ErrMeshRuntimeDisabled
|
||||
}
|
||||
s.mu.Lock()
|
||||
for _, channel := range s.priorityOrder {
|
||||
queue := s.queues[channel]
|
||||
if len(queue) == 0 {
|
||||
continue
|
||||
}
|
||||
envelope := queue[0]
|
||||
queue = queue[1:]
|
||||
s.queues[channel] = queue
|
||||
s.metrics.Dequeued++
|
||||
s.metrics.QueueDepths[channel] = len(queue)
|
||||
s.mu.Unlock()
|
||||
s.log(SyntheticLogEntry{
|
||||
Event: "fabric_relay_dequeued",
|
||||
RouteID: envelope.RouteID,
|
||||
ClusterID: envelope.ClusterID,
|
||||
LocalNodeID: s.local.NodeID,
|
||||
Channel: envelope.Channel,
|
||||
MessageType: envelope.MessageType,
|
||||
QueueDepth: len(queue),
|
||||
QueueCapacity: s.policies[channel].Capacity,
|
||||
OccurredAt: s.now(),
|
||||
})
|
||||
return envelope, nil
|
||||
}
|
||||
s.mu.Unlock()
|
||||
return SyntheticEnvelope{}, ErrSyntheticRelayQueueEmpty
|
||||
}
|
||||
|
||||
func (s *SyntheticRelayScheduler) SnapshotQueueMetrics() SyntheticRelayQueueMetrics {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
depths := map[string]int{}
|
||||
for channel, depth := range s.metrics.QueueDepths {
|
||||
depths[channel] = depth
|
||||
}
|
||||
for channel, queue := range s.queues {
|
||||
depths[channel] = len(queue)
|
||||
}
|
||||
return SyntheticRelayQueueMetrics{
|
||||
Enqueued: s.metrics.Enqueued,
|
||||
Dequeued: s.metrics.Dequeued,
|
||||
Dropped: s.metrics.Dropped,
|
||||
Rejected: s.metrics.Rejected,
|
||||
LastRejectReason: s.metrics.LastRejectReason,
|
||||
QueueDepths: depths,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SyntheticRelayScheduler) validateEnvelope(envelope SyntheticEnvelope) error {
|
||||
if s == nil || !s.enabled {
|
||||
return ErrMeshRuntimeDisabled
|
||||
}
|
||||
if envelope.ProtocolVersion != ProtocolVersion {
|
||||
return ErrUnsupportedSyntheticMessage
|
||||
}
|
||||
if envelope.RouteID == "" {
|
||||
return ErrRouteIDRequired
|
||||
}
|
||||
if envelope.ClusterID == "" || envelope.ClusterID != s.local.ClusterID {
|
||||
return ErrClusterMismatch
|
||||
}
|
||||
if envelope.From.ClusterID != s.local.ClusterID || envelope.From.NodeID == "" {
|
||||
return ErrNodeMismatch
|
||||
}
|
||||
if envelope.To.ClusterID != s.local.ClusterID || envelope.To.NodeID != s.local.NodeID {
|
||||
return ErrNodeMismatch
|
||||
}
|
||||
if envelope.TTL <= 0 {
|
||||
return ErrTTLExhausted
|
||||
}
|
||||
if envelope.HopCount <= 0 {
|
||||
return ErrInvalidRoutePath
|
||||
}
|
||||
if contains(envelope.Visited, s.local.NodeID) {
|
||||
return ErrLoopDetected
|
||||
}
|
||||
if _, ok := s.allowedChannels[envelope.Channel]; !ok {
|
||||
return ErrUnauthorizedChannel
|
||||
}
|
||||
if _, ok := s.policies[envelope.Channel]; !ok {
|
||||
return ErrUnauthorizedChannel
|
||||
}
|
||||
if _, ok := s.allowedMessageTypes[envelope.MessageType]; !ok {
|
||||
return ErrUnsupportedSyntheticMessage
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *SyntheticRelayScheduler) reject(envelope SyntheticEnvelope, err error) {
|
||||
reason := ""
|
||||
if err != nil {
|
||||
reason = err.Error()
|
||||
}
|
||||
if s != nil {
|
||||
s.mu.Lock()
|
||||
s.metrics.Rejected++
|
||||
s.metrics.LastRejectReason = reason
|
||||
s.mu.Unlock()
|
||||
}
|
||||
if s != nil {
|
||||
s.log(SyntheticLogEntry{
|
||||
Event: "fabric_relay_rejected",
|
||||
RouteID: envelope.RouteID,
|
||||
ClusterID: envelope.ClusterID,
|
||||
LocalNodeID: s.local.NodeID,
|
||||
Channel: envelope.Channel,
|
||||
MessageType: envelope.MessageType,
|
||||
Reason: reason,
|
||||
OccurredAt: s.now(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func (s *SyntheticRelayScheduler) log(entry SyntheticLogEntry) {
|
||||
if s.logger != nil {
|
||||
s.logger(entry)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,213 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestSyntheticRelaySchedulerDequeuesByQoSPriority(t *testing.T) {
|
||||
scheduler := testRelayScheduler()
|
||||
telemetry := testRelayEnvelope(SyntheticChannelTelemetry, SyntheticMessageTelemetry, 1)
|
||||
routeControl := testRelayEnvelope(SyntheticChannelRouteControl, SyntheticMessageRouteHealth, 2)
|
||||
fabricControl := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 3)
|
||||
|
||||
if _, err := scheduler.Enqueue(telemetry); err != nil {
|
||||
t.Fatalf("enqueue telemetry: %v", err)
|
||||
}
|
||||
if _, err := scheduler.Enqueue(routeControl); err != nil {
|
||||
t.Fatalf("enqueue route control: %v", err)
|
||||
}
|
||||
if _, err := scheduler.Enqueue(fabricControl); err != nil {
|
||||
t.Fatalf("enqueue fabric control: %v", err)
|
||||
}
|
||||
|
||||
first, err := scheduler.Dequeue()
|
||||
if err != nil {
|
||||
t.Fatalf("dequeue first: %v", err)
|
||||
}
|
||||
second, err := scheduler.Dequeue()
|
||||
if err != nil {
|
||||
t.Fatalf("dequeue second: %v", err)
|
||||
}
|
||||
third, err := scheduler.Dequeue()
|
||||
if err != nil {
|
||||
t.Fatalf("dequeue third: %v", err)
|
||||
}
|
||||
if first.Channel != SyntheticChannelFabricControl {
|
||||
t.Fatalf("first channel = %q, want fabric_control", first.Channel)
|
||||
}
|
||||
if second.Channel != SyntheticChannelRouteControl {
|
||||
t.Fatalf("second channel = %q, want route_control", second.Channel)
|
||||
}
|
||||
if third.Channel != SyntheticChannelTelemetry {
|
||||
t.Fatalf("third channel = %q, want telemetry", third.Channel)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRelaySchedulerDropsOldestTelemetryOnly(t *testing.T) {
|
||||
scheduler := testRelayScheduler()
|
||||
first := testRelayEnvelope(SyntheticChannelTelemetry, SyntheticMessageTelemetry, 1)
|
||||
second := testRelayEnvelope(SyntheticChannelTelemetry, SyntheticMessageTelemetry, 2)
|
||||
|
||||
if result, err := scheduler.Enqueue(first); err != nil || result.Dropped {
|
||||
t.Fatalf("enqueue first result=%+v err=%v", result, err)
|
||||
}
|
||||
result, err := scheduler.Enqueue(second)
|
||||
if err != nil {
|
||||
t.Fatalf("enqueue second: %v", err)
|
||||
}
|
||||
if !result.Dropped || result.DroppedSequence != 1 {
|
||||
t.Fatalf("result = %+v, want dropped sequence 1", result)
|
||||
}
|
||||
dequeued, err := scheduler.Dequeue()
|
||||
if err != nil {
|
||||
t.Fatalf("dequeue: %v", err)
|
||||
}
|
||||
if dequeued.Sequence != 2 {
|
||||
t.Fatalf("dequeued sequence = %d, want 2", dequeued.Sequence)
|
||||
}
|
||||
metrics := scheduler.SnapshotQueueMetrics()
|
||||
if metrics.Dropped != 1 || metrics.Enqueued != 2 {
|
||||
t.Fatalf("metrics = %+v, want one drop and two enqueues", metrics)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRelaySchedulerRejectsFullReliableQueue(t *testing.T) {
|
||||
scheduler := testRelayScheduler()
|
||||
first := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1)
|
||||
second := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 2)
|
||||
|
||||
if _, err := scheduler.Enqueue(first); err != nil {
|
||||
t.Fatalf("enqueue first: %v", err)
|
||||
}
|
||||
_, err := scheduler.Enqueue(second)
|
||||
if !errors.Is(err, ErrSyntheticRelayQueueFull) {
|
||||
t.Fatalf("err = %v, want ErrSyntheticRelayQueueFull", err)
|
||||
}
|
||||
dequeued, err := scheduler.Dequeue()
|
||||
if err != nil {
|
||||
t.Fatalf("dequeue: %v", err)
|
||||
}
|
||||
if dequeued.Sequence != 1 {
|
||||
t.Fatalf("dequeued sequence = %d, want 1", dequeued.Sequence)
|
||||
}
|
||||
metrics := scheduler.SnapshotQueueMetrics()
|
||||
if metrics.Dropped != 0 || metrics.Rejected != 1 {
|
||||
t.Fatalf("metrics = %+v, want no drop and one rejection", metrics)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRelaySchedulerRejectsInvalidEnvelopes(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
mutate func(*SyntheticEnvelope)
|
||||
want error
|
||||
}{
|
||||
{
|
||||
name: "wrong cluster",
|
||||
mutate: func(envelope *SyntheticEnvelope) {
|
||||
envelope.ClusterID = "cluster-2"
|
||||
},
|
||||
want: ErrClusterMismatch,
|
||||
},
|
||||
{
|
||||
name: "wrong node",
|
||||
mutate: func(envelope *SyntheticEnvelope) {
|
||||
envelope.To.NodeID = "node-x"
|
||||
},
|
||||
want: ErrNodeMismatch,
|
||||
},
|
||||
{
|
||||
name: "unauthorized channel",
|
||||
mutate: func(envelope *SyntheticEnvelope) {
|
||||
envelope.Channel = "rdp_render"
|
||||
},
|
||||
want: ErrUnauthorizedChannel,
|
||||
},
|
||||
{
|
||||
name: "unsupported message",
|
||||
mutate: func(envelope *SyntheticEnvelope) {
|
||||
envelope.MessageType = "rdp.input"
|
||||
},
|
||||
want: ErrUnsupportedSyntheticMessage,
|
||||
},
|
||||
{
|
||||
name: "ttl exhausted",
|
||||
mutate: func(envelope *SyntheticEnvelope) {
|
||||
envelope.TTL = 0
|
||||
},
|
||||
want: ErrTTLExhausted,
|
||||
},
|
||||
{
|
||||
name: "loop detected",
|
||||
mutate: func(envelope *SyntheticEnvelope) {
|
||||
envelope.Visited = append(envelope.Visited, "node-r")
|
||||
},
|
||||
want: ErrLoopDetected,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
scheduler := testRelayScheduler()
|
||||
envelope := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1)
|
||||
tt.mutate(&envelope)
|
||||
_, err := scheduler.Enqueue(envelope)
|
||||
if !errors.Is(err, tt.want) {
|
||||
t.Fatalf("err = %v, want %v", err, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRelaySchedulerDisabledRejects(t *testing.T) {
|
||||
scheduler := NewSyntheticRelayScheduler(SyntheticRelaySchedulerConfig{
|
||||
Enabled: false,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"},
|
||||
})
|
||||
_, err := scheduler.Enqueue(testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1))
|
||||
if !errors.Is(err, ErrMeshRuntimeDisabled) {
|
||||
t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err)
|
||||
}
|
||||
if _, err := scheduler.Dequeue(); !errors.Is(err, ErrMeshRuntimeDisabled) {
|
||||
t.Fatalf("dequeue err = %v, want ErrMeshRuntimeDisabled", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRelaySchedulerQueueDepthSnapshot(t *testing.T) {
|
||||
scheduler := testRelayScheduler()
|
||||
if _, err := scheduler.Enqueue(testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1)); err != nil {
|
||||
t.Fatalf("enqueue fabric control: %v", err)
|
||||
}
|
||||
if _, err := scheduler.Enqueue(testRelayEnvelope(SyntheticChannelRouteControl, SyntheticMessageRouteHealth, 2)); err != nil {
|
||||
t.Fatalf("enqueue route control: %v", err)
|
||||
}
|
||||
metrics := scheduler.SnapshotQueueMetrics()
|
||||
if metrics.QueueDepths[SyntheticChannelFabricControl] != 1 {
|
||||
t.Fatalf("fabric_control depth = %d, want 1", metrics.QueueDepths[SyntheticChannelFabricControl])
|
||||
}
|
||||
if metrics.QueueDepths[SyntheticChannelRouteControl] != 1 {
|
||||
t.Fatalf("route_control depth = %d, want 1", metrics.QueueDepths[SyntheticChannelRouteControl])
|
||||
}
|
||||
}
|
||||
|
||||
func testRelayScheduler() *SyntheticRelayScheduler {
|
||||
return NewSyntheticRelayScheduler(SyntheticRelaySchedulerConfig{
|
||||
Enabled: true,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"},
|
||||
QueuePolicies: []SyntheticRelayQueuePolicy{
|
||||
{Channel: SyntheticChannelFabricControl, Capacity: 1, Droppable: false},
|
||||
{Channel: SyntheticChannelRouteControl, Capacity: 1, Droppable: false},
|
||||
{Channel: SyntheticChannelTelemetry, Capacity: 1, Droppable: true},
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
func testRelayEnvelope(channel string, messageType string, sequence uint64) SyntheticEnvelope {
|
||||
route := testRoute("route-relay-scheduler", []string{"node-a", "node-r", "node-b"})
|
||||
envelope := testEnvelope(route, "node-a", "node-r")
|
||||
envelope.Channel = channel
|
||||
envelope.MessageType = messageType
|
||||
envelope.Sequence = sequence
|
||||
return envelope
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,432 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
type syntheticTestTransport struct {
|
||||
nodes map[string]*SyntheticRuntime
|
||||
}
|
||||
|
||||
func (t syntheticTestTransport) SendSynthetic(ctx context.Context, nextNodeID string, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
|
||||
next := t.nodes[nextNodeID]
|
||||
if next == nil {
|
||||
return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable
|
||||
}
|
||||
return next.Receive(ctx, envelope)
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeDirectProbe(t *testing.T) {
|
||||
route := testRoute("route-direct", []string{"node-a", "node-b"})
|
||||
transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
|
||||
nodeA := testRuntime("node-a", transport, route)
|
||||
nodeB := testRuntime("node-b", transport, route)
|
||||
transport.nodes["node-a"] = nodeA
|
||||
transport.nodes["node-b"] = nodeB
|
||||
|
||||
ack, err := nodeA.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-direct")
|
||||
if err != nil {
|
||||
t.Fatalf("send probe: %v", err)
|
||||
}
|
||||
if ack.MessageType != SyntheticMessageProbeAck {
|
||||
t.Fatalf("MessageType = %q, want %q", ack.MessageType, SyntheticMessageProbeAck)
|
||||
}
|
||||
if ack.From.NodeID != "node-b" || ack.To.NodeID != "node-a" {
|
||||
t.Fatalf("unexpected ack peers: from=%+v to=%+v", ack.From, ack.To)
|
||||
}
|
||||
payload := decodeAckPayload(t, ack)
|
||||
if len(payload.Path) != 2 || payload.Path[0] != "node-a" || payload.Path[1] != "node-b" {
|
||||
t.Fatalf("Path = %#v, want node-a -> node-b", payload.Path)
|
||||
}
|
||||
if nodeB.SnapshotMetrics().ProbeAcksCreated != 1 {
|
||||
t.Fatalf("ProbeAcksCreated = %d, want 1", nodeB.SnapshotMetrics().ProbeAcksCreated)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeSingleRelayProbe(t *testing.T) {
|
||||
route := testRoute("route-relay", []string{"node-a", "node-r", "node-b"})
|
||||
transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
|
||||
nodeA := testRuntime("node-a", transport, route)
|
||||
nodeR := testRuntime("node-r", transport, route)
|
||||
nodeB := testRuntime("node-b", transport, route)
|
||||
transport.nodes["node-a"] = nodeA
|
||||
transport.nodes["node-r"] = nodeR
|
||||
transport.nodes["node-b"] = nodeB
|
||||
|
||||
ack, err := nodeA.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-relay")
|
||||
if err != nil {
|
||||
t.Fatalf("send probe: %v", err)
|
||||
}
|
||||
payload := decodeAckPayload(t, ack)
|
||||
if len(payload.Path) != 3 || payload.Path[0] != "node-a" || payload.Path[1] != "node-r" || payload.Path[2] != "node-b" {
|
||||
t.Fatalf("Path = %#v, want node-a -> node-r -> node-b", payload.Path)
|
||||
}
|
||||
if nodeR.SnapshotMetrics().ProbesForwarded != 1 {
|
||||
t.Fatalf("ProbesForwarded = %d, want 1", nodeR.SnapshotMetrics().ProbesForwarded)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeDisabledRejectsProbe(t *testing.T) {
|
||||
route := testRoute("route-disabled", []string{"node-a", "node-b"})
|
||||
nodeA := NewSyntheticRuntime(SyntheticRuntimeConfig{
|
||||
Enabled: false,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
Routes: []SyntheticRoute{route},
|
||||
})
|
||||
_, err := nodeA.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-disabled")
|
||||
if !errors.Is(err, ErrMeshRuntimeDisabled) {
|
||||
t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeRejectsWrongCluster(t *testing.T) {
|
||||
route := testRoute("route-wrong-cluster", []string{"node-a", "node-b"})
|
||||
nodeB := testRuntime("node-b", syntheticTestTransport{}, route)
|
||||
envelope := testEnvelope(route, "node-a", "node-b")
|
||||
envelope.ClusterID = "cluster-2"
|
||||
|
||||
_, err := nodeB.Receive(context.Background(), envelope)
|
||||
if !errors.Is(err, ErrClusterMismatch) {
|
||||
t.Fatalf("err = %v, want ErrClusterMismatch", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeRejectsWrongNode(t *testing.T) {
|
||||
route := testRoute("route-wrong-node", []string{"node-a", "node-b"})
|
||||
nodeB := testRuntime("node-b", syntheticTestTransport{}, route)
|
||||
envelope := testEnvelope(route, "node-a", "node-c")
|
||||
|
||||
_, err := nodeB.Receive(context.Background(), envelope)
|
||||
if !errors.Is(err, ErrNodeMismatch) {
|
||||
t.Fatalf("err = %v, want ErrNodeMismatch", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeRejectsUnauthorizedChannel(t *testing.T) {
|
||||
route := testRoute("route-unauthorized", []string{"node-a", "node-b"})
|
||||
nodeA := testRuntime("node-a", syntheticTestTransport{}, route)
|
||||
|
||||
_, err := nodeA.SendProbe(context.Background(), route.RouteID, "rdp_render", "probe-unauthorized")
|
||||
if !errors.Is(err, ErrUnauthorizedChannel) {
|
||||
t.Fatalf("err = %v, want ErrUnauthorizedChannel", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeRejectsExpiredRoute(t *testing.T) {
|
||||
route := testRoute("route-expired", []string{"node-a", "node-b"})
|
||||
route.ExpiresAt = time.Now().UTC().Add(-time.Minute)
|
||||
nodeA := testRuntime("node-a", syntheticTestTransport{}, route)
|
||||
|
||||
_, err := nodeA.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-expired")
|
||||
if !errors.Is(err, ErrRouteExpired) {
|
||||
t.Fatalf("err = %v, want ErrRouteExpired", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeRejectsTTLExhaustion(t *testing.T) {
|
||||
route := testRoute("route-ttl", []string{"node-a", "node-r", "node-b"})
|
||||
route.MaxTTL = 1
|
||||
transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
|
||||
nodeA := testRuntime("node-a", transport, route)
|
||||
nodeR := testRuntime("node-r", transport, route)
|
||||
transport.nodes["node-a"] = nodeA
|
||||
transport.nodes["node-r"] = nodeR
|
||||
|
||||
_, err := nodeA.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-ttl")
|
||||
if !errors.Is(err, ErrTTLExhausted) {
|
||||
t.Fatalf("err = %v, want ErrTTLExhausted", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeRejectsLoop(t *testing.T) {
|
||||
route := testRoute("route-loop", []string{"node-a", "node-b"})
|
||||
nodeB := testRuntime("node-b", syntheticTestTransport{}, route)
|
||||
envelope := testEnvelope(route, "node-a", "node-b")
|
||||
envelope.Visited = []string{"node-a", "node-b"}
|
||||
|
||||
_, err := nodeB.Receive(context.Background(), envelope)
|
||||
if !errors.Is(err, ErrLoopDetected) {
|
||||
t.Fatalf("err = %v, want ErrLoopDetected", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeRejectsUnavailablePeer(t *testing.T) {
|
||||
route := testRoute("route-missing-peer", []string{"node-a", "node-b"})
|
||||
nodeA := testRuntime("node-a", syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}, route)
|
||||
|
||||
_, err := nodeA.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-missing-peer")
|
||||
if !errors.Is(err, ErrSyntheticPeerUnavailable) {
|
||||
t.Fatalf("err = %v, want ErrSyntheticPeerUnavailable", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeRouteHealthProbeRecordsSuccess(t *testing.T) {
|
||||
route := testRoute("route-health", []string{"node-a", "node-b"})
|
||||
transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
|
||||
nodeA := testRuntime("node-a", transport, route)
|
||||
nodeB := testRuntime("node-b", transport, route)
|
||||
transport.nodes["node-a"] = nodeA
|
||||
transport.nodes["node-b"] = nodeB
|
||||
|
||||
result, err := nodeA.SendRouteHealthProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-health")
|
||||
if err != nil {
|
||||
t.Fatalf("send route health probe: %v", err)
|
||||
}
|
||||
if result.Ack.MessageType != SyntheticMessageRouteHealthAck {
|
||||
t.Fatalf("MessageType = %q, want %q", result.Ack.MessageType, SyntheticMessageRouteHealthAck)
|
||||
}
|
||||
if result.FallbackUsed {
|
||||
t.Fatal("FallbackUsed = true, want false")
|
||||
}
|
||||
observation, ok := nodeA.SnapshotRouteObservation(route.RouteID)
|
||||
if !ok {
|
||||
t.Fatal("route observation missing")
|
||||
}
|
||||
if observation.State != SyntheticRouteStateHealthy || observation.SuccessCount != 1 {
|
||||
t.Fatalf("observation = %+v, want healthy success", observation)
|
||||
}
|
||||
if observation.PolicyVersion != "policy-v1" || observation.PeerDirectoryVersion != "peers-v1" || observation.RouteVersion != "route-v1" {
|
||||
t.Fatalf("observation versions = %+v", observation)
|
||||
}
|
||||
metrics := nodeA.SnapshotMetrics()
|
||||
if metrics.RouteHealthProbesSent != 1 || metrics.RouteDeliveriesSucceeded != 1 {
|
||||
t.Fatalf("metrics = %+v, want health probe success", metrics)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeRouteHealthUsesDedicatedRouteConfig(t *testing.T) {
|
||||
base := testRoute("route-effective-health", []string{"node-a", "node-old", "node-b"})
|
||||
effective := testRoute("route-effective-health", []string{"node-a", "node-new", "node-b"})
|
||||
effective.RouteVersion = "decision-v1"
|
||||
transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
|
||||
nodeA := testRuntimeWithRouteHealth("node-a", transport, []SyntheticRoute{base}, []SyntheticRoute{effective})
|
||||
nodeOld := testRuntimeWithRouteHealth("node-old", transport, []SyntheticRoute{base}, []SyntheticRoute{effective})
|
||||
nodeNew := testRuntimeWithRouteHealth("node-new", transport, []SyntheticRoute{base}, []SyntheticRoute{effective})
|
||||
nodeB := testRuntimeWithRouteHealth("node-b", transport, []SyntheticRoute{base}, []SyntheticRoute{effective})
|
||||
transport.nodes["node-a"] = nodeA
|
||||
transport.nodes["node-old"] = nodeOld
|
||||
transport.nodes["node-new"] = nodeNew
|
||||
transport.nodes["node-b"] = nodeB
|
||||
|
||||
health, err := nodeA.SendRouteHealthProbe(context.Background(), base.RouteID, SyntheticChannelFabricControl, "probe-health-effective")
|
||||
if err != nil {
|
||||
t.Fatalf("send route health probe: %v", err)
|
||||
}
|
||||
healthPayload := decodeAckPayload(t, health.Ack)
|
||||
if got, want := healthPayload.Path, []string{"node-a", "node-new", "node-b"}; !sameStrings(got, want) {
|
||||
t.Fatalf("route health path = %v, want %v", got, want)
|
||||
}
|
||||
if nodeNew.SnapshotMetrics().ProbesForwarded != 1 {
|
||||
t.Fatalf("node-new forwarded = %d, want 1", nodeNew.SnapshotMetrics().ProbesForwarded)
|
||||
}
|
||||
if nodeOld.SnapshotMetrics().ProbesForwarded != 0 {
|
||||
t.Fatalf("node-old forwarded = %d, want 0 before regular probe", nodeOld.SnapshotMetrics().ProbesForwarded)
|
||||
}
|
||||
observation, ok := nodeA.SnapshotRouteObservation(base.RouteID)
|
||||
if !ok || observation.RouteVersion != "decision-v1" {
|
||||
t.Fatalf("route health observation = %+v, want decision route version", observation)
|
||||
}
|
||||
|
||||
probe, err := nodeA.SendProbe(context.Background(), base.RouteID, SyntheticChannelFabricControl, "probe-regular")
|
||||
if err != nil {
|
||||
t.Fatalf("send regular probe: %v", err)
|
||||
}
|
||||
probePayload := decodeAckPayload(t, probe)
|
||||
if got, want := probePayload.Path, []string{"node-a", "node-old", "node-b"}; !sameStrings(got, want) {
|
||||
t.Fatalf("regular probe path = %v, want %v", got, want)
|
||||
}
|
||||
if nodeOld.SnapshotMetrics().ProbesForwarded != 1 {
|
||||
t.Fatalf("node-old forwarded = %d, want 1 after regular probe", nodeOld.SnapshotMetrics().ProbesForwarded)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeRouteHealthUsesFallbackWhenPreferredUnavailable(t *testing.T) {
|
||||
preferred := testRoute("route-preferred", []string{"node-a", "node-r", "node-b"})
|
||||
fallback := testRoute("route-fallback", []string{"node-a", "node-b"})
|
||||
transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
|
||||
nodeA := testRuntime("node-a", transport, preferred, fallback)
|
||||
nodeB := testRuntime("node-b", transport, preferred, fallback)
|
||||
transport.nodes["node-a"] = nodeA
|
||||
transport.nodes["node-b"] = nodeB
|
||||
|
||||
result, err := nodeA.SendRouteHealthProbeWithFallback(
|
||||
context.Background(),
|
||||
preferred.RouteID,
|
||||
[]string{fallback.RouteID},
|
||||
SyntheticChannelFabricControl,
|
||||
"probe-fallback",
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("send route health probe with fallback: %v", err)
|
||||
}
|
||||
if !result.FallbackUsed {
|
||||
t.Fatal("FallbackUsed = false, want true")
|
||||
}
|
||||
if result.SelectedRouteID != fallback.RouteID {
|
||||
t.Fatalf("SelectedRouteID = %q, want %q", result.SelectedRouteID, fallback.RouteID)
|
||||
}
|
||||
preferredObservation, ok := nodeA.SnapshotRouteObservation(preferred.RouteID)
|
||||
if !ok {
|
||||
t.Fatal("preferred route observation missing")
|
||||
}
|
||||
if preferredObservation.State != SyntheticRouteStateFailed || preferredObservation.FailureCount != 1 {
|
||||
t.Fatalf("preferred observation = %+v, want failed", preferredObservation)
|
||||
}
|
||||
fallbackObservation, ok := nodeA.SnapshotRouteObservation(fallback.RouteID)
|
||||
if !ok {
|
||||
t.Fatal("fallback route observation missing")
|
||||
}
|
||||
if fallbackObservation.State != SyntheticRouteStateHealthy || fallbackObservation.SuccessCount != 1 {
|
||||
t.Fatalf("fallback observation = %+v, want healthy", fallbackObservation)
|
||||
}
|
||||
metrics := nodeA.SnapshotMetrics()
|
||||
if metrics.FallbackRoutesUsed != 1 || metrics.WarmRoutesPromoted != 1 || metrics.RouteDeliveriesFailed != 1 {
|
||||
t.Fatalf("metrics = %+v, want fallback promotion and one failed delivery", metrics)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeRouteCacheInvalidatesOnVersionChange(t *testing.T) {
|
||||
route := testRoute("route-cache", []string{"node-a", "node-b"})
|
||||
transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
|
||||
nodeA := testRuntime("node-a", transport, route)
|
||||
nodeB := testRuntime("node-b", transport, route)
|
||||
transport.nodes["node-a"] = nodeA
|
||||
transport.nodes["node-b"] = nodeB
|
||||
|
||||
if _, err := nodeA.SendRouteHealthProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-cache"); err != nil {
|
||||
t.Fatalf("send route health probe: %v", err)
|
||||
}
|
||||
if _, ok := nodeA.SnapshotRouteObservation(route.RouteID); !ok {
|
||||
t.Fatal("route observation missing before invalidation")
|
||||
}
|
||||
|
||||
invalidated := nodeA.InvalidateRouteCache("policy_changed", SyntheticRouteCacheVersion{PolicyVersion: "policy-v2"})
|
||||
if invalidated != 1 {
|
||||
t.Fatalf("invalidated = %d, want 1", invalidated)
|
||||
}
|
||||
if _, ok := nodeA.SnapshotRouteObservation(route.RouteID); ok {
|
||||
t.Fatal("route observation still present after invalidation")
|
||||
}
|
||||
if nodeA.SnapshotMetrics().RouteCacheInvalidations != 1 {
|
||||
t.Fatalf("RouteCacheInvalidations = %d, want 1", nodeA.SnapshotMetrics().RouteCacheInvalidations)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeRouteCacheKeepsCurrentVersion(t *testing.T) {
|
||||
route := testRoute("route-cache-current", []string{"node-a", "node-b"})
|
||||
transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
|
||||
nodeA := testRuntime("node-a", transport, route)
|
||||
nodeB := testRuntime("node-b", transport, route)
|
||||
transport.nodes["node-a"] = nodeA
|
||||
transport.nodes["node-b"] = nodeB
|
||||
|
||||
if _, err := nodeA.SendRouteHealthProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-cache-current"); err != nil {
|
||||
t.Fatalf("send route health probe: %v", err)
|
||||
}
|
||||
invalidated := nodeA.InvalidateRouteCache("same_versions", SyntheticRouteCacheVersion{
|
||||
RouteVersion: "route-v1",
|
||||
PolicyVersion: "policy-v1",
|
||||
PeerDirectoryVersion: "peers-v1",
|
||||
})
|
||||
if invalidated != 0 {
|
||||
t.Fatalf("invalidated = %d, want 0", invalidated)
|
||||
}
|
||||
if _, ok := nodeA.SnapshotRouteObservation(route.RouteID); !ok {
|
||||
t.Fatal("route observation missing after same-version invalidation")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeRouteHealthDisabledRejects(t *testing.T) {
|
||||
route := testRoute("route-health-disabled", []string{"node-a", "node-b"})
|
||||
nodeA := NewSyntheticRuntime(SyntheticRuntimeConfig{
|
||||
Enabled: false,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
Routes: []SyntheticRoute{route},
|
||||
})
|
||||
|
||||
_, err := nodeA.SendRouteHealthProbeWithFallback(
|
||||
context.Background(),
|
||||
route.RouteID,
|
||||
[]string{"route-fallback"},
|
||||
SyntheticChannelFabricControl,
|
||||
"probe-disabled-health",
|
||||
)
|
||||
if !errors.Is(err, ErrMeshRuntimeDisabled) {
|
||||
t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err)
|
||||
}
|
||||
}
|
||||
|
||||
func testRuntime(nodeID string, transport SyntheticTransport, routes ...SyntheticRoute) *SyntheticRuntime {
|
||||
return NewSyntheticRuntime(SyntheticRuntimeConfig{
|
||||
Enabled: true,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: nodeID},
|
||||
Routes: routes,
|
||||
Transport: transport,
|
||||
MaxTTL: 8,
|
||||
MaxHops: 8,
|
||||
})
|
||||
}
|
||||
|
||||
func testRuntimeWithRouteHealth(nodeID string, transport SyntheticTransport, routes []SyntheticRoute, routeHealthRoutes []SyntheticRoute) *SyntheticRuntime {
|
||||
return NewSyntheticRuntime(SyntheticRuntimeConfig{
|
||||
Enabled: true,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: nodeID},
|
||||
Routes: routes,
|
||||
RouteHealthRoutes: routeHealthRoutes,
|
||||
Transport: transport,
|
||||
MaxTTL: 8,
|
||||
MaxHops: 8,
|
||||
})
|
||||
}
|
||||
|
||||
func testRoute(routeID string, hops []string) SyntheticRoute {
|
||||
return SyntheticRoute{
|
||||
RouteID: routeID,
|
||||
ClusterID: "cluster-1",
|
||||
SourceNodeID: hops[0],
|
||||
DestinationNodeID: hops[len(hops)-1],
|
||||
Hops: hops,
|
||||
AllowedChannels: []string{SyntheticChannelFabricControl},
|
||||
ExpiresAt: time.Now().UTC().Add(time.Hour),
|
||||
MaxTTL: 8,
|
||||
MaxHops: 8,
|
||||
RouteVersion: "route-v1",
|
||||
PolicyVersion: "policy-v1",
|
||||
PeerDirectoryVersion: "peers-v1",
|
||||
}
|
||||
}
|
||||
|
||||
func testEnvelope(route SyntheticRoute, fromNodeID string, toNodeID string) SyntheticEnvelope {
|
||||
payload, _ := json.Marshal(SyntheticProbePayload{
|
||||
ProbeID: "probe-test",
|
||||
SentAt: time.Now().UTC(),
|
||||
})
|
||||
return SyntheticEnvelope{
|
||||
ProtocolVersion: ProtocolVersion,
|
||||
RouteID: route.RouteID,
|
||||
ClusterID: route.ClusterID,
|
||||
From: PeerIdentity{ClusterID: route.ClusterID, NodeID: fromNodeID},
|
||||
To: PeerIdentity{ClusterID: route.ClusterID, NodeID: toNodeID},
|
||||
Channel: SyntheticChannelFabricControl,
|
||||
MessageType: SyntheticMessageProbe,
|
||||
TTL: 8,
|
||||
HopCount: 1,
|
||||
Visited: []string{fromNodeID},
|
||||
Sequence: 1,
|
||||
SentAt: time.Now().UTC(),
|
||||
Payload: payload,
|
||||
}
|
||||
}
|
||||
|
||||
func decodeAckPayload(t *testing.T, envelope SyntheticEnvelope) SyntheticProbeAckPayload {
|
||||
t.Helper()
|
||||
var payload SyntheticProbeAckPayload
|
||||
if err := json.Unmarshal(envelope.Payload, &payload); err != nil {
|
||||
t.Fatalf("decode ack payload: %v", err)
|
||||
}
|
||||
return payload
|
||||
}
|
||||
@@ -0,0 +1,235 @@
|
||||
package mesh
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestSyntheticRuntimeTestServiceDirectRoute(t *testing.T) {
|
||||
route := testServiceRoute("route-test-service-direct", []string{"node-a", "node-b"})
|
||||
transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
|
||||
nodeA := testRuntime("node-a", transport, route)
|
||||
nodeB := testRuntime("node-b", transport, route)
|
||||
transport.nodes["node-a"] = nodeA
|
||||
transport.nodes["node-b"] = nodeB
|
||||
|
||||
result, err := nodeA.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, testServiceRequest("request-direct", "hello"))
|
||||
if err != nil {
|
||||
t.Fatalf("send test service: %v", err)
|
||||
}
|
||||
if result.Ack.MessageType != SyntheticMessageTestServiceAck {
|
||||
t.Fatalf("MessageType = %q, want %q", result.Ack.MessageType, SyntheticMessageTestServiceAck)
|
||||
}
|
||||
if result.Response.EchoPayload != "hello" {
|
||||
t.Fatalf("EchoPayload = %q, want hello", result.Response.EchoPayload)
|
||||
}
|
||||
if len(result.Response.Path) != 2 || result.Response.Path[0] != "node-a" || result.Response.Path[1] != "node-b" {
|
||||
t.Fatalf("Path = %#v, want node-a -> node-b", result.Response.Path)
|
||||
}
|
||||
metrics := nodeA.SnapshotMetrics()
|
||||
if metrics.TestServiceRequestsSent != 1 || metrics.TestServiceDeliveriesSucceeded != 1 {
|
||||
t.Fatalf("metrics = %+v, want one test service success", metrics)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeTestServiceSingleRelayRoute(t *testing.T) {
|
||||
route := testServiceRoute("route-test-service-relay", []string{"node-a", "node-r", "node-b"})
|
||||
transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
|
||||
nodeA := testRuntime("node-a", transport, route)
|
||||
nodeR := testRuntime("node-r", transport, route)
|
||||
nodeB := testRuntime("node-b", transport, route)
|
||||
transport.nodes["node-a"] = nodeA
|
||||
transport.nodes["node-r"] = nodeR
|
||||
transport.nodes["node-b"] = nodeB
|
||||
|
||||
result, err := nodeA.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, testServiceRequest("request-relay", "relay"))
|
||||
if err != nil {
|
||||
t.Fatalf("send test service: %v", err)
|
||||
}
|
||||
if len(result.Response.Path) != 3 || result.Response.Path[0] != "node-a" || result.Response.Path[1] != "node-r" || result.Response.Path[2] != "node-b" {
|
||||
t.Fatalf("Path = %#v, want node-a -> node-r -> node-b", result.Response.Path)
|
||||
}
|
||||
if nodeR.SnapshotMetrics().ProbesForwarded != 1 {
|
||||
t.Fatalf("ProbesForwarded = %d, want 1", nodeR.SnapshotMetrics().ProbesForwarded)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeTestServiceUsesForcedFallback(t *testing.T) {
|
||||
preferred := testServiceRoute("route-test-service-preferred", []string{"node-a", "node-r", "node-b"})
|
||||
fallback := testServiceRoute("route-test-service-fallback", []string{"node-a", "node-b"})
|
||||
transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
|
||||
nodeA := testRuntime("node-a", transport, preferred, fallback)
|
||||
nodeB := testRuntime("node-b", transport, preferred, fallback)
|
||||
transport.nodes["node-a"] = nodeA
|
||||
transport.nodes["node-b"] = nodeB
|
||||
|
||||
result, err := nodeA.SendTestServiceWithFallback(
|
||||
context.Background(),
|
||||
preferred.RouteID,
|
||||
[]string{fallback.RouteID},
|
||||
SyntheticChannelRouteControl,
|
||||
testServiceRequest("request-fallback", "fallback"),
|
||||
)
|
||||
if err != nil {
|
||||
t.Fatalf("send test service with fallback: %v", err)
|
||||
}
|
||||
if !result.FallbackUsed {
|
||||
t.Fatal("FallbackUsed = false, want true")
|
||||
}
|
||||
if result.SelectedRouteID != fallback.RouteID {
|
||||
t.Fatalf("SelectedRouteID = %q, want %q", result.SelectedRouteID, fallback.RouteID)
|
||||
}
|
||||
if result.Response.EchoPayload != "fallback" {
|
||||
t.Fatalf("EchoPayload = %q, want fallback", result.Response.EchoPayload)
|
||||
}
|
||||
metrics := nodeA.SnapshotMetrics()
|
||||
if metrics.TestServiceFallbacksUsed != 1 || metrics.TestServiceDeliveriesFailed != 1 || metrics.TestServiceDeliveriesSucceeded != 1 {
|
||||
t.Fatalf("metrics = %+v, want fallback success with one preferred failure", metrics)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeTestServiceRejectsWrongOrganization(t *testing.T) {
|
||||
route := testServiceRoute("route-test-service-wrong-org", []string{"node-a", "node-b"})
|
||||
nodeA := testRuntime("node-a", syntheticTestTransport{}, route)
|
||||
request := testServiceRequest("request-wrong-org", "hello")
|
||||
request.OrganizationID = "org-other"
|
||||
|
||||
_, err := nodeA.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, request)
|
||||
if !errors.Is(err, ErrSyntheticOrganizationMismatch) {
|
||||
t.Fatalf("err = %v, want ErrSyntheticOrganizationMismatch", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeTestServiceRejectsUnsupportedService(t *testing.T) {
|
||||
route := testServiceRoute("route-test-service-unsupported", []string{"node-a", "node-b"})
|
||||
nodeA := testRuntime("node-a", syntheticTestTransport{}, route)
|
||||
request := testServiceRequest("request-unsupported", "hello")
|
||||
request.ServiceType = "rdp"
|
||||
|
||||
_, err := nodeA.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, request)
|
||||
if !errors.Is(err, ErrUnsupportedSyntheticService) {
|
||||
t.Fatalf("err = %v, want ErrUnsupportedSyntheticService", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeTestServiceRejectsOversizedPayload(t *testing.T) {
|
||||
route := testServiceRoute("route-test-service-oversized", []string{"node-a", "node-b"})
|
||||
nodeA := NewSyntheticRuntime(SyntheticRuntimeConfig{
|
||||
Enabled: true,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
Routes: []SyntheticRoute{route},
|
||||
MaxTestPayloadBytes: 4,
|
||||
})
|
||||
|
||||
_, err := nodeA.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, testServiceRequest("request-oversized", "12345"))
|
||||
if !errors.Is(err, ErrSyntheticPayloadTooLarge) {
|
||||
t.Fatalf("err = %v, want ErrSyntheticPayloadTooLarge", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeTestServiceRejectsUnauthorizedChannel(t *testing.T) {
|
||||
route := testServiceRoute("route-test-service-channel", []string{"node-a", "node-b"})
|
||||
nodeA := testRuntime("node-a", syntheticTestTransport{}, route)
|
||||
|
||||
_, err := nodeA.SendTestService(context.Background(), route.RouteID, SyntheticChannelFabricControl, testServiceRequest("request-channel", "hello"))
|
||||
if !errors.Is(err, ErrUnauthorizedChannel) {
|
||||
t.Fatalf("err = %v, want ErrUnauthorizedChannel", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeTestServiceDisabledRejects(t *testing.T) {
|
||||
route := testServiceRoute("route-test-service-disabled", []string{"node-a", "node-b"})
|
||||
nodeA := NewSyntheticRuntime(SyntheticRuntimeConfig{
|
||||
Enabled: false,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
Routes: []SyntheticRoute{route},
|
||||
})
|
||||
|
||||
_, err := nodeA.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, testServiceRequest("request-disabled", "hello"))
|
||||
if !errors.Is(err, ErrMeshRuntimeDisabled) {
|
||||
t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRelaySchedulerAcceptsTestServiceMessage(t *testing.T) {
|
||||
scheduler := testRelayScheduler()
|
||||
envelope := testRelayEnvelope(SyntheticChannelRouteControl, SyntheticMessageTestService, 42)
|
||||
envelope.Payload = mustMarshalTestServiceRequest(testServiceRequest("request-relay-scheduler", "hello"))
|
||||
|
||||
if _, err := scheduler.Enqueue(envelope); err != nil {
|
||||
t.Fatalf("enqueue test service: %v", err)
|
||||
}
|
||||
dequeued, err := scheduler.Dequeue()
|
||||
if err != nil {
|
||||
t.Fatalf("dequeue test service: %v", err)
|
||||
}
|
||||
if dequeued.MessageType != SyntheticMessageTestService {
|
||||
t.Fatalf("MessageType = %q, want %q", dequeued.MessageType, SyntheticMessageTestService)
|
||||
}
|
||||
}
|
||||
|
||||
func testServiceRoute(routeID string, hops []string) SyntheticRoute {
|
||||
route := testRoute(routeID, hops)
|
||||
route.AllowedChannels = []string{SyntheticChannelRouteControl}
|
||||
return route
|
||||
}
|
||||
|
||||
func testServiceRequest(requestID string, payload string) SyntheticTestServiceRequest {
|
||||
return SyntheticTestServiceRequest{
|
||||
RequestID: requestID,
|
||||
OrganizationID: SyntheticDefaultTestOrganizationID,
|
||||
ServiceType: SyntheticTestServiceType,
|
||||
Payload: payload,
|
||||
}
|
||||
}
|
||||
|
||||
func mustMarshalTestServiceRequest(request SyntheticTestServiceRequest) []byte {
|
||||
payload, err := json.Marshal(request)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return payload
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeTestServiceRejectsMissingRequestID(t *testing.T) {
|
||||
route := testServiceRoute("route-test-service-missing-request", []string{"node-a", "node-b"})
|
||||
nodeA := testRuntime("node-a", syntheticTestTransport{}, route)
|
||||
request := testServiceRequest("", "hello")
|
||||
|
||||
_, err := nodeA.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, request)
|
||||
if !errors.Is(err, ErrSyntheticRequestInvalid) {
|
||||
t.Fatalf("err = %v, want ErrSyntheticRequestInvalid", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyntheticRuntimeTestServiceAllowsMaxPayloadBoundary(t *testing.T) {
|
||||
route := testServiceRoute("route-test-service-max", []string{"node-a", "node-b"})
|
||||
transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
|
||||
nodeA := NewSyntheticRuntime(SyntheticRuntimeConfig{
|
||||
Enabled: true,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
|
||||
Routes: []SyntheticRoute{route},
|
||||
Transport: transport,
|
||||
MaxTestPayloadBytes: 8,
|
||||
})
|
||||
nodeB := NewSyntheticRuntime(SyntheticRuntimeConfig{
|
||||
Enabled: true,
|
||||
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"},
|
||||
Routes: []SyntheticRoute{route},
|
||||
Transport: transport,
|
||||
MaxTestPayloadBytes: 8,
|
||||
})
|
||||
transport.nodes["node-a"] = nodeA
|
||||
transport.nodes["node-b"] = nodeB
|
||||
|
||||
result, err := nodeA.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, testServiceRequest("request-max", strings.Repeat("a", 8)))
|
||||
if err != nil {
|
||||
t.Fatalf("send test service: %v", err)
|
||||
}
|
||||
if result.Response.EchoPayload != strings.Repeat("a", 8) {
|
||||
t.Fatalf("EchoPayload = %q", result.Response.EchoPayload)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
package relay
|
||||
|
||||
import "errors"
|
||||
|
||||
// ErrProductionForwardingDisabled is the error ForwardProductionPayload
// always returns.
var ErrProductionForwardingDisabled = errors.New("relay skeleton does not forward production payloads before an approved production mesh stage")

// Skeleton is a placeholder relay identified by its cluster and node IDs.
// It answers whether a control connection may be accepted and refuses to
// forward any payload.
type Skeleton struct {
	ClusterID, NodeID string
}

// AcceptControlConnection reports whether both ClusterID and NodeID are set.
func (s Skeleton) AcceptControlConnection() bool {
	if s.ClusterID == "" {
		return false
	}
	return s.NodeID != ""
}

// ForwardProductionPayload ignores its argument and always returns
// ErrProductionForwardingDisabled.
func (s Skeleton) ForwardProductionPayload(_ []byte) error {
	return ErrProductionForwardingDisabled
}
|
||||
@@ -0,0 +1,16 @@
|
||||
package relay
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestRelaySkeletonAcceptsControlButNotPayloadForwarding(t *testing.T) {
|
||||
relay := Skeleton{ClusterID: "cluster-1", NodeID: "node-relay"}
|
||||
if !relay.AcceptControlConnection() {
|
||||
t.Fatal("relay skeleton should accept control connection metadata")
|
||||
}
|
||||
if err := relay.ForwardProductionPayload([]byte("rdp")); !errors.Is(err, ErrProductionForwardingDisabled) {
|
||||
t.Fatalf("err = %v, want ErrProductionForwardingDisabled", err)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,129 @@
|
||||
package state
|
||||
|
||||
import (
|
||||
"crypto/rand"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
)
|
||||
|
||||
// FileName is the name of the identity file that LoadOrCreate and the Mark*
// helpers read and write inside the directory they are given.
const FileName = "identity.json"
|
||||
|
||||
// Identity is the node identity record stored as JSON by Save and read back
// by Load. NodeFingerprint and PublicKey are filled with random tokens when
// LoadOrCreate creates a new record. IdentityStatus starts as "new", becomes
// "pending_approval" in MarkEnrollmentSubmitted, and is set to a
// caller-supplied value in MarkApprovedWithAuthority.
type Identity struct {
|
||||
// NodeID is assigned in MarkApprovedWithAuthority; LoadOrCreate leaves it empty.
NodeID string `json:"node_id"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
NodeName string `json:"node_name"`
|
||||
NodeFingerprint string `json:"node_fingerprint"`
|
||||
PublicKey string `json:"public_key"`
|
||||
IdentityStatus string `json:"identity_status"`
|
||||
// PendingJoinRequestID is set by MarkEnrollmentSubmitted and cleared by
// MarkApprovedWithAuthority.
PendingJoinRequestID string `json:"pending_join_request_id,omitempty"`
|
||||
// ClusterAuthorityPublicKey and ClusterAuthorityFingerprint are overwritten
// on every MarkApprovedWithAuthority call, including with empty strings.
ClusterAuthorityPublicKey string `json:"cluster_authority_public_key,omitempty"`
|
||||
ClusterAuthorityFingerprint string `json:"cluster_authority_fingerprint,omitempty"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
// UpdatedAt is re-stamped with the current UTC time by every Save.
UpdatedAt time.Time `json:"updated_at"`
|
||||
}
|
||||
|
||||
func LoadOrCreate(dir, clusterID, nodeName string) (Identity, error) {
|
||||
path := filepath.Join(dir, FileName)
|
||||
existing, err := Load(path)
|
||||
if err == nil {
|
||||
return existing, nil
|
||||
}
|
||||
if !errors.Is(err, os.ErrNotExist) {
|
||||
return Identity{}, err
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
fingerprint, err := randomToken("rap-node-fp")
|
||||
if err != nil {
|
||||
return Identity{}, err
|
||||
}
|
||||
publicKey, err := randomToken("rap-node-pub")
|
||||
if err != nil {
|
||||
return Identity{}, err
|
||||
}
|
||||
identity := Identity{
|
||||
ClusterID: clusterID,
|
||||
NodeName: nodeName,
|
||||
NodeFingerprint: fingerprint,
|
||||
PublicKey: publicKey,
|
||||
IdentityStatus: "new",
|
||||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
}
|
||||
if err := Save(path, identity); err != nil {
|
||||
return Identity{}, err
|
||||
}
|
||||
return identity, nil
|
||||
}
|
||||
|
||||
func Load(path string) (Identity, error) {
|
||||
payload, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return Identity{}, err
|
||||
}
|
||||
var identity Identity
|
||||
if err := json.Unmarshal(payload, &identity); err != nil {
|
||||
return Identity{}, err
|
||||
}
|
||||
return identity, nil
|
||||
}
|
||||
|
||||
func Save(path string, identity Identity) error {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0o700); err != nil {
|
||||
return err
|
||||
}
|
||||
identity.UpdatedAt = time.Now().UTC()
|
||||
payload, err := json.MarshalIndent(identity, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(path, payload, 0o600)
|
||||
}
|
||||
|
||||
func MarkEnrollmentSubmitted(dir, clusterID, joinRequestID string) (Identity, error) {
|
||||
path := filepath.Join(dir, FileName)
|
||||
identity, err := Load(path)
|
||||
if err != nil {
|
||||
return Identity{}, err
|
||||
}
|
||||
identity.ClusterID = clusterID
|
||||
identity.PendingJoinRequestID = joinRequestID
|
||||
identity.IdentityStatus = "pending_approval"
|
||||
if err := Save(path, identity); err != nil {
|
||||
return Identity{}, err
|
||||
}
|
||||
return identity, nil
|
||||
}
|
||||
|
||||
func MarkApproved(dir string, nodeID, clusterID, status string) (Identity, error) {
|
||||
return MarkApprovedWithAuthority(dir, nodeID, clusterID, status, "", "")
|
||||
}
|
||||
|
||||
func MarkApprovedWithAuthority(dir string, nodeID, clusterID, status, authorityPublicKey, authorityFingerprint string) (Identity, error) {
|
||||
path := filepath.Join(dir, FileName)
|
||||
identity, err := Load(path)
|
||||
if err != nil {
|
||||
return Identity{}, err
|
||||
}
|
||||
identity.NodeID = nodeID
|
||||
identity.ClusterID = clusterID
|
||||
identity.IdentityStatus = status
|
||||
identity.PendingJoinRequestID = ""
|
||||
identity.ClusterAuthorityPublicKey = authorityPublicKey
|
||||
identity.ClusterAuthorityFingerprint = authorityFingerprint
|
||||
if err := Save(path, identity); err != nil {
|
||||
return Identity{}, err
|
||||
}
|
||||
return identity, nil
|
||||
}
|
||||
|
||||
// randomToken returns prefix, an underscore, and 32 bytes read from
// crypto/rand encoded as unpadded URL-safe base64 (43 characters). It returns
// an empty string and the read error if the random source fails.
func randomToken(prefix string) (string, error) {
	entropy := make([]byte, 32)
	if _, err := rand.Read(entropy); err != nil {
		return "", err
	}
	encoded := base64.RawURLEncoding.EncodeToString(entropy)
	return prefix + "_" + encoded, nil
}
|
||||
@@ -0,0 +1,55 @@
|
||||
package state
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestLoadOrCreatePersistsIdentity(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
identity, err := LoadOrCreate(dir, "cluster-1", "node-a")
|
||||
if err != nil {
|
||||
t.Fatalf("load or create: %v", err)
|
||||
}
|
||||
if identity.NodeFingerprint == "" || identity.PublicKey == "" {
|
||||
t.Fatalf("identity missing generated fields: %+v", identity)
|
||||
}
|
||||
loaded, err := Load(filepath.Join(dir, FileName))
|
||||
if err != nil {
|
||||
t.Fatalf("load identity: %v", err)
|
||||
}
|
||||
if loaded.NodeFingerprint != identity.NodeFingerprint {
|
||||
t.Fatal("identity fingerprint was not persisted")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMarkApprovedUpdatesIdentity(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
if _, err := LoadOrCreate(dir, "cluster-1", "node-a"); err != nil {
|
||||
t.Fatalf("load or create: %v", err)
|
||||
}
|
||||
if _, err := MarkEnrollmentSubmitted(dir, "cluster-1", "join-request-1"); err != nil {
|
||||
t.Fatalf("mark enrollment submitted: %v", err)
|
||||
}
|
||||
approved, err := MarkApproved(dir, "node-1", "cluster-1", "active")
|
||||
if err != nil {
|
||||
t.Fatalf("mark approved: %v", err)
|
||||
}
|
||||
if approved.NodeID != "node-1" || approved.IdentityStatus != "active" || approved.PendingJoinRequestID != "" {
|
||||
t.Fatalf("unexpected approved identity: %+v", approved)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMarkApprovedWithAuthorityPinsClusterAuthority(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
if _, err := LoadOrCreate(dir, "cluster-1", "node-a"); err != nil {
|
||||
t.Fatalf("load or create: %v", err)
|
||||
}
|
||||
approved, err := MarkApprovedWithAuthority(dir, "node-1", "cluster-1", "active", "public-key-b64", "rap-ca-ed25519-test")
|
||||
if err != nil {
|
||||
t.Fatalf("mark approved with authority: %v", err)
|
||||
}
|
||||
if approved.ClusterAuthorityPublicKey != "public-key-b64" || approved.ClusterAuthorityFingerprint != "rap-ca-ed25519-test" {
|
||||
t.Fatalf("authority pin was not persisted: %+v", approved)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,40 @@
|
||||
package supervisor
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
|
||||
)
|
||||
|
||||
// Supervisor is implemented by anything that can be handed the desired
// workloads and answer with workload status requests.
type Supervisor interface {
|
||||
// Apply receives the desired workloads and returns the status requests to
// report for them, or an error.
Apply(ctx context.Context, desired []client.DesiredWorkload) ([]client.WorkloadStatusRequest, error)
|
||||
}
|
||||
|
||||
// StubSupervisor is a Supervisor whose Apply method only builds status
// reports for the desired workloads.
type StubSupervisor struct {
|
||||
// Version is reported for any workload whose own Version is empty.
Version string
|
||||
}
|
||||
|
||||
func (s StubSupervisor) Apply(_ context.Context, desired []client.DesiredWorkload) ([]client.WorkloadStatusRequest, error) {
|
||||
statuses := make([]client.WorkloadStatusRequest, 0, len(desired))
|
||||
for _, workload := range desired {
|
||||
state := "degraded"
|
||||
if workload.DesiredState == "disabled" {
|
||||
state = "stopped"
|
||||
}
|
||||
version := workload.Version
|
||||
if version == "" {
|
||||
version = s.Version
|
||||
}
|
||||
statuses = append(statuses, client.WorkloadStatusRequest{
|
||||
ReportedState: state,
|
||||
RuntimeMode: workload.RuntimeMode,
|
||||
Version: version,
|
||||
StatusPayload: map[string]any{
|
||||
"supervisor": "stub",
|
||||
"desired_state": workload.DesiredState,
|
||||
"service_type": workload.ServiceType,
|
||||
},
|
||||
})
|
||||
}
|
||||
return statuses, nil
|
||||
}
|
||||
@@ -0,0 +1,35 @@
|
||||
package supervisor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
|
||||
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
|
||||
)
|
||||
|
||||
func TestStubSupervisorReportsDegradedForEnabledWorkload(t *testing.T) {
|
||||
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
|
||||
{ServiceType: "rdp-worker", DesiredState: "enabled", RuntimeMode: "container"},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("apply desired workload: %v", err)
|
||||
}
|
||||
if len(statuses) != 1 {
|
||||
t.Fatalf("statuses length = %d", len(statuses))
|
||||
}
|
||||
if statuses[0].ReportedState != "degraded" {
|
||||
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
|
||||
}
|
||||
}
|
||||
|
||||
func TestStubSupervisorReportsStoppedForDisabledWorkload(t *testing.T) {
|
||||
statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
|
||||
{ServiceType: "relay-node", DesiredState: "disabled", RuntimeMode: "container"},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("apply desired workload: %v", err)
|
||||
}
|
||||
if statuses[0].ReportedState != "stopped" {
|
||||
t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
# Build stage: compiles the rap-api binary from ./cmd/api.
FROM golang:1.23-bookworm AS build
|
||||
|
||||
WORKDIR /src
|
||||
|
||||
# Copy only the module files and download dependencies before the rest of the
# source is copied in.
COPY go.mod go.sum ./
|
||||
RUN go mod download
|
||||
|
||||
COPY . .
|
||||
# CGO is disabled for this build; the output is written to /out/rap-api.
RUN CGO_ENABLED=0 GOOS=linux go build -o /out/rap-api ./cmd/api
|
||||
|
||||
# Runtime stage: Debian slim image with CA certificates and the compiled binary.
FROM debian:bookworm-slim
|
||||
|
||||
# Install ca-certificates and remove the apt package lists in the same layer.
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends ca-certificates \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY --from=build /out/rap-api /usr/local/bin/rap-api
|
||||
|
||||
ENTRYPOINT ["/usr/local/bin/rap-api"]
|
||||
@@ -0,0 +1,270 @@
|
||||
# Backend Foundation
|
||||
|
||||
Production-oriented Go backend skeleton for the remote access platform.
|
||||
|
||||
## Scope included
|
||||
|
||||
- configuration loading from environment
|
||||
- HTTP server bootstrap with graceful shutdown
|
||||
- PostgreSQL and Redis connectivity wiring
|
||||
- migrations scaffold
|
||||
- auth foundation with access/refresh tokens, hashed refresh rotation, trusted devices, and persisted auth sessions
|
||||
- persistent session storage foundation for remote sessions, attachments, resource policies, and audit events
|
||||
- session broker orchestration for start, attach, detach, takeover, terminate, failure, and detached-session recovery
|
||||
- Redis-backed live session state, controller binding, attach tokens, heartbeat keys, worker routing, and reconnect support
|
||||
- Redis-backed worker registration, lease lifecycle, heartbeat tracking, stale lease recovery, and routing queues
|
||||
- worker assignment queueing and worker event ingestion for the minimal real RDP worker runtime
|
||||
- websocket live plane with attach handshake, ping/pong heartbeat, state messages, takeover detection, and transport reconnect flow
|
||||
- module boundaries for auth, resources, session broker, and websocket gateway
|
||||
- worker registry scaffold to prepare later RDP worker integration
|
||||
- per-resource certificate verification policy for RDP connections with `strict` default and explicit `ignore` override
|
||||
- platform-core v2 foundations for organizations, memberships, identity sources, nodes, and node-agent control plane
|
||||
- Data Plane v1 contract scaffolding for optional session response candidates/tokens, with current backend gateway behavior preserved as fallback
|
||||
- production resource secret-readiness guard for rejecting plaintext credential-like metadata and requiring `secret_ref` for RDP/VNC/SSH resources in production mode
|
||||
- encrypted resource secret storage/resolver MVP for production `secret_ref` usage
|
||||
|
||||
## Entry point
|
||||
|
||||
Run the API from `cmd/api`.
|
||||
|
||||
## Local dev
|
||||
|
||||
- backend: `pwsh -File scripts/smoke/run-backend.ps1`
|
||||
- infra: `pwsh -File scripts/smoke/start-infra.ps1`
|
||||
- migrations: `pwsh -File scripts/smoke/apply-migrations.ps1`
|
||||
- worker image build: `docker build --tag rap-rdp-worker:dev --file workers/rdp-worker/Dockerfile workers/rdp-worker`
|
||||
- end-to-end smoke path: [scripts/smoke/README.md](../scripts/smoke/README.md)
|
||||
|
||||
## Configuration
|
||||
|
||||
Use `configs/api.example.env` as the starting point for local environment variables.
|
||||
|
||||
Resource secret-readiness is controlled by `APP_ENV`:
|
||||
|
||||
- in `APP_ENV=production` or `APP_ENV=prod`, RDP/VNC/SSH resources must carry
|
||||
`secret_ref` and must not include plaintext credential-like fields in
|
||||
`metadata`
|
||||
- in development and smoke environments, plaintext metadata remains allowed
|
||||
until the encrypted secret resolver is implemented
|
||||
- the production guard is enforced both on resource create/update and on
|
||||
session start, so legacy plaintext resources cannot be started in production
|
||||
accidentally
|
||||
- `SECRET_ENCRYPTION_KEY_B64` or `SECRET_ENCRYPTION_KEY_FILE` supplies the
|
||||
AES-256-GCM master key for the MVP encrypted store; production mode refuses
|
||||
to start without one
|
||||
- `SECRET_ENCRYPTION_KEY_ID` labels the active key version in stored records
|
||||
- `PUT /api/v1/resources/{resourceID}/secret` creates or rotates a resource
|
||||
secret and updates `resources.secret_ref`; plaintext is never returned by the
|
||||
API
|
||||
- session assignment keeps PostgreSQL metadata safe: `remote_sessions.metadata`
|
||||
stores `secret_ref`, while resolved credentials are merged only into the
|
||||
transient worker assignment after session/worker/lease checks
|
||||
|
||||
See `docs/architecture/SECURITY_SECRETS_READINESS.md` for the target
|
||||
secret-reference model and remaining resolver/PKI gaps.
|
||||
|
||||
Data Plane v1 contract scaffolding is controlled by:
|
||||
|
||||
- `DATA_PLANE_TOKEN_TTL`, default `1m`
|
||||
- `DATA_PLANE_TOKEN_PRIVATE_KEY_FILE`, optional path to an RSA private key PEM used to sign RS256 data-plane tokens
|
||||
- `DATA_PLANE_TOKEN_PRIVATE_KEY_PEM`, optional inline RSA private key PEM; used when file path is not configured
|
||||
- `DATA_PLANE_BACKEND_GATEWAY_URL`, default `/api/v1/gateway/ws`
|
||||
- `DATA_PLANE_DIRECT_WORKER_WSS_URL_TEMPLATE`, optional; supports `{worker_id}` replacement
|
||||
- `DATA_PLANE_DIRECT_WORKER_JSON_RUNTIME`, default `false`; advertises
|
||||
`runtime_transport=json_v1` only after the worker direct JSON bridge is
|
||||
deployed and verified
|
||||
- `DATA_PLANE_DIRECT_WORKER_BINARY_RENDER`, default `false`; when the direct
|
||||
JSON runtime is enabled, advertises `render_transport=binary_v1` so DP-2
|
||||
clients can request binary render frames over direct worker WSS. Binary
|
||||
render candidates also advertise `supported_color_modes=["full_color","grayscale"]`
|
||||
and `default_color_mode="full_color"` for the DP-3A grayscale foundation.
|
||||
- `DATA_PLANE_DIRECT_WORKER_TLS_TRUST_MODE`, default `smoke_insecure`; allowed
|
||||
values are `smoke_insecure`, `public_ca`, and `platform_ca`.
|
||||
- `DATA_PLANE_DIRECT_WORKER_TLS_CA_REF`, optional label for the platform CA or
|
||||
trust bundle version advertised to clients.
|
||||
|
||||
Data-plane tokens are RS256-signed. The backend must hold only the private key;
|
||||
workers receive only the matching public key for validation. If no private key
|
||||
is configured, the backend omits the optional `data_plane` offer and the
|
||||
backend gateway fallback remains unchanged.
|
||||
|
||||
If no direct worker WSS URL template is configured, session responses still include the backend gateway fallback candidate only.
|
||||
If the URL template is configured but `DATA_PLANE_DIRECT_WORKER_JSON_RUNTIME`
|
||||
is `false`, the direct candidate is still present for contract visibility but is
|
||||
not marked data-capable; DP-1D Windows clients will skip it and use the backend
|
||||
gateway fallback.
|
||||
If `DATA_PLANE_DIRECT_WORKER_BINARY_RENDER` is `false`, direct worker WSS
|
||||
remains JSON/base64 for render. If it is `true`, only direct worker WSS render
|
||||
is binary; backend gateway fallback remains JSON/base64.
|
||||
In production, the backend does not advertise direct worker WSS when
|
||||
`DATA_PLANE_DIRECT_WORKER_TLS_TRUST_MODE=smoke_insecure`; it keeps the backend
|
||||
gateway fallback instead. Trusted direct candidates include `tls_trust_mode`,
|
||||
`production_trusted`, `smoke_only`, and optional `tls_ca_ref` metadata. See
|
||||
`docs/architecture/DIRECT_WORKER_TLS_PKI.md`.
|
||||
|
||||
## Module layout
|
||||
|
||||
- `internal/platform` shared runtime, config, infra, and bootstrap concerns
|
||||
- `internal/modules/auth` auth and trusted-device boundary
|
||||
- `internal/modules/organization` organization model, org roles, and memberships
|
||||
- `internal/modules/identitysource` local/LDAP/OIDC identity source model and future mapping foundations
|
||||
- `internal/modules/resource` remote resource inventory boundary
|
||||
- `internal/modules/sessionbroker` persistent session lifecycle, orchestration, audit, and Redis live-state boundary
|
||||
- `internal/modules/sessiongateway` websocket attach/reconnect/takeover transport boundary
|
||||
- `internal/modules/worker` worker registration, lease coordination, and control-plane routing boundary for future C++ RDP workers
|
||||
- `internal/modules/node` node inventory, capabilities, enabled services, update policy, and partition state
|
||||
- `internal/modules/nodeagent` node-agent registration, health, service status, and update/rollback control interface
|
||||
- `pkg/contracts` cross-module contracts for sessions and worker control
|
||||
|
||||
## Backend responsibilities
|
||||
|
||||
- PostgreSQL remains the source of truth for auth sessions, devices, remote sessions, attachments, resource policies, and audit events
|
||||
- Redis is used only for live routing and coordination: attach tokens, controller bindings, live session cache, worker registration, worker leases, heartbeats, and routing queues
|
||||
- `worker:control:<worker_id>` carries worker assignments, `worker:queue:<session_id>` carries live control/input envelopes, and `worker:events` carries worker-reported lifecycle events back into broker processing
|
||||
- Session broker owns state transitions and orchestration rules; websocket handlers call broker services instead of talking to postgres repositories directly
|
||||
- Worker runtime stays behind interfaces and Redis coordination so the backend remains isolated from FreeRDP implementation details while the minimal real RDP worker plugs into the control plane
|
||||
- RDP certificate verification is configured per resource through `certificate_verification_mode`
|
||||
- resources are now org-scoped in PostgreSQL and remote sessions persist their owning organization without changing the proven worker/session runtime contracts
|
||||
- session start/attach/takeover responses may include optional `data_plane` candidates and a short-lived signed data-plane token for DP-1 direct worker WSS migration; existing clients continue to use the current gateway path, and direct realtime use remains gated by explicit candidate metadata
|
||||
|
||||
## Authorization model
|
||||
|
||||
- `platform_admin` and `platform_recovery_admin` have global access across organizations, resources, and sessions
|
||||
- in `INSTALLATION_AUTHORITY_MODE=strict`, platform-admin power is effective only
|
||||
when the user also has a valid signed row in `platform_role_grants`; changing
|
||||
`users.platform_role` in PostgreSQL alone no longer grants owner access
|
||||
- first-owner bootstrap is available at
|
||||
`POST /api/v1/installation/bootstrap-owner` and requires a Product Root
|
||||
Ed25519 signature over an activation manifest in strict mode
|
||||
- production (`APP_ENV=production` or `prod`) requires strict installation
|
||||
authority plus `INSTALLATION_PRODUCT_ROOT_PUBLIC_KEY_B64` or
|
||||
`INSTALLATION_PRODUCT_ROOT_PUBLIC_KEY_FILE`
|
||||
- legacy/dev installs can keep database-role behavior, and insecure first-owner
|
||||
bootstrap is available only when
|
||||
`INSTALLATION_INSECURE_BOOTSTRAP_ENABLED=true`
|
||||
- `org_owner` and `org_admin` can create and update resources inside their organization and can manage any remote session inside that organization
|
||||
- active non-admin memberships such as `org_operator`, `org_member`, and `org_viewer` are deny-by-default for admin actions; they can only access org-scoped reads and operate on their own session flows where the session broker explicitly allows it
|
||||
- session start always authorizes the actor against the resource organization before worker reservation
|
||||
- attach, detach, takeover, and terminate authorize against the owning remote session organization before any state transition is written
|
||||
- worker-facing events do not bypass this model for user-originated commands; internal worker failure and heartbeat paths remain broker-internal control-plane operations
|
||||
|
||||
## Migration safety
|
||||
|
||||
- `000005_platform_core_v2` bootstraps a single `default` organization and backfills existing `resources.organization_id` and `remote_sessions.organization_id` into that organization before setting `NOT NULL`
|
||||
- `000006_default_org_memberships_backfill` safely restores access continuity by inserting missing active memberships for existing users into the `default` organization
|
||||
- the backfill is idempotent because it only inserts rows missing under the `(organization_id, user_id)` uniqueness constraint
|
||||
- platform administrators are backfilled as `org_owner` in the default organization, while other existing users are backfilled as `org_member`
|
||||
- if `000005` fails before the `NOT NULL` step, PostgreSQL rolls back the transaction and leaves pre-v2 rows untouched; if `000006` is rerun, it skips already-created memberships rather than duplicating them
|
||||
|
||||
## Platform-Core V2 Notes
|
||||
|
||||
- `organizations`, `organization_memberships`, and `organization_roles` establish multi-tenant ownership and basic org-scoped authorization boundaries
|
||||
- `identity_sources` and `identity_mappings` are foundation-only in this phase; full LDAP/OIDC sync and claim/group ingestion are intentionally deferred
|
||||
- `nodes`, `node_capabilities`, `node_services`, `node_update_policies`, `node_partition_states`, and `node_agent_update_runs` provide the first control-plane model for node and node-agent lifecycle
|
||||
- current proven RDP session lifecycle remains preserved: the session broker still orchestrates the same worker/session behavior, but it now records organization ownership via org-scoped resources
|
||||
- PostgreSQL remains the source of truth for organizations, memberships, org-scoped resources, identity sources, nodes, node-agent state, and session lifecycle state
|
||||
|
||||
## Resource Certificate Verification
|
||||
|
||||
- `strict` is the default and keeps normal certificate validation enabled in the worker runtime
|
||||
- `ignore` must be explicitly stored on the resource and allows that one RDP connection to skip certificate validation
|
||||
- the backend passes this policy through session assignment data; it is not a global backend toggle
|
||||
|
||||
## Messaging Model
|
||||
|
||||
- HTTP errors now use a structured envelope:
|
||||
- `error.code`
|
||||
- `error.message_key`
|
||||
- `error.fallback_message`
|
||||
- `error.details`
|
||||
- `error.trace_id`
|
||||
- `internal/platform/httpx` owns error normalization and trace-id generation so handlers can keep calling `WriteError(...)` without changing business logic.
|
||||
- For `5xx` responses, user-facing payloads are normalized to an English generic fallback message while logs and diagnostics can still keep raw internal details elsewhere.
|
||||
- For `4xx` responses, stable `code` and `message_key` are derived from the current fallback message, so clients can localize without depending on raw English text as the primary contract.
|
||||
|
||||
## WebSocket Messaging
|
||||
|
||||
- Session gateway envelopes keep the existing `type` and `payload` contract.
|
||||
- User-facing websocket events now also include `event` with:
|
||||
- `code`
|
||||
- `message_key`
|
||||
- `fallback_message`
|
||||
- `details`
|
||||
- `trace_id`
|
||||
- `session.taken_over`, terminal `session.state`, `transport.closed`, and protocol-level errors now carry this structured event object.
|
||||
- Existing payload semantics remain intact for compatibility with the already proven session lifecycle.
|
||||
|
||||
## Message Rules
|
||||
|
||||
- Keep English as the only development language for `fallback_message`, logs, and diagnostics.
|
||||
- New HTTP handlers should prefer `httpx.WriteError(...)` for user-facing failures instead of hand-building `"error": "..."` JSON.
|
||||
- New websocket user-facing notifications should populate `TransportEnvelope.Event` with a stable `code` and `message_key`.
|
||||
- Do not use raw human-readable English text as the primary client contract; it should only remain as fallback text.
|
||||
- This messaging layer is now runtime-proven against the live Windows smoke flow for invalid-login errors, websocket takeover delivery, websocket state fallback rendering, and worker-death failure handling.
|
||||
## Clipboard Policy
|
||||
|
||||
RDP text clipboard is controlled per resource through `resource_policies.clipboard_mode`.
|
||||
Allowed values are `disabled`, `client_to_server`, `server_to_client`, and
|
||||
`bidirectional`; the default is `disabled`. The legacy `clipboard_enabled`
|
||||
column is retained only for compatibility and migration/backfill, while new
|
||||
runtime decisions use `clipboard_mode`.
|
||||
|
||||
Clipboard enforcement happens in the real data path:
|
||||
|
||||
- `sessionbroker.ResourcePolicy.ClipboardMode` is loaded from PostgreSQL and
|
||||
embedded into the session assignment metadata sent to the worker.
|
||||
- `sessiongateway.Module.handleEnvelope` blocks client-to-server clipboard
|
||||
envelopes unless the session is `active` and the policy allows that direction.
|
||||
- `worker.EventProcessor` sends worker-originated clipboard text through
|
||||
`sessionbroker.Service.UpdateWorkerClipboardText`, which applies the same
|
||||
active-state and server-to-client policy checks before updating live state.
|
||||
- Clipboard messages carry `sequence_id`, `origin`, and `content_hash` so
|
||||
clients and workers can avoid feedback loops across reattach/takeover paths.
|
||||
- Redis stores clipboard text only as transient live state for routing to the
|
||||
active controller; PostgreSQL remains authoritative for policy/session state.
|
||||
|
||||
## File Upload Policy
|
||||
|
||||
Stage 5.1 introduces client-to-server file upload as a policy-gated RDP
|
||||
feature. The authoritative policy field is
|
||||
`resource_policies.file_transfer_mode`; allowed values are `disabled`,
|
||||
`client_to_server`, `server_to_client`, and `bidirectional`, but only
|
||||
`client_to_server` behavior is implemented in this stage. The default is
|
||||
`disabled`. The legacy `file_transfer_enabled` column is retained only as a
|
||||
derived compatibility flag and must not be treated as the primary policy.
|
||||
|
||||
Enforcement is deliberately duplicated in the real data path:
|
||||
|
||||
- `resource.Module` exposes `file_transfer_mode` in resource create, update,
|
||||
list, and read payloads.
|
||||
- `sessionbroker.Service.StartRemoteSession` embeds `file_transfer_mode` into
|
||||
assignment metadata and requests the worker `file-transfer` capability only
|
||||
when client-to-server upload is allowed.
|
||||
- `sessiongateway.Module.handleFileUploadStart` and
|
||||
`handleFileUploadChunk` require an active session, current controller,
|
||||
allowed policy mode, valid UUID `transfer_id`, safe file name, 25 MiB max
|
||||
file size, and 256 KiB max chunk size before routing chunks to the worker.
|
||||
- Redis is used only to route bounded upload envelopes to the worker. The file
|
||||
itself is written by the worker to controlled worker storage; PostgreSQL
|
||||
remains authoritative for policy and session state.
|
||||
|
||||
## File Download Policy
|
||||
|
||||
Stage 5.2 adds a runtime-proven server-to-client download path for RDP. The
|
||||
policy field remains `resource_policies.file_transfer_mode`; `server_to_client`
|
||||
and `bidirectional` allow download, while `disabled` and `client_to_server`
|
||||
block it. The default remains `disabled`.
|
||||
|
||||
The v1 download model uses only the restricted `RAP_Transfers\ToClient`
|
||||
drop-zone inside the existing per-session visible transfer directory. Backend
|
||||
gateway accepts only `file_download.start`, `file_download.ack`, and
|
||||
`file_download.cancel` from the current controller of an active session and
|
||||
routes them to the worker after policy validation. Worker-origin
|
||||
`file_download.*` events are stored only as transient live state for
|
||||
backend-gateway fallback delivery; PostgreSQL remains authoritative for
|
||||
session/resource/policy state and must not store file contents.
|
||||
|
||||
The direct worker WSS path is also lifecycle-gated: detach returns
|
||||
`file_download.blocked`, old-controller takeover returns `session.taken_over`,
|
||||
and worker failure closes the direct transport after PostgreSQL transitions the
|
||||
session to `failed`.
|
||||
@@ -0,0 +1,24 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
"os/signal"
|
||||
"syscall"
|
||||
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/runtime"
|
||||
)
|
||||
|
||||
func main() {
|
||||
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
||||
defer stop()
|
||||
|
||||
app, err := runtime.NewApp(ctx)
|
||||
if err != nil {
|
||||
log.Fatalf("bootstrap app: %v", err)
|
||||
}
|
||||
|
||||
if err := app.Run(ctx); err != nil {
|
||||
log.Fatalf("run app: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,94 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/url"
|
||||
"os"
|
||||
"os/signal"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/gorilla/websocket"
|
||||
)
|
||||
|
||||
func main() {
|
||||
var (
|
||||
gatewayURL = flag.String("gateway-url", "ws://127.0.0.1:8080/api/v1/gateway/ws", "websocket gateway url")
|
||||
attachToken = flag.String("attach-token", "", "short-lived attach token")
|
||||
duration = flag.Duration("duration", 90*time.Second, "maximum time to keep the client attached")
|
||||
heartbeatInterval = flag.Duration("heartbeat-interval", 10*time.Second, "client heartbeat interval")
|
||||
exitOnTakenOver = flag.Bool("exit-on-taken-over", true, "exit successfully after receiving session.taken_over")
|
||||
)
|
||||
flag.Parse()
|
||||
|
||||
if *attachToken == "" {
|
||||
log.Fatal("attach-token is required")
|
||||
}
|
||||
|
||||
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
|
||||
defer cancel()
|
||||
if *duration > 0 {
|
||||
var timeoutCancel context.CancelFunc
|
||||
ctx, timeoutCancel = context.WithTimeout(ctx, *duration)
|
||||
defer timeoutCancel()
|
||||
}
|
||||
|
||||
targetURL, err := url.Parse(*gatewayURL)
|
||||
if err != nil {
|
||||
log.Fatalf("parse gateway url: %v", err)
|
||||
}
|
||||
query := targetURL.Query()
|
||||
query.Set("attach_token", *attachToken)
|
||||
targetURL.RawQuery = query.Encode()
|
||||
|
||||
conn, _, err := websocket.DefaultDialer.DialContext(ctx, targetURL.String(), nil)
|
||||
if err != nil {
|
||||
log.Fatalf("dial websocket: %v", err)
|
||||
}
|
||||
defer conn.Close()
|
||||
|
||||
done := make(chan error, 1)
|
||||
go func() {
|
||||
for {
|
||||
_, payload, err := conn.ReadMessage()
|
||||
if err != nil {
|
||||
done <- err
|
||||
return
|
||||
}
|
||||
fmt.Println(string(payload))
|
||||
if *exitOnTakenOver && containsTakenOver(payload) {
|
||||
done <- nil
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
heartbeatTicker := time.NewTicker(*heartbeatInterval)
|
||||
defer heartbeatTicker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case err := <-done:
|
||||
if err != nil {
|
||||
log.Fatalf("websocket read: %v", err)
|
||||
}
|
||||
return
|
||||
case <-heartbeatTicker.C:
|
||||
if err := conn.WriteJSON(map[string]any{"type": "heartbeat"}); err != nil {
|
||||
log.Fatalf("heartbeat write: %v", err)
|
||||
}
|
||||
case <-ctx.Done():
|
||||
if err := ctx.Err(); err != nil && err != context.Canceled && err != context.DeadlineExceeded {
|
||||
log.Fatalf("context ended: %v", err)
|
||||
}
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// containsTakenOver reports whether the raw message contains the exact text
// `"type":"session.taken_over"`. This is a plain substring match rather than a
// JSON decode, so a message that puts whitespace after the colon does not
// match.
func containsTakenOver(payload []byte) bool {
	const takenOverMarker = `"type":"session.taken_over"`

	message := string(payload)
	return strings.Contains(message, takenOverMarker)
}
|
||||
@@ -0,0 +1,44 @@
|
||||
APP_NAME=rap-api
|
||||
APP_ENV=development
|
||||
HTTP_HOST=0.0.0.0
|
||||
HTTP_PORT=8080
|
||||
HTTP_READ_TIMEOUT=15s
|
||||
HTTP_WRITE_TIMEOUT=15s
|
||||
HTTP_IDLE_TIMEOUT=60s
|
||||
HTTP_SHUTDOWN_TIMEOUT=10s
|
||||
POSTGRES_DSN=postgres://rap_user:rap_password@localhost:5432/remote_access_platform?sslmode=disable
|
||||
POSTGRES_MAX_CONNS=20
|
||||
POSTGRES_MIN_CONNS=2
|
||||
POSTGRES_CONNECT_TIMEOUT=5s
|
||||
REDIS_ADDR=localhost:6379
|
||||
REDIS_PASSWORD=
|
||||
REDIS_DB=0
|
||||
REDIS_DIAL_TIMEOUT=5s
|
||||
AUTH_ACCESS_TOKEN_TTL=15m
|
||||
AUTH_REFRESH_TOKEN_TTL=720h
|
||||
AUTH_ISSUER=rap-api
|
||||
AUTH_ACCESS_TOKEN_SECRET=change-me-access-secret
|
||||
AUTH_REFRESH_HASH_SECRET=change-me-refresh-hash-secret
|
||||
DATA_PLANE_TOKEN_TTL=1m
|
||||
DATA_PLANE_TOKEN_PRIVATE_KEY_FILE=
|
||||
DATA_PLANE_TOKEN_PRIVATE_KEY_PEM=
|
||||
DATA_PLANE_BACKEND_GATEWAY_URL=/api/v1/gateway/ws
|
||||
DATA_PLANE_DIRECT_WORKER_WSS_URL_TEMPLATE=
|
||||
DATA_PLANE_DIRECT_WORKER_JSON_RUNTIME=false
|
||||
DATA_PLANE_DIRECT_WORKER_BINARY_RENDER=false
|
||||
DATA_PLANE_DIRECT_WORKER_TLS_TRUST_MODE=smoke_insecure
|
||||
DATA_PLANE_DIRECT_WORKER_TLS_CA_REF=
|
||||
SECRET_ENCRYPTION_KEY_B64=
|
||||
SECRET_ENCRYPTION_KEY_FILE=
|
||||
SECRET_ENCRYPTION_KEY_ID=local-v1
|
||||
SESSION_HEARTBEAT_TTL=90s
|
||||
SESSION_DETACH_GRACE_PERIOD=30m
|
||||
SESSION_ATTACH_TOKEN_TTL=2m
|
||||
SESSION_LIVE_STATE_TTL=2m
|
||||
SESSION_RECOVERY_BATCH_SIZE=100
|
||||
WORKER_LEASE_TTL=45s
|
||||
WORKER_HEARTBEAT_TTL=15s
|
||||
WORKER_STALE_LEASE_GRACE_PERIOD=30s
|
||||
WEBSOCKET_WRITE_TIMEOUT=10s
|
||||
WEBSOCKET_PING_INTERVAL=20s
|
||||
WEBSOCKET_PONG_WAIT=40s
|
||||
@@ -0,0 +1,23 @@
|
||||
module github.com/example/remote-access-platform/backend
|
||||
|
||||
go 1.23.2
|
||||
|
||||
require (
|
||||
github.com/go-chi/chi/v5 v5.2.1
|
||||
github.com/golang-jwt/jwt/v5 v5.2.2
|
||||
github.com/google/uuid v1.6.0
|
||||
github.com/gorilla/websocket v1.5.3
|
||||
github.com/jackc/pgx/v5 v5.7.4
|
||||
github.com/redis/go-redis/v9 v9.8.0
|
||||
golang.org/x/crypto v0.37.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
|
||||
github.com/jackc/pgpassfile v1.0.0 // indirect
|
||||
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
|
||||
github.com/jackc/puddle/v2 v2.2.2 // indirect
|
||||
golang.org/x/sync v0.13.0 // indirect
|
||||
golang.org/x/text v0.24.0 // indirect
|
||||
)
|
||||
@@ -0,0 +1,46 @@
|
||||
github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs=
|
||||
github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c=
|
||||
github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
|
||||
github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
|
||||
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
||||
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
|
||||
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
|
||||
github.com/go-chi/chi/v5 v5.2.1 h1:KOIHODQj58PmL80G2Eak4WdvUzjSJSm0vG72crDCqb8=
|
||||
github.com/go-chi/chi/v5 v5.2.1/go.mod h1:L2yAIGWB3H+phAw1NxKwWM+7eUH/lU8pOMm5hHcoops=
|
||||
github.com/golang-jwt/jwt/v5 v5.2.2 h1:Rl4B7itRWVtYIHFrSNd7vhTiz9UpLdi6gZhZ3wEeDy8=
|
||||
github.com/golang-jwt/jwt/v5 v5.2.2/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
|
||||
github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
|
||||
github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
|
||||
github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
|
||||
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
|
||||
github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM=
|
||||
github.com/jackc/pgx/v5 v5.7.4 h1:9wKznZrhWa2QiHL+NjTSPP6yjl3451BX3imWDnokYlg=
|
||||
github.com/jackc/pgx/v5 v5.7.4/go.mod h1:ncY89UGWxg82EykZUwSpUKEfccBGGYq1xjrOpsbsfGQ=
|
||||
github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo=
|
||||
github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/redis/go-redis/v9 v9.8.0 h1:q3nRvjrlge/6UD7eTu/DSg2uYiU2mCL0G/uzBWqhicI=
|
||||
github.com/redis/go-redis/v9 v9.8.0/go.mod h1:huWgSWd8mW6+m0VPhJjSSQ+d6Nh1VICQ6Q5lHuCH/Iw=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
|
||||
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
|
||||
golang.org/x/crypto v0.37.0 h1:kJNSjF/Xp7kU0iB2Z+9viTPMW4EqqsrywMXLJOOsXSE=
|
||||
golang.org/x/crypto v0.37.0/go.mod h1:vg+k43peMZ0pUMhYmVAWysMK35e6ioLh3wB8ZCAfbVc=
|
||||
golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610=
|
||||
golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
|
||||
golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0=
|
||||
golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
@@ -0,0 +1,19 @@
|
||||
package auth
|
||||
|
||||
import "errors"
|
||||
|
||||
// Sentinel errors for credentials, auth sessions and trusted devices.
// Match them with errors.Is rather than by message.
var (
	ErrInvalidCredentials   = errors.New("invalid credentials")
	ErrInvalidRefreshToken  = errors.New("invalid refresh token")
	ErrAuthSessionRevoked   = errors.New("auth session revoked")
	ErrDeviceRevoked        = errors.New("device revoked")
	ErrDeviceNotTrusted     = errors.New("device not trusted")
	ErrAuthSessionNotFound  = errors.New("auth session not found")
	ErrTrustedDeviceMissing = errors.New("trusted device not found")
)

// Sentinel errors for installation bootstrap and activation.
var (
	ErrInstallationAlreadyBootstrapped = errors.New("installation is already bootstrapped")
	ErrInstallationActivationRequired  = errors.New("signed installation activation is required")
	ErrInvalidInstallationActivation   = errors.New("invalid installation activation")
	ErrInsecureBootstrapDisabled       = errors.New("insecure installation bootstrap is disabled")
	ErrInvalidBootstrapOwner           = errors.New("invalid bootstrap owner")
)
|
||||
@@ -0,0 +1,114 @@
|
||||
package auth
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"time"
|
||||
)
|
||||
|
||||
// DeviceTrustStatus is the string-typed trust state of a Device.
type DeviceTrustStatus string

// Known trust states.
const (
	DeviceTrustStatusPending = DeviceTrustStatus("pending")
	DeviceTrustStatusTrusted = DeviceTrustStatus("trusted")
	DeviceTrustStatusRevoked = DeviceTrustStatus("revoked")
)
|
||||
|
||||
// User is an account as loaded from the users table.
//
// The struct is embedded in JSON responses (AuthResult, BootstrapOwnerResult),
// so PasswordHash is tagged json:"-" to keep the stored hash out of any
// marshalled output. The remaining fields keep their default JSON names.
type User struct {
	ID           string
	Email        string
	PasswordHash string `json:"-"` // never serialized
	MFAEnabled   bool
	CreatedAt    time.Time
	UpdatedAt    time.Time
}
|
||||
|
||||
type Device struct {
|
||||
ID string
|
||||
UserID string
|
||||
Fingerprint string
|
||||
Label string
|
||||
TrustStatus DeviceTrustStatus
|
||||
TrustedAt *time.Time
|
||||
LastSeenAt *time.Time
|
||||
RevokedAt *time.Time
|
||||
RevokedReason *string
|
||||
CreatedAt time.Time
|
||||
UpdatedAt time.Time
|
||||
}
|
||||
|
||||
// AuthSession is a refresh-token session bound to one user and one device.
//
// The struct is embedded in AuthResult, which is written as a JSON response,
// so RefreshTokenHash is tagged json:"-" to keep the stored hash out of any
// marshalled output. The remaining fields keep their default JSON names.
type AuthSession struct {
	ID               string
	UserID           string
	DeviceID         string
	RefreshTokenHash string `json:"-"` // never serialized
	RefreshExpiresAt time.Time
	LastSeenAt       *time.Time
	LastRotatedAt    *time.Time
	RevokedAt        *time.Time
	RevokedReason    *string
	CreatedAt        time.Time
	UpdatedAt        time.Time
}
|
||||
|
||||
// LoginCommand is the JSON body decoded by the login endpoint.
type LoginCommand struct {
	// Account credentials.
	Email    string `json:"email"`
	Password string `json:"password"`

	// Device the login originates from.
	DeviceFingerprint string `json:"device_fingerprint"`
	DeviceLabel       string `json:"device_label"`

	// TrustDevice carries the client's "trust_device" flag.
	TrustDevice bool `json:"trust_device"`
}
|
||||
|
||||
// RefreshCommand is the JSON body decoded by the token refresh endpoint.
type RefreshCommand struct {
	// RefreshToken is the token presented by the client.
	RefreshToken string `json:"refresh_token"`
}
|
||||
|
||||
// BootstrapOwnerCommand is the JSON body decoded by the bootstrap-owner
// endpoint.
type BootstrapOwnerCommand struct {
	// Credentials for the owner account.
	Email    string `json:"email"`
	Password string `json:"password"`

	// ActivationPayload is kept as raw JSON; ActivationSignature is the
	// accompanying signature string.
	ActivationPayload   json.RawMessage `json:"activation_payload"`
	ActivationSignature string          `json:"activation_signature"`
}
|
||||
|
||||
// RevokeAuthSessionCommand is the JSON body decoded by the auth session
// revoke endpoint.
type RevokeAuthSessionCommand struct {
	UserID        string `json:"user_id"`         // owner of the session
	AuthSessionID string `json:"auth_session_id"` // session to revoke
	Reason        string `json:"reason"`          // free-form reason text
}
|
||||
|
||||
// RevokeDeviceCommand identifies a device to revoke and records the reason.
type RevokeDeviceCommand struct {
	UserID   string `json:"user_id"`   // owner of the device
	DeviceID string `json:"device_id"` // device to revoke
	Reason   string `json:"reason"`    // free-form reason text
}
|
||||
|
||||
// TokenPair carries an access token and a refresh token, each with its
// expiry timestamp.
type TokenPair struct {
	// Access token and its expiry.
	AccessToken          string    `json:"access_token"`
	AccessTokenExpiresAt time.Time `json:"access_token_expires_at"`

	// Refresh token and its expiry.
	RefreshToken          string    `json:"refresh_token"`
	RefreshTokenExpiresAt time.Time `json:"refresh_token_expires_at"`
}
|
||||
|
||||
type AuthResult struct {
|
||||
User User `json:"user"`
|
||||
Device Device `json:"device"`
|
||||
AuthSession AuthSession `json:"auth_session"`
|
||||
Tokens TokenPair `json:"tokens"`
|
||||
}
|
||||
|
||||
// InstallationStatus is the JSON view of the installation's bootstrap and
// authority state. Fields tagged omitempty are dropped from the output when
// they hold their zero value.
type InstallationStatus struct {
	// Bootstrap state.
	Bootstrapped           bool       `json:"bootstrapped"`
	AuthorityState         string     `json:"authority_state"`
	InstallID              string     `json:"install_id,omitempty"`
	BootstrappedOwnerEmail string     `json:"bootstrapped_owner_email,omitempty"`
	BootstrappedAt         *time.Time `json:"bootstrapped_at,omitempty"`

	// Authority configuration.
	AuthorityMode            string `json:"authority_mode"`
	StrictAuthority          bool   `json:"strict_authority"`
	RootFingerprint          string `json:"root_fingerprint,omitempty"`
	InsecureBootstrapAllowed bool   `json:"insecure_bootstrap_allowed"`
}
|
||||
|
||||
type BootstrapOwnerResult struct {
|
||||
Installation InstallationStatus `json:"installation"`
|
||||
User User `json:"user"`
|
||||
PlatformRole string `json:"platform_role"`
|
||||
}
|
||||
@@ -0,0 +1,173 @@
|
||||
package auth
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/httpx"
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/module"
|
||||
)
|
||||
|
||||
type Module struct {
|
||||
service *Service
|
||||
}
|
||||
|
||||
func NewModule(deps module.Dependencies, service *Service) *Module {
|
||||
return &Module{service: service}
|
||||
}
|
||||
|
||||
func (m *Module) Name() string {
|
||||
return "auth"
|
||||
}
|
||||
|
||||
func (m *Module) RegisterRoutes(router chi.Router) {
|
||||
router.Route("/installation", func(r chi.Router) {
|
||||
r.Get("/status", m.handleInstallationStatus)
|
||||
r.Post("/bootstrap-owner", m.handleBootstrapOwner)
|
||||
})
|
||||
router.Route("/auth", func(r chi.Router) {
|
||||
r.Post("/login", m.handleLogin)
|
||||
r.Post("/refresh", m.handleRefresh)
|
||||
r.Post("/sessions/revoke", m.handleRevokeAuthSession)
|
||||
r.Get("/devices", m.handleTrustedDevices)
|
||||
r.Post("/devices/{deviceID}/revoke", m.handleRevokeTrustedDevice)
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) handleInstallationStatus(w http.ResponseWriter, r *http.Request) {
|
||||
status, err := m.service.InstallationStatus(r.Context())
|
||||
if err != nil {
|
||||
statusCode, message := m.service.MapError(err)
|
||||
httpx.WriteError(w, statusCode, message)
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{"installation": status})
|
||||
}
|
||||
|
||||
func (m *Module) handleBootstrapOwner(w http.ResponseWriter, r *http.Request) {
|
||||
var cmd BootstrapOwnerCommand
|
||||
if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid installation bootstrap payload")
|
||||
return
|
||||
}
|
||||
result, err := m.service.BootstrapOwner(r.Context(), cmd)
|
||||
if err != nil {
|
||||
status, message := m.service.MapError(err)
|
||||
httpx.WriteError(w, status, message)
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusCreated, result)
|
||||
}
|
||||
|
||||
func (m *Module) handleLogin(w http.ResponseWriter, r *http.Request) {
|
||||
var cmd LoginCommand
|
||||
if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid login payload")
|
||||
return
|
||||
}
|
||||
|
||||
result, err := m.service.Login(r.Context(), cmd)
|
||||
if err != nil {
|
||||
status, message := m.service.MapError(err)
|
||||
httpx.WriteError(w, status, message)
|
||||
return
|
||||
}
|
||||
|
||||
httpx.WriteJSON(w, http.StatusOK, result)
|
||||
}
|
||||
|
||||
func (m *Module) handleRefresh(w http.ResponseWriter, r *http.Request) {
|
||||
var cmd RefreshCommand
|
||||
if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid refresh payload")
|
||||
return
|
||||
}
|
||||
|
||||
result, err := m.service.Refresh(r.Context(), cmd)
|
||||
if err != nil {
|
||||
status, message := m.service.MapError(err)
|
||||
httpx.WriteError(w, status, message)
|
||||
return
|
||||
}
|
||||
|
||||
httpx.WriteJSON(w, http.StatusOK, result)
|
||||
}
|
||||
|
||||
func (m *Module) handleRevokeAuthSession(w http.ResponseWriter, r *http.Request) {
|
||||
var cmd RevokeAuthSessionCommand
|
||||
if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid auth session revoke payload")
|
||||
return
|
||||
}
|
||||
|
||||
if err := m.service.RevokeAuthSession(r.Context(), cmd); err != nil {
|
||||
status, message := m.service.MapError(err)
|
||||
httpx.WriteError(w, status, message)
|
||||
return
|
||||
}
|
||||
|
||||
httpx.WriteJSON(w, http.StatusAccepted, map[string]any{
|
||||
"status": "revoked",
|
||||
"message": httpx.NewMessage(
|
||||
"auth.session.revoked",
|
||||
"status.auth.session.revoked",
|
||||
"Auth session revoked.",
|
||||
nil,
|
||||
"",
|
||||
),
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) handleTrustedDevices(w http.ResponseWriter, r *http.Request) {
|
||||
userID := r.URL.Query().Get("user_id")
|
||||
if userID == "" {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "user_id is required")
|
||||
return
|
||||
}
|
||||
|
||||
devices, err := m.service.ListTrustedDevices(r.Context(), userID)
|
||||
if err != nil {
|
||||
status, message := m.service.MapError(err)
|
||||
httpx.WriteError(w, status, message)
|
||||
return
|
||||
}
|
||||
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{
|
||||
"devices": devices,
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) handleRevokeTrustedDevice(w http.ResponseWriter, r *http.Request) {
|
||||
var payload struct {
|
||||
UserID string `json:"user_id"`
|
||||
Reason string `json:"reason"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid device revoke payload")
|
||||
return
|
||||
}
|
||||
|
||||
err := m.service.RevokeTrustedDevice(r.Context(), RevokeDeviceCommand{
|
||||
UserID: payload.UserID,
|
||||
DeviceID: chi.URLParam(r, "deviceID"),
|
||||
Reason: payload.Reason,
|
||||
})
|
||||
if err != nil {
|
||||
status, message := m.service.MapError(err)
|
||||
httpx.WriteError(w, status, message)
|
||||
return
|
||||
}
|
||||
|
||||
httpx.WriteJSON(w, http.StatusAccepted, map[string]any{
|
||||
"status": "revoked",
|
||||
"message": httpx.NewMessage(
|
||||
"auth.device.revoked",
|
||||
"status.auth.device.revoked",
|
||||
"Trusted device revoked.",
|
||||
nil,
|
||||
"",
|
||||
),
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,525 @@
|
||||
package auth
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/jackc/pgx/v5"
|
||||
"github.com/jackc/pgx/v5/pgxpool"
|
||||
|
||||
postgresplatform "github.com/example/remote-access-platform/backend/internal/platform/postgres"
|
||||
)
|
||||
|
||||
type postgresStore struct {
|
||||
db postgresplatform.DBTX
|
||||
}
|
||||
|
||||
type PostgresTransactor struct {
|
||||
pool *pgxpool.Pool
|
||||
}
|
||||
|
||||
func NewPostgresStore(pool *pgxpool.Pool) Store {
|
||||
return &postgresStore{db: pool}
|
||||
}
|
||||
|
||||
func NewPostgresTransactor(pool *pgxpool.Pool) *PostgresTransactor {
|
||||
return &PostgresTransactor{pool: pool}
|
||||
}
|
||||
|
||||
func (t *PostgresTransactor) WithinTransaction(ctx context.Context, fn func(store Store) error) error {
|
||||
return postgresplatform.WithTransaction(ctx, t.pool, func(tx pgx.Tx) error {
|
||||
return fn(&postgresStore{db: tx})
|
||||
})
|
||||
}
|
||||
|
||||
func (s *postgresStore) Users() UserRepository {
|
||||
return &postgresUserRepository{db: s.db}
|
||||
}
|
||||
|
||||
func (s *postgresStore) Devices() DeviceRepository {
|
||||
return &postgresDeviceRepository{db: s.db}
|
||||
}
|
||||
|
||||
func (s *postgresStore) AuthSessions() AuthSessionRepository {
|
||||
return &postgresAuthSessionRepository{db: s.db}
|
||||
}
|
||||
|
||||
func (s *postgresStore) Installation() InstallationRepository {
|
||||
return &postgresInstallationRepository{db: s.db}
|
||||
}
|
||||
|
||||
type postgresUserRepository struct {
|
||||
db postgresplatform.DBTX
|
||||
}
|
||||
|
||||
type postgresDeviceRepository struct {
|
||||
db postgresplatform.DBTX
|
||||
}
|
||||
|
||||
type postgresAuthSessionRepository struct {
|
||||
db postgresplatform.DBTX
|
||||
}
|
||||
|
||||
type postgresInstallationRepository struct {
|
||||
db postgresplatform.DBTX
|
||||
}
|
||||
|
||||
func (r *postgresUserRepository) GetByEmail(ctx context.Context, email string) (*User, error) {
|
||||
const query = `
|
||||
SELECT id::text, email, password_hash, mfa_enabled, created_at, updated_at
|
||||
FROM users
|
||||
WHERE email = $1
|
||||
`
|
||||
return scanOptionalUser(r.db.QueryRow(ctx, query, email))
|
||||
}
|
||||
|
||||
func (r *postgresUserRepository) GetByID(ctx context.Context, userID string) (*User, error) {
|
||||
const query = `
|
||||
SELECT id::text, email, password_hash, mfa_enabled, created_at, updated_at
|
||||
FROM users
|
||||
WHERE id = $1::uuid
|
||||
`
|
||||
return scanOptionalUser(r.db.QueryRow(ctx, query, userID))
|
||||
}
|
||||
|
||||
func (r *postgresDeviceRepository) Upsert(ctx context.Context, params UpsertDeviceParams) (*Device, error) {
|
||||
const query = `
|
||||
INSERT INTO devices (
|
||||
user_id,
|
||||
device_fingerprint,
|
||||
device_label,
|
||||
trust_status,
|
||||
trusted_at,
|
||||
last_seen_at,
|
||||
created_at,
|
||||
updated_at
|
||||
) VALUES (
|
||||
$1::uuid,
|
||||
$2,
|
||||
$3,
|
||||
CASE WHEN $4 THEN 'trusted' ELSE 'pending' END,
|
||||
CASE WHEN $4 THEN $5::timestamptz ELSE NULL::timestamptz END,
|
||||
$5::timestamptz,
|
||||
$5::timestamptz,
|
||||
$5::timestamptz
|
||||
)
|
||||
ON CONFLICT (user_id, device_fingerprint) DO UPDATE SET
|
||||
device_label = EXCLUDED.device_label,
|
||||
last_seen_at = EXCLUDED.last_seen_at,
|
||||
updated_at = EXCLUDED.updated_at,
|
||||
trust_status = CASE
|
||||
WHEN devices.trust_status = 'revoked' THEN devices.trust_status
|
||||
WHEN devices.trust_status = 'trusted' THEN devices.trust_status
|
||||
WHEN EXCLUDED.trust_status = 'trusted' THEN 'trusted'
|
||||
ELSE devices.trust_status
|
||||
END,
|
||||
trusted_at = CASE
|
||||
WHEN devices.trust_status = 'trusted' THEN devices.trusted_at
|
||||
WHEN EXCLUDED.trust_status = 'trusted' THEN EXCLUDED.trusted_at
|
||||
ELSE devices.trusted_at
|
||||
END
|
||||
RETURNING
|
||||
id::text, user_id::text, device_fingerprint, COALESCE(device_label, ''),
|
||||
trust_status, trusted_at, last_seen_at, revoked_at, revoked_reason, created_at, updated_at
|
||||
`
|
||||
return scanDevice(r.db.QueryRow(ctx, query,
|
||||
params.UserID,
|
||||
params.Fingerprint,
|
||||
params.Label,
|
||||
params.TrustRequested,
|
||||
params.SeenAt,
|
||||
))
|
||||
}
|
||||
|
||||
func (r *postgresDeviceRepository) GetByIDForUser(ctx context.Context, userID, deviceID string) (*Device, error) {
|
||||
const query = `
|
||||
SELECT id::text, user_id::text, device_fingerprint, COALESCE(device_label, ''),
|
||||
trust_status, trusted_at, last_seen_at, revoked_at, revoked_reason, created_at, updated_at
|
||||
FROM devices
|
||||
WHERE id = $1::uuid AND user_id = $2::uuid
|
||||
`
|
||||
device, err := scanDevice(r.db.QueryRow(ctx, query, deviceID, userID))
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
return nil, nil
|
||||
}
|
||||
return device, err
|
||||
}
|
||||
|
||||
func (r *postgresDeviceRepository) ListTrustedByUser(ctx context.Context, userID string) ([]Device, error) {
|
||||
const query = `
|
||||
SELECT id::text, user_id::text, device_fingerprint, COALESCE(device_label, ''),
|
||||
trust_status, trusted_at, last_seen_at, revoked_at, revoked_reason, created_at, updated_at
|
||||
FROM devices
|
||||
WHERE user_id = $1::uuid AND trust_status = 'trusted' AND revoked_at IS NULL
|
||||
ORDER BY created_at DESC
|
||||
`
|
||||
rows, err := r.db.Query(ctx, query, userID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("query trusted devices: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var devices []Device
|
||||
for rows.Next() {
|
||||
device, err := scanDevice(rows)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
devices = append(devices, *device)
|
||||
}
|
||||
return devices, rows.Err()
|
||||
}
|
||||
|
||||
func (r *postgresDeviceRepository) Revoke(ctx context.Context, params RevokeDeviceParams) error {
|
||||
const query = `
|
||||
UPDATE devices
|
||||
SET trust_status = 'revoked',
|
||||
revoked_at = $3,
|
||||
revoked_reason = $4,
|
||||
updated_at = $3
|
||||
WHERE id = $1::uuid AND user_id = $2::uuid
|
||||
`
|
||||
if _, err := r.db.Exec(ctx, query, params.DeviceID, params.UserID, params.RevokedAt, params.Reason); err != nil {
|
||||
return fmt.Errorf("revoke device: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *postgresAuthSessionRepository) Create(ctx context.Context, session AuthSession) error {
|
||||
const query = `
|
||||
INSERT INTO auth_sessions (
|
||||
id,
|
||||
user_id,
|
||||
device_id,
|
||||
refresh_token_hash,
|
||||
refresh_expires_at,
|
||||
last_seen_at,
|
||||
created_at,
|
||||
updated_at
|
||||
) VALUES ($1::uuid, $2::uuid, $3::uuid, $4, $5, $6, $7, $8)
|
||||
`
|
||||
if _, err := r.db.Exec(ctx, query,
|
||||
session.ID,
|
||||
session.UserID,
|
||||
session.DeviceID,
|
||||
session.RefreshTokenHash,
|
||||
session.RefreshExpiresAt,
|
||||
session.LastSeenAt,
|
||||
session.CreatedAt,
|
||||
session.UpdatedAt,
|
||||
); err != nil {
|
||||
return fmt.Errorf("create auth session: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *postgresAuthSessionRepository) GetByID(ctx context.Context, authSessionID string) (*AuthSession, error) {
|
||||
return r.getByID(ctx, authSessionID, "")
|
||||
}
|
||||
|
||||
func (r *postgresAuthSessionRepository) GetByIDForUpdate(ctx context.Context, authSessionID string) (*AuthSession, error) {
|
||||
return r.getByID(ctx, authSessionID, " FOR UPDATE")
|
||||
}
|
||||
|
||||
func (r *postgresAuthSessionRepository) getByID(ctx context.Context, authSessionID string, suffix string) (*AuthSession, error) {
|
||||
query := `
|
||||
SELECT id::text, user_id::text, device_id::text, refresh_token_hash, refresh_expires_at,
|
||||
last_seen_at, last_rotated_at, revoked_at, revoked_reason, created_at, updated_at
|
||||
FROM auth_sessions
|
||||
WHERE id = $1::uuid` + suffix
|
||||
session, err := scanAuthSession(r.db.QueryRow(ctx, query, authSessionID))
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
return nil, nil
|
||||
}
|
||||
return session, err
|
||||
}
|
||||
|
||||
func (r *postgresAuthSessionRepository) Rotate(ctx context.Context, params RotateAuthSessionParams) error {
|
||||
const query = `
|
||||
UPDATE auth_sessions
|
||||
SET refresh_token_hash = $2,
|
||||
refresh_expires_at = $3,
|
||||
last_seen_at = $4,
|
||||
last_rotated_at = $5,
|
||||
updated_at = $5
|
||||
WHERE id = $1::uuid AND revoked_at IS NULL
|
||||
`
|
||||
if _, err := r.db.Exec(ctx, query,
|
||||
params.AuthSessionID,
|
||||
params.RefreshTokenHash,
|
||||
params.RefreshExpiresAt,
|
||||
params.LastSeenAt,
|
||||
params.LastRotatedAt,
|
||||
); err != nil {
|
||||
return fmt.Errorf("rotate auth session: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *postgresAuthSessionRepository) Touch(ctx context.Context, authSessionID string, seenAt time.Time) error {
|
||||
const query = `
|
||||
UPDATE auth_sessions
|
||||
SET last_seen_at = $2, updated_at = $2
|
||||
WHERE id = $1::uuid AND revoked_at IS NULL
|
||||
`
|
||||
if _, err := r.db.Exec(ctx, query, authSessionID, seenAt); err != nil {
|
||||
return fmt.Errorf("touch auth session: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *postgresAuthSessionRepository) Revoke(ctx context.Context, params RevokeAuthSessionParams) error {
|
||||
const query = `
|
||||
UPDATE auth_sessions
|
||||
SET revoked_at = $3,
|
||||
revoked_reason = $4,
|
||||
updated_at = $3
|
||||
WHERE id = $1::uuid AND user_id = $2::uuid AND revoked_at IS NULL
|
||||
`
|
||||
if _, err := r.db.Exec(ctx, query, params.AuthSessionID, params.UserID, params.RevokedAt, params.Reason); err != nil {
|
||||
return fmt.Errorf("revoke auth session: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *postgresAuthSessionRepository) RevokeByDevice(ctx context.Context, userID, deviceID, reason string, revokedAt time.Time) error {
|
||||
const query = `
|
||||
UPDATE auth_sessions
|
||||
SET revoked_at = $3,
|
||||
revoked_reason = $4,
|
||||
updated_at = $3
|
||||
WHERE user_id = $1::uuid AND device_id = $2::uuid AND revoked_at IS NULL
|
||||
`
|
||||
if _, err := r.db.Exec(ctx, query, userID, deviceID, revokedAt, reason); err != nil {
|
||||
return fmt.Errorf("revoke auth sessions by device: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *postgresInstallationRepository) GetStatus(ctx context.Context) (*InstallationAuthorityState, error) {
|
||||
const query = `
|
||||
SELECT install_id, authority_state, product_root_key_fingerprint, bootstrapped_owner_email, bootstrapped_at
|
||||
FROM installation_authority
|
||||
WHERE id = 1
|
||||
`
|
||||
status := &InstallationAuthorityState{}
|
||||
if err := r.db.QueryRow(ctx, query).Scan(
|
||||
&status.InstallID,
|
||||
&status.AuthorityState,
|
||||
&status.ProductRootFingerprint,
|
||||
&status.BootstrappedOwnerEmail,
|
||||
&status.BootstrappedAt,
|
||||
); err != nil {
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
return &InstallationAuthorityState{
|
||||
Bootstrapped: false,
|
||||
AuthorityState: "unbootstrapped",
|
||||
}, nil
|
||||
}
|
||||
return nil, fmt.Errorf("get installation status: %w", err)
|
||||
}
|
||||
status.Bootstrapped = true
|
||||
return status, nil
|
||||
}
|
||||
|
||||
func (r *postgresInstallationRepository) BootstrapOwner(ctx context.Context, params BootstrapOwnerParams) (*User, error) {
|
||||
var existingInstallID string
|
||||
if err := r.db.QueryRow(ctx, `
|
||||
SELECT install_id
|
||||
FROM installation_authority
|
||||
WHERE id = 1
|
||||
FOR UPDATE
|
||||
`).Scan(&existingInstallID); err != nil && !errors.Is(err, pgx.ErrNoRows) {
|
||||
return nil, fmt.Errorf("lock installation authority: %w", err)
|
||||
} else if err == nil {
|
||||
return nil, ErrInstallationAlreadyBootstrapped
|
||||
}
|
||||
|
||||
email := strings.ToLower(strings.TrimSpace(params.Email))
|
||||
now := params.Now.UTC()
|
||||
user, err := scanOptionalUser(r.db.QueryRow(ctx, `
|
||||
INSERT INTO users (email, password_hash, mfa_enabled, platform_role, created_at, updated_at)
|
||||
VALUES ($1, $2, FALSE, $3, $4, $4)
|
||||
ON CONFLICT (email) DO UPDATE SET
|
||||
password_hash = EXCLUDED.password_hash,
|
||||
platform_role = EXCLUDED.platform_role,
|
||||
updated_at = EXCLUDED.updated_at
|
||||
RETURNING id::text, email, password_hash, mfa_enabled, created_at, updated_at
|
||||
`, email, params.PasswordHash, params.Role, now))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("upsert bootstrap owner: %w", err)
|
||||
}
|
||||
if user == nil {
|
||||
return nil, fmt.Errorf("upsert bootstrap owner returned no user")
|
||||
}
|
||||
|
||||
payload := json.RawMessage(`{}`)
|
||||
if len(params.ActivationPayload) > 0 {
|
||||
payload = params.ActivationPayload
|
||||
}
|
||||
if _, err := r.db.Exec(ctx, `
|
||||
INSERT INTO installation_authority (
|
||||
id,
|
||||
install_id,
|
||||
authority_state,
|
||||
product_root_key_fingerprint,
|
||||
activation_payload,
|
||||
activation_signature,
|
||||
bootstrapped_owner_email,
|
||||
bootstrapped_at,
|
||||
created_at,
|
||||
updated_at
|
||||
) VALUES (
|
||||
1,
|
||||
$1,
|
||||
'active',
|
||||
$2,
|
||||
$3::jsonb,
|
||||
$4,
|
||||
$5,
|
||||
$6,
|
||||
$6,
|
||||
$6
|
||||
)
|
||||
`, params.InstallID, params.ProductRootKeyFingerprint, []byte(payload), params.ActivationSignature, email, now); err != nil {
|
||||
return nil, fmt.Errorf("insert installation authority: %w", err)
|
||||
}
|
||||
|
||||
if _, err := r.db.Exec(ctx, `
|
||||
UPDATE platform_role_grants
|
||||
SET revoked_at = $4
|
||||
WHERE user_id = $1::uuid
|
||||
AND role = $2
|
||||
AND install_id = $3
|
||||
AND revoked_at IS NULL
|
||||
`, user.ID, params.Role, params.InstallID, now); err != nil {
|
||||
return nil, fmt.Errorf("revoke superseded platform role grants: %w", err)
|
||||
}
|
||||
if _, err := r.db.Exec(ctx, `
|
||||
INSERT INTO platform_role_grants (
|
||||
user_id,
|
||||
role,
|
||||
install_id,
|
||||
grant_payload,
|
||||
grant_signature,
|
||||
grant_source,
|
||||
granted_at,
|
||||
expires_at,
|
||||
metadata
|
||||
) VALUES (
|
||||
$1::uuid,
|
||||
$2,
|
||||
$3,
|
||||
$4::jsonb,
|
||||
$5,
|
||||
$6,
|
||||
$7,
|
||||
$8,
|
||||
'{"bootstrap_owner":true}'::jsonb
|
||||
)
|
||||
`, user.ID, params.Role, params.InstallID, []byte(payload), params.ActivationSignature, params.GrantSource, now, params.ExpiresAt); err != nil {
|
||||
return nil, fmt.Errorf("insert platform role grant: %w", err)
|
||||
}
|
||||
|
||||
if _, err := r.db.Exec(ctx, `
|
||||
INSERT INTO organization_memberships (
|
||||
organization_id,
|
||||
user_id,
|
||||
role_id,
|
||||
status,
|
||||
invited_by_user_id,
|
||||
created_at,
|
||||
updated_at
|
||||
)
|
||||
SELECT id, $1::uuid, 'org_owner', 'active', $1::uuid, $2, $2
|
||||
FROM organizations
|
||||
WHERE slug = 'default'
|
||||
ON CONFLICT (organization_id, user_id) DO UPDATE SET
|
||||
role_id = 'org_owner',
|
||||
status = 'active',
|
||||
invited_by_user_id = EXCLUDED.invited_by_user_id,
|
||||
updated_at = EXCLUDED.updated_at
|
||||
`, user.ID, now); err != nil {
|
||||
return nil, fmt.Errorf("upsert default organization owner membership: %w", err)
|
||||
}
|
||||
|
||||
return user, nil
|
||||
}
|
||||
|
||||
// scanner is the single Scan method the scan helpers need. The helpers are
// called with both QueryRow results and Query row sets.
type scanner interface {
	// Scan copies the current row's columns into dest, in order.
	Scan(dest ...any) error
}
|
||||
|
||||
func scanOptionalUser(row scanner) (*User, error) {
|
||||
user := &User{}
|
||||
if err := row.Scan(
|
||||
&user.ID,
|
||||
&user.Email,
|
||||
&user.PasswordHash,
|
||||
&user.MFAEnabled,
|
||||
&user.CreatedAt,
|
||||
&user.UpdatedAt,
|
||||
); err != nil {
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, fmt.Errorf("scan user: %w", err)
|
||||
}
|
||||
return user, nil
|
||||
}
|
||||
|
||||
func scanDevice(row scanner) (*Device, error) {
|
||||
device := &Device{}
|
||||
var trustedAt, lastSeenAt, revokedAt *time.Time
|
||||
var revokedReason *string
|
||||
if err := row.Scan(
|
||||
&device.ID,
|
||||
&device.UserID,
|
||||
&device.Fingerprint,
|
||||
&device.Label,
|
||||
&device.TrustStatus,
|
||||
&trustedAt,
|
||||
&lastSeenAt,
|
||||
&revokedAt,
|
||||
&revokedReason,
|
||||
&device.CreatedAt,
|
||||
&device.UpdatedAt,
|
||||
); err != nil {
|
||||
return nil, fmt.Errorf("scan device: %w", err)
|
||||
}
|
||||
device.TrustedAt = trustedAt
|
||||
device.LastSeenAt = lastSeenAt
|
||||
device.RevokedAt = revokedAt
|
||||
device.RevokedReason = revokedReason
|
||||
return device, nil
|
||||
}
|
||||
|
||||
func scanAuthSession(row scanner) (*AuthSession, error) {
|
||||
session := &AuthSession{}
|
||||
var lastSeenAt, lastRotatedAt, revokedAt *time.Time
|
||||
var revokedReason *string
|
||||
if err := row.Scan(
|
||||
&session.ID,
|
||||
&session.UserID,
|
||||
&session.DeviceID,
|
||||
&session.RefreshTokenHash,
|
||||
&session.RefreshExpiresAt,
|
||||
&lastSeenAt,
|
||||
&lastRotatedAt,
|
||||
&revokedAt,
|
||||
&revokedReason,
|
||||
&session.CreatedAt,
|
||||
&session.UpdatedAt,
|
||||
); err != nil {
|
||||
return nil, fmt.Errorf("scan auth session: %w", err)
|
||||
}
|
||||
session.LastSeenAt = lastSeenAt
|
||||
session.LastRotatedAt = lastRotatedAt
|
||||
session.RevokedAt = revokedAt
|
||||
session.RevokedReason = revokedReason
|
||||
return session, nil
|
||||
}
|
||||
@@ -0,0 +1,97 @@
|
||||
package auth
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"time"
|
||||
)
|
||||
|
||||
type UserRepository interface {
|
||||
GetByEmail(ctx context.Context, email string) (*User, error)
|
||||
GetByID(ctx context.Context, userID string) (*User, error)
|
||||
}
|
||||
|
||||
type DeviceRepository interface {
|
||||
Upsert(ctx context.Context, params UpsertDeviceParams) (*Device, error)
|
||||
GetByIDForUser(ctx context.Context, userID, deviceID string) (*Device, error)
|
||||
ListTrustedByUser(ctx context.Context, userID string) ([]Device, error)
|
||||
Revoke(ctx context.Context, params RevokeDeviceParams) error
|
||||
}
|
||||
|
||||
type AuthSessionRepository interface {
|
||||
Create(ctx context.Context, session AuthSession) error
|
||||
GetByID(ctx context.Context, authSessionID string) (*AuthSession, error)
|
||||
GetByIDForUpdate(ctx context.Context, authSessionID string) (*AuthSession, error)
|
||||
Rotate(ctx context.Context, params RotateAuthSessionParams) error
|
||||
Touch(ctx context.Context, authSessionID string, seenAt time.Time) error
|
||||
Revoke(ctx context.Context, params RevokeAuthSessionParams) error
|
||||
RevokeByDevice(ctx context.Context, userID, deviceID, reason string, revokedAt time.Time) error
|
||||
}
|
||||
|
||||
type InstallationRepository interface {
|
||||
GetStatus(ctx context.Context) (*InstallationAuthorityState, error)
|
||||
BootstrapOwner(ctx context.Context, params BootstrapOwnerParams) (*User, error)
|
||||
}
|
||||
|
||||
type Store interface {
|
||||
Users() UserRepository
|
||||
Devices() DeviceRepository
|
||||
AuthSessions() AuthSessionRepository
|
||||
Installation() InstallationRepository
|
||||
}
|
||||
|
||||
type Transactor interface {
|
||||
WithinTransaction(ctx context.Context, fn func(store Store) error) error
|
||||
}
|
||||
|
||||
// UpsertDeviceParams is the input of DeviceRepository.Upsert.
type UpsertDeviceParams struct {
	// UserID is the owner of the device.
	UserID string
	// Fingerprint identifies the device as supplied by the client.
	Fingerprint string
	// Label is the human-readable device name.
	Label string
	// TrustRequested reports whether the client asked for the device to be
	// trusted.
	TrustRequested bool
	// SeenAt is the time of the sign-in that triggered the upsert.
	SeenAt time.Time
}
|
||||
|
||||
// RotateAuthSessionParams is the input of AuthSessionRepository.Rotate.
type RotateAuthSessionParams struct {
	// AuthSessionID selects the session to rotate.
	AuthSessionID string
	// RefreshTokenHash is the hash of the newly issued refresh token.
	RefreshTokenHash string
	// RefreshExpiresAt is the expiry of the newly issued refresh token.
	RefreshExpiresAt time.Time
	// LastSeenAt is the activity timestamp to record.
	LastSeenAt time.Time
	// LastRotatedAt is the rotation timestamp to record.
	LastRotatedAt time.Time
}
|
||||
|
||||
// RevokeAuthSessionParams is the input of AuthSessionRepository.Revoke.
type RevokeAuthSessionParams struct {
	// AuthSessionID selects the session to revoke.
	AuthSessionID string
	// UserID is the owner of the session.
	UserID string
	// Reason is the free-form revocation reason to record.
	Reason string
	// RevokedAt is the revocation timestamp to record.
	RevokedAt time.Time
}
|
||||
|
||||
// RevokeDeviceParams is the input of DeviceRepository.Revoke.
type RevokeDeviceParams struct {
	// UserID is the owner of the device.
	UserID string
	// DeviceID selects the device to revoke.
	DeviceID string
	// Reason is the free-form revocation reason to record.
	Reason string
	// RevokedAt is the revocation timestamp to record.
	RevokedAt time.Time
}
|
||||
|
||||
// InstallationAuthorityState is the persisted bootstrap/authority record
// returned by InstallationRepository.GetStatus.
type InstallationAuthorityState struct {
	// Bootstrapped reports whether an owner has been bootstrapped.
	Bootstrapped bool
	// AuthorityState is the state label; Service substitutes
	// "unbootstrapped" when no record exists.
	AuthorityState string
	// InstallID identifies the installation.
	InstallID string
	// ProductRootFingerprint, when non-empty, overrides the verifier's root
	// fingerprint in the reported installation status.
	ProductRootFingerprint string
	// BootstrappedOwnerEmail is the e-mail of the bootstrapped owner.
	BootstrappedOwnerEmail string
	// BootstrappedAt is the bootstrap time; nil when not recorded.
	BootstrappedAt *time.Time
}
|
||||
|
||||
// BootstrapOwnerParams is the input of InstallationRepository.BootstrapOwner.
type BootstrapOwnerParams struct {
	// Email is the owner's e-mail address.
	Email string
	// PasswordHash is the already-hashed owner password.
	PasswordHash string
	// Role is the platform role granted to the owner.
	Role string
	// InstallID identifies the installation being bootstrapped.
	InstallID string
	// ProductRootKeyFingerprint is the fingerprint of the root key the
	// activation was verified against.
	ProductRootKeyFingerprint string
	// ActivationPayload is the raw activation document.
	ActivationPayload json.RawMessage
	// ActivationSignature is the signature accompanying ActivationPayload.
	ActivationSignature string
	// GrantSource records how the grant was obtained.
	GrantSource string
	// ExpiresAt is the optional expiry of the grant; nil means none was
	// supplied.
	ExpiresAt *time.Time
	// Now is the timestamp to record for the bootstrap.
	Now time.Time
}
|
||||
@@ -0,0 +1,440 @@
|
||||
package auth
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"golang.org/x/crypto/bcrypt"
|
||||
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/authority"
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/module"
|
||||
)
|
||||
|
||||
type Service struct {
|
||||
cfg module.Config
|
||||
store Store
|
||||
transactor Transactor
|
||||
tokenManager *TokenManager
|
||||
authority *authority.Verifier
|
||||
now func() time.Time
|
||||
}
|
||||
|
||||
func NewService(deps module.Dependencies, store Store, transactor Transactor, verifiers ...*authority.Verifier) *Service {
|
||||
var authorityVerifier *authority.Verifier
|
||||
if len(verifiers) > 0 {
|
||||
authorityVerifier = verifiers[0]
|
||||
} else if verifier, err := authority.NewVerifier(deps.Config.Installation); err == nil {
|
||||
authorityVerifier = verifier
|
||||
}
|
||||
return &Service{
|
||||
cfg: deps.Config,
|
||||
store: store,
|
||||
transactor: transactor,
|
||||
tokenManager: NewTokenManager(TokenConfig{
|
||||
Issuer: deps.Config.Auth.Issuer,
|
||||
AccessTokenSecret: deps.Config.Auth.AccessTokenSecret,
|
||||
RefreshHashSecret: deps.Config.Auth.RefreshHashSecret,
|
||||
AccessTokenTTL: deps.Config.Auth.AccessTokenTTL,
|
||||
RefreshTokenTTL: deps.Config.Auth.RefreshTokenTTL,
|
||||
}),
|
||||
authority: authorityVerifier,
|
||||
now: time.Now,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Service) Login(ctx context.Context, cmd LoginCommand) (*AuthResult, error) {
|
||||
user, err := s.store.Users().GetByEmail(ctx, cmd.Email)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if user == nil {
|
||||
return nil, ErrInvalidCredentials
|
||||
}
|
||||
if err := bcrypt.CompareHashAndPassword([]byte(user.PasswordHash), []byte(cmd.Password)); err != nil {
|
||||
return nil, ErrInvalidCredentials
|
||||
}
|
||||
|
||||
var result AuthResult
|
||||
now := s.now().UTC()
|
||||
|
||||
if err := s.transactor.WithinTransaction(ctx, func(store Store) error {
|
||||
device, err := store.Devices().Upsert(ctx, UpsertDeviceParams{
|
||||
UserID: user.ID,
|
||||
Fingerprint: cmd.DeviceFingerprint,
|
||||
Label: cmd.DeviceLabel,
|
||||
TrustRequested: cmd.TrustDevice,
|
||||
SeenAt: now,
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if device.TrustStatus == DeviceTrustStatusRevoked {
|
||||
return ErrDeviceRevoked
|
||||
}
|
||||
|
||||
authSessionID := uuid.NewString()
|
||||
refreshToken, refreshHash, refreshExpiresAt, err := s.tokenManager.IssueRefreshToken(authSessionID, now)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
accessToken, accessExpiresAt, err := s.tokenManager.IssueAccessToken(user.ID, authSessionID, device.ID, now)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
session := AuthSession{
|
||||
ID: authSessionID,
|
||||
UserID: user.ID,
|
||||
DeviceID: device.ID,
|
||||
RefreshTokenHash: refreshHash,
|
||||
RefreshExpiresAt: refreshExpiresAt,
|
||||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
LastSeenAt: &now,
|
||||
}
|
||||
if err := store.AuthSessions().Create(ctx, session); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
result = AuthResult{
|
||||
User: *user,
|
||||
Device: *device,
|
||||
AuthSession: session,
|
||||
Tokens: TokenPair{
|
||||
AccessToken: accessToken,
|
||||
AccessTokenExpiresAt: accessExpiresAt,
|
||||
RefreshToken: refreshToken,
|
||||
RefreshTokenExpiresAt: refreshExpiresAt,
|
||||
},
|
||||
}
|
||||
return nil
|
||||
}); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &result, nil
|
||||
}
|
||||
|
||||
func (s *Service) Refresh(ctx context.Context, cmd RefreshCommand) (*AuthResult, error) {
|
||||
authSessionID, err := s.tokenManager.ParseRefreshToken(cmd.RefreshToken)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var result AuthResult
|
||||
now := s.now().UTC()
|
||||
|
||||
if err := s.transactor.WithinTransaction(ctx, func(store Store) error {
|
||||
session, err := store.AuthSessions().GetByIDForUpdate(ctx, authSessionID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if session == nil {
|
||||
return ErrInvalidRefreshToken
|
||||
}
|
||||
if session.RevokedAt != nil {
|
||||
return ErrAuthSessionRevoked
|
||||
}
|
||||
if now.After(session.RefreshExpiresAt) {
|
||||
if revokeErr := store.AuthSessions().Revoke(ctx, RevokeAuthSessionParams{
|
||||
AuthSessionID: session.ID,
|
||||
UserID: session.UserID,
|
||||
Reason: "refresh_token_expired",
|
||||
RevokedAt: now,
|
||||
}); revokeErr != nil {
|
||||
return revokeErr
|
||||
}
|
||||
return ErrInvalidRefreshToken
|
||||
}
|
||||
|
||||
expectedHash := s.tokenManager.HashRefreshToken(cmd.RefreshToken)
|
||||
if expectedHash != session.RefreshTokenHash {
|
||||
if revokeErr := store.AuthSessions().Revoke(ctx, RevokeAuthSessionParams{
|
||||
AuthSessionID: session.ID,
|
||||
UserID: session.UserID,
|
||||
Reason: "refresh_rotation_reuse_detected",
|
||||
RevokedAt: now,
|
||||
}); revokeErr != nil {
|
||||
return revokeErr
|
||||
}
|
||||
return ErrInvalidRefreshToken
|
||||
}
|
||||
|
||||
user, err := store.Users().GetByID(ctx, session.UserID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if user == nil {
|
||||
return ErrInvalidCredentials
|
||||
}
|
||||
|
||||
device, err := store.Devices().GetByIDForUser(ctx, session.UserID, session.DeviceID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if device == nil {
|
||||
return ErrTrustedDeviceMissing
|
||||
}
|
||||
if device.TrustStatus == DeviceTrustStatusRevoked {
|
||||
return ErrDeviceRevoked
|
||||
}
|
||||
|
||||
refreshToken, refreshHash, refreshExpiresAt, err := s.tokenManager.IssueRefreshToken(session.ID, now)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
accessToken, accessExpiresAt, err := s.tokenManager.IssueAccessToken(user.ID, session.ID, device.ID, now)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := store.AuthSessions().Rotate(ctx, RotateAuthSessionParams{
|
||||
AuthSessionID: session.ID,
|
||||
RefreshTokenHash: refreshHash,
|
||||
RefreshExpiresAt: refreshExpiresAt,
|
||||
LastSeenAt: now,
|
||||
LastRotatedAt: now,
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
result = AuthResult{
|
||||
User: *user,
|
||||
Device: *device,
|
||||
AuthSession: AuthSession{
|
||||
ID: session.ID,
|
||||
UserID: session.UserID,
|
||||
DeviceID: session.DeviceID,
|
||||
RefreshTokenHash: refreshHash,
|
||||
RefreshExpiresAt: refreshExpiresAt,
|
||||
LastSeenAt: &now,
|
||||
LastRotatedAt: &now,
|
||||
},
|
||||
Tokens: TokenPair{
|
||||
AccessToken: accessToken,
|
||||
AccessTokenExpiresAt: accessExpiresAt,
|
||||
RefreshToken: refreshToken,
|
||||
RefreshTokenExpiresAt: refreshExpiresAt,
|
||||
},
|
||||
}
|
||||
return nil
|
||||
}); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &result, nil
|
||||
}
|
||||
|
||||
func (s *Service) InstallationStatus(ctx context.Context) (*InstallationStatus, error) {
|
||||
record, err := s.store.Installation().GetStatus(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return s.installationStatusFromRecord(record), nil
|
||||
}
|
||||
|
||||
func (s *Service) BootstrapOwner(ctx context.Context, cmd BootstrapOwnerCommand) (*BootstrapOwnerResult, error) {
|
||||
email := strings.ToLower(strings.TrimSpace(cmd.Email))
|
||||
password := strings.TrimSpace(cmd.Password)
|
||||
if email == "" || !strings.Contains(email, "@") || len(password) < 12 {
|
||||
return nil, ErrInvalidBootstrapOwner
|
||||
}
|
||||
|
||||
now := s.now().UTC()
|
||||
role := authority.PlatformRoleAdmin
|
||||
installID := ""
|
||||
grantSource := "installation_activation"
|
||||
rootFingerprint := ""
|
||||
activationPayload := cmd.ActivationPayload
|
||||
activationSignature := strings.TrimSpace(cmd.ActivationSignature)
|
||||
var expiresAt *time.Time
|
||||
|
||||
if s.strictAuthority() {
|
||||
if len(activationPayload) == 0 || activationSignature == "" {
|
||||
return nil, ErrInstallationActivationRequired
|
||||
}
|
||||
activation, err := s.authority.VerifyActivation(activationPayload, activationSignature)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%w: %v", ErrInvalidInstallationActivation, err)
|
||||
}
|
||||
if !strings.EqualFold(activation.OwnerEmail, email) {
|
||||
return nil, ErrInvalidInstallationActivation
|
||||
}
|
||||
role = activation.PlatformRole
|
||||
installID = activation.InstallID
|
||||
expiresAt = activation.ExpiresAt
|
||||
rootFingerprint = s.authority.RootFingerprint()
|
||||
} else {
|
||||
if s.authority == nil || !s.authority.AllowInsecureBootstrap() {
|
||||
return nil, ErrInsecureBootstrapDisabled
|
||||
}
|
||||
installID = uuid.NewString()
|
||||
grantSource = "dev_insecure"
|
||||
rootFingerprint = "dev-insecure"
|
||||
devPayload, err := json.Marshal(authority.ActivationPayload{
|
||||
SchemaVersion: authority.ActivationSchemaVersion,
|
||||
InstallID: installID,
|
||||
OwnerEmail: email,
|
||||
PlatformRole: role,
|
||||
IssuedAt: now,
|
||||
Environment: s.cfg.App.Env,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
activationPayload = json.RawMessage(devPayload)
|
||||
activationSignature = "dev-insecure"
|
||||
}
|
||||
|
||||
passwordHash, err := bcrypt.GenerateFromPassword([]byte(password), bcrypt.DefaultCost)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("hash bootstrap owner password: %w", err)
|
||||
}
|
||||
|
||||
var user *User
|
||||
if err := s.transactor.WithinTransaction(ctx, func(store Store) error {
|
||||
created, err := store.Installation().BootstrapOwner(ctx, BootstrapOwnerParams{
|
||||
Email: email,
|
||||
PasswordHash: string(passwordHash),
|
||||
Role: role,
|
||||
InstallID: installID,
|
||||
ProductRootKeyFingerprint: rootFingerprint,
|
||||
ActivationPayload: activationPayload,
|
||||
ActivationSignature: activationSignature,
|
||||
GrantSource: grantSource,
|
||||
ExpiresAt: expiresAt,
|
||||
Now: now,
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
user = created
|
||||
return nil
|
||||
}); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
status, err := s.InstallationStatus(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &BootstrapOwnerResult{
|
||||
Installation: *status,
|
||||
User: *user,
|
||||
PlatformRole: role,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (s *Service) RevokeAuthSession(ctx context.Context, cmd RevokeAuthSessionCommand) error {
|
||||
return s.transactor.WithinTransaction(ctx, func(store Store) error {
|
||||
session, err := store.AuthSessions().GetByIDForUpdate(ctx, cmd.AuthSessionID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if session == nil || session.UserID != cmd.UserID {
|
||||
return ErrAuthSessionNotFound
|
||||
}
|
||||
return store.AuthSessions().Revoke(ctx, RevokeAuthSessionParams{
|
||||
AuthSessionID: cmd.AuthSessionID,
|
||||
UserID: cmd.UserID,
|
||||
Reason: cmd.Reason,
|
||||
RevokedAt: s.now().UTC(),
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
func (s *Service) RevokeTrustedDevice(ctx context.Context, cmd RevokeDeviceCommand) error {
|
||||
return s.transactor.WithinTransaction(ctx, func(store Store) error {
|
||||
device, err := store.Devices().GetByIDForUser(ctx, cmd.UserID, cmd.DeviceID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if device == nil {
|
||||
return ErrTrustedDeviceMissing
|
||||
}
|
||||
if device.TrustStatus != DeviceTrustStatusTrusted {
|
||||
return ErrDeviceNotTrusted
|
||||
}
|
||||
|
||||
now := s.now().UTC()
|
||||
if err := store.Devices().Revoke(ctx, RevokeDeviceParams{
|
||||
UserID: cmd.UserID,
|
||||
DeviceID: cmd.DeviceID,
|
||||
Reason: cmd.Reason,
|
||||
RevokedAt: now,
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
return store.AuthSessions().RevokeByDevice(ctx, cmd.UserID, cmd.DeviceID, "device_revoked:"+cmd.Reason, now)
|
||||
})
|
||||
}
|
||||
|
||||
func (s *Service) ListTrustedDevices(ctx context.Context, userID string) ([]Device, error) {
|
||||
return s.store.Devices().ListTrustedByUser(ctx, userID)
|
||||
}
|
||||
|
||||
func (s *Service) MapError(err error) (int, string) {
|
||||
switch {
|
||||
case err == nil:
|
||||
return 0, ""
|
||||
case errors.Is(err, ErrInvalidCredentials):
|
||||
return 401, "invalid credentials"
|
||||
case errors.Is(err, ErrInvalidRefreshToken):
|
||||
return 401, "invalid refresh token"
|
||||
case errors.Is(err, ErrAuthSessionRevoked):
|
||||
return 401, "auth session revoked"
|
||||
case errors.Is(err, ErrDeviceRevoked):
|
||||
return 403, "device revoked"
|
||||
case errors.Is(err, ErrDeviceNotTrusted):
|
||||
return 409, "device is not trusted"
|
||||
case errors.Is(err, ErrAuthSessionNotFound), errors.Is(err, ErrTrustedDeviceMissing):
|
||||
return 404, err.Error()
|
||||
case errors.Is(err, ErrInstallationActivationRequired), errors.Is(err, ErrInvalidInstallationActivation), errors.Is(err, ErrInvalidBootstrapOwner):
|
||||
return 400, err.Error()
|
||||
case errors.Is(err, ErrInsecureBootstrapDisabled):
|
||||
return 403, err.Error()
|
||||
case errors.Is(err, ErrInstallationAlreadyBootstrapped):
|
||||
return 409, err.Error()
|
||||
default:
|
||||
return 500, fmt.Sprintf("internal error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Service) installationStatusFromRecord(record *InstallationAuthorityState) *InstallationStatus {
|
||||
if record == nil {
|
||||
record = &InstallationAuthorityState{AuthorityState: "unbootstrapped"}
|
||||
}
|
||||
mode := authority.ModeLegacy
|
||||
strict := false
|
||||
rootFingerprint := ""
|
||||
insecureAllowed := false
|
||||
if s.authority != nil {
|
||||
mode = s.authority.Mode()
|
||||
strict = s.authority.Strict()
|
||||
rootFingerprint = s.authority.RootFingerprint()
|
||||
insecureAllowed = s.authority.AllowInsecureBootstrap()
|
||||
}
|
||||
if record.ProductRootFingerprint != "" {
|
||||
rootFingerprint = record.ProductRootFingerprint
|
||||
}
|
||||
return &InstallationStatus{
|
||||
Bootstrapped: record.Bootstrapped,
|
||||
AuthorityState: record.AuthorityState,
|
||||
InstallID: record.InstallID,
|
||||
BootstrappedOwnerEmail: record.BootstrappedOwnerEmail,
|
||||
BootstrappedAt: record.BootstrappedAt,
|
||||
AuthorityMode: mode,
|
||||
StrictAuthority: strict,
|
||||
RootFingerprint: rootFingerprint,
|
||||
InsecureBootstrapAllowed: insecureAllowed,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Service) strictAuthority() bool {
|
||||
return s.authority != nil && s.authority.Strict()
|
||||
}
|
||||
@@ -0,0 +1,95 @@
|
||||
package auth
|
||||
|
||||
import (
|
||||
"crypto/hmac"
|
||||
"crypto/rand"
|
||||
"crypto/sha256"
|
||||
"encoding/base64"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/golang-jwt/jwt/v5"
|
||||
)
|
||||
|
||||
// TokenManager issues access tokens and refresh tokens and computes
// refresh-token hashes.
type TokenManager struct {
	// issuer is written into the access token's issuer claim.
	issuer string
	// accessSecret is the HS256 signing key for access tokens.
	accessSecret []byte
	// refreshHashSecret is the HMAC-SHA256 key used to hash refresh tokens.
	refreshHashSecret []byte
	// accessTTL is the lifetime of an access token.
	accessTTL time.Duration
	// refreshTTL is the lifetime of a refresh token.
	refreshTTL time.Duration
}
|
||||
|
||||
type AccessClaims struct {
|
||||
AuthSessionID string `json:"sid"`
|
||||
DeviceID string `json:"did"`
|
||||
jwt.RegisteredClaims
|
||||
}
|
||||
|
||||
func NewTokenManager(cfg TokenConfig) *TokenManager {
|
||||
return &TokenManager{
|
||||
issuer: cfg.Issuer,
|
||||
accessSecret: []byte(cfg.AccessTokenSecret),
|
||||
refreshHashSecret: []byte(cfg.RefreshHashSecret),
|
||||
accessTTL: cfg.AccessTokenTTL,
|
||||
refreshTTL: cfg.RefreshTokenTTL,
|
||||
}
|
||||
}
|
||||
|
||||
// TokenConfig carries the settings NewTokenManager needs.
type TokenConfig struct {
	// Issuer is the value of the access token's issuer claim.
	Issuer string
	// AccessTokenSecret is the signing secret for access tokens.
	AccessTokenSecret string
	// RefreshHashSecret is the HMAC key for refresh-token hashes.
	RefreshHashSecret string
	// AccessTokenTTL is the lifetime of an access token.
	AccessTokenTTL time.Duration
	// RefreshTokenTTL is the lifetime of a refresh token.
	RefreshTokenTTL time.Duration
}
|
||||
|
||||
func (m *TokenManager) IssueAccessToken(userID, authSessionID, deviceID string, now time.Time) (string, time.Time, error) {
|
||||
expiresAt := now.Add(m.accessTTL)
|
||||
claims := AccessClaims{
|
||||
AuthSessionID: authSessionID,
|
||||
DeviceID: deviceID,
|
||||
RegisteredClaims: jwt.RegisteredClaims{
|
||||
Issuer: m.issuer,
|
||||
Subject: userID,
|
||||
ExpiresAt: jwt.NewNumericDate(expiresAt),
|
||||
IssuedAt: jwt.NewNumericDate(now),
|
||||
NotBefore: jwt.NewNumericDate(now),
|
||||
},
|
||||
}
|
||||
|
||||
token := jwt.NewWithClaims(jwt.SigningMethodHS256, claims)
|
||||
signed, err := token.SignedString(m.accessSecret)
|
||||
if err != nil {
|
||||
return "", time.Time{}, fmt.Errorf("sign access token: %w", err)
|
||||
}
|
||||
|
||||
return signed, expiresAt, nil
|
||||
}
|
||||
|
||||
func (m *TokenManager) IssueRefreshToken(authSessionID string, now time.Time) (raw string, hash string, expiresAt time.Time, err error) {
|
||||
secret := make([]byte, 32)
|
||||
if _, err = rand.Read(secret); err != nil {
|
||||
return "", "", time.Time{}, fmt.Errorf("read random refresh secret: %w", err)
|
||||
}
|
||||
|
||||
encodedSecret := base64.RawURLEncoding.EncodeToString(secret)
|
||||
raw = authSessionID + "." + encodedSecret
|
||||
hash = m.HashRefreshToken(raw)
|
||||
expiresAt = now.Add(m.refreshTTL)
|
||||
return raw, hash, expiresAt, nil
|
||||
}
|
||||
|
||||
func (m *TokenManager) HashRefreshToken(token string) string {
|
||||
mac := hmac.New(sha256.New, m.refreshHashSecret)
|
||||
_, _ = mac.Write([]byte(token))
|
||||
return base64.RawURLEncoding.EncodeToString(mac.Sum(nil))
|
||||
}
|
||||
|
||||
func (m *TokenManager) ParseRefreshToken(token string) (string, error) {
|
||||
sessionID, _, ok := strings.Cut(token, ".")
|
||||
if !ok || sessionID == "" {
|
||||
return "", ErrInvalidRefreshToken
|
||||
}
|
||||
return sessionID, nil
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,34 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestMeshLatestObservationKeySeparatesRouteHealthByRoute(t *testing.T) {
|
||||
key := meshLatestObservationKey(json.RawMessage(`{
|
||||
"observation_type":"synthetic_route_health",
|
||||
"route_id":"route-1"
|
||||
}`))
|
||||
if key != "synthetic_route_health:route-1" {
|
||||
t.Fatalf("key = %q", key)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshLatestObservationKeySeparatesConnectionManagerMode(t *testing.T) {
|
||||
key := meshLatestObservationKey(json.RawMessage(`{
|
||||
"observation_type":"peer_connection_manager",
|
||||
"transport_mode":"relay_control",
|
||||
"relay_node_id":"node-r"
|
||||
}`))
|
||||
if key != "peer_connection_manager:relay_control:node-r" {
|
||||
t.Fatalf("key = %q", key)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMeshLatestObservationKeyDefaults(t *testing.T) {
|
||||
key := meshLatestObservationKey(json.RawMessage(`{}`))
|
||||
if key != "default" {
|
||||
t.Fatalf("key = %q", key)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,91 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"time"
|
||||
)
|
||||
|
||||
type Repository interface {
|
||||
GetPlatformRole(ctx context.Context, userID string) (string, error)
|
||||
|
||||
ListClusters(ctx context.Context) ([]Cluster, error)
|
||||
GetCluster(ctx context.Context, clusterID string) (Cluster, error)
|
||||
CreateCluster(ctx context.Context, input CreateClusterInput) (Cluster, error)
|
||||
UpdateCluster(ctx context.Context, input UpdateClusterInput) (Cluster, error)
|
||||
GetClusterAuthority(ctx context.Context, clusterID string) (ClusterAuthorityKey, error)
|
||||
EnsureClusterAuthority(ctx context.Context, clusterID string, actorUserID *string) (ClusterAuthorityKey, error)
|
||||
|
||||
ListClusterNodes(ctx context.Context, clusterID string) ([]ClusterNode, error)
|
||||
ListNodeGroups(ctx context.Context, clusterID string) ([]ClusterNodeGroup, error)
|
||||
CreateNodeGroup(ctx context.Context, input CreateNodeGroupInput) (ClusterNodeGroup, error)
|
||||
AssignNodeToGroup(ctx context.Context, input AssignNodeGroupInput) (ClusterNode, error)
|
||||
|
||||
CreateJoinToken(ctx context.Context, input CreateJoinTokenInput, tokenHash string) (NodeJoinToken, error)
|
||||
SetJoinTokenAuthority(ctx context.Context, clusterID, tokenID string, payload json.RawMessage, signature ClusterSignature) (NodeJoinToken, error)
|
||||
GetValidJoinTokenByHash(ctx context.Context, clusterID, tokenHash string) (NodeJoinToken, error)
|
||||
RevokeJoinToken(ctx context.Context, input RevokeJoinTokenInput) (NodeJoinToken, error)
|
||||
ExpireJoinTokens(ctx context.Context, clusterID string) error
|
||||
|
||||
CreateJoinRequest(ctx context.Context, input CreateJoinRequestInput, joinTokenID string) (NodeJoinRequest, error)
|
||||
GetJoinRequestForBootstrap(ctx context.Context, input GetJoinRequestBootstrapInput) (NodeJoinRequest, error)
|
||||
ListJoinRequests(ctx context.Context, clusterID string) ([]NodeJoinRequest, error)
|
||||
ApproveJoinRequest(ctx context.Context, input ApproveJoinRequestInput) (ApprovedJoinRequest, error)
|
||||
SetJoinRequestApprovalAuthority(ctx context.Context, clusterID, joinRequestID string, payload json.RawMessage, signature ClusterSignature) (NodeJoinRequest, error)
|
||||
RejectJoinRequest(ctx context.Context, input RejectJoinRequestInput) (NodeJoinRequest, error)
|
||||
|
||||
AssignNodeRole(ctx context.Context, input AssignNodeRoleInput) (NodeRoleAssignment, error)
|
||||
ListNodeRoleAssignments(ctx context.Context, clusterID, nodeID string) ([]NodeRoleAssignment, error)
|
||||
AttachExistingNodeToCluster(ctx context.Context, input AttachExistingNodeInput) (ClusterNode, error)
|
||||
|
||||
RecordHeartbeat(ctx context.Context, input RecordHeartbeatInput) (NodeHeartbeat, error)
|
||||
ListNodeHeartbeats(ctx context.Context, clusterID, nodeID string, limit int) ([]NodeHeartbeat, error)
|
||||
RevokeNodeIdentity(ctx context.Context, input RevokeNodeIdentityInput) error
|
||||
DisableClusterMembership(ctx context.Context, input DisableMembershipInput) error
|
||||
UpsertFabricTestingFlag(ctx context.Context, input UpsertFabricTestingFlagInput) (FabricTestingFlag, error)
|
||||
ListFabricTestingFlags(ctx context.Context) ([]FabricTestingFlag, error)
|
||||
GetEffectiveNodeTestingFlags(ctx context.Context, clusterID, nodeID string) (EffectiveNodeTestingFlags, error)
|
||||
RecordNodeTelemetry(ctx context.Context, input RecordNodeTelemetryInput) (NodeTelemetryObservation, error)
|
||||
ListNodeTelemetry(ctx context.Context, clusterID, nodeID string, limit int) ([]NodeTelemetryObservation, error)
|
||||
SetDesiredWorkload(ctx context.Context, input SetDesiredWorkloadInput) (NodeWorkloadDesiredState, error)
|
||||
ListDesiredWorkloads(ctx context.Context, clusterID, nodeID string) ([]NodeWorkloadDesiredState, error)
|
||||
ReportWorkloadStatus(ctx context.Context, input ReportWorkloadStatusInput) (NodeWorkloadStatus, error)
|
||||
ListLatestWorkloadStatuses(ctx context.Context, clusterID, nodeID string) ([]NodeWorkloadStatus, error)
|
||||
ReportMeshLink(ctx context.Context, input ReportMeshLinkInput) (MeshLinkObservation, error)
|
||||
ListMeshLinks(ctx context.Context, clusterID string) ([]MeshLinkObservation, error)
|
||||
CreateRouteIntent(ctx context.Context, input CreateRouteIntentInput) (MeshRouteIntent, error)
|
||||
ListRouteIntents(ctx context.Context, clusterID string) ([]MeshRouteIntent, error)
|
||||
ListQoSPolicies(ctx context.Context, clusterID string) ([]MeshQoSPolicy, error)
|
||||
ListFabricEntryPoints(ctx context.Context, clusterID string) ([]FabricEntryPoint, error)
|
||||
CreateFabricEntryPoint(ctx context.Context, input CreateFabricEntryPointInput) (FabricEntryPoint, error)
|
||||
SetFabricEntryPointNode(ctx context.Context, input SetFabricEntryPointNodeInput) (FabricEntryPointNode, error)
|
||||
ListFabricEntryPointNodes(ctx context.Context, clusterID, entryPointID string) ([]FabricEntryPointNode, error)
|
||||
ListFabricEgressPools(ctx context.Context, clusterID string) ([]FabricEgressPool, error)
|
||||
CreateFabricEgressPool(ctx context.Context, input CreateFabricEgressPoolInput) (FabricEgressPool, error)
|
||||
SetFabricEgressPoolNode(ctx context.Context, input SetFabricEgressPoolNodeInput) (FabricEgressPoolNode, error)
|
||||
ListFabricEgressPoolNodes(ctx context.Context, clusterID, egressPoolID string) ([]FabricEgressPoolNode, error)
|
||||
GetClusterAuthorityState(ctx context.Context, clusterID string) (ClusterAuthorityState, error)
|
||||
UpdateClusterAuthorityState(ctx context.Context, input UpdateClusterAuthorityInput) (ClusterAuthorityState, error)
|
||||
ListClusterAdminSummaries(ctx context.Context) ([]ClusterAdminSummary, error)
|
||||
|
||||
CreateVPNConnection(ctx context.Context, input CreateVPNConnectionInput) (VPNConnection, error)
|
||||
ListVPNConnections(ctx context.Context, clusterID string) ([]VPNConnection, error)
|
||||
GetVPNConnection(ctx context.Context, clusterID, vpnConnectionID string) (VPNConnection, error)
|
||||
UpdateVPNConnectionDesiredState(ctx context.Context, input UpdateVPNConnectionDesiredStateInput) (VPNConnection, error)
|
||||
UpsertVPNConnectionRoutePolicy(ctx context.Context, input UpsertVPNConnectionRoutePolicyInput) (VPNConnectionRoutePolicy, error)
|
||||
ListVPNConnectionRoutePolicies(ctx context.Context, clusterID, vpnConnectionID string) ([]VPNConnectionRoutePolicy, error)
|
||||
SetVPNConnectionAllowedNodes(ctx context.Context, input SetVPNConnectionAllowedNodesInput) ([]VPNConnectionAllowedNode, error)
|
||||
ListVPNConnectionAllowedNodes(ctx context.Context, clusterID, vpnConnectionID string) ([]VPNConnectionAllowedNode, error)
|
||||
AcquireVPNConnectionLease(ctx context.Context, input AcquireVPNConnectionLeaseInput, expiresAt time.Time, fencingToken string) (VPNConnectionLease, error)
|
||||
RenewVPNConnectionLease(ctx context.Context, input RenewVPNConnectionLeaseInput, expiresAt time.Time) (VPNConnectionLease, error)
|
||||
ReleaseVPNConnectionLease(ctx context.Context, input ReleaseVPNConnectionLeaseInput) (VPNConnectionLease, error)
|
||||
FenceVPNConnectionLease(ctx context.Context, input FenceVPNConnectionLeaseInput) (VPNConnectionLease, error)
|
||||
GetActiveVPNConnectionLease(ctx context.Context, clusterID, vpnConnectionID string) (VPNConnectionLease, error)
|
||||
CheckVPNLeaseOwnerEligibility(ctx context.Context, clusterID, vpnConnectionID, ownerNodeID string) (VPNLeaseOwnerEligibility, error)
|
||||
ExpireStaleVPNConnectionLeases(ctx context.Context, clusterID string, now time.Time) ([]VPNConnectionLease, error)
|
||||
ListNodeVPNAssignments(ctx context.Context, clusterID, nodeID string) ([]NodeVPNAssignment, error)
|
||||
ReportNodeVPNAssignmentStatus(ctx context.Context, input ReportNodeVPNAssignmentStatusInput) (NodeVPNAssignmentStatus, error)
|
||||
|
||||
RecordAudit(ctx context.Context, event ClusterAuditEvent) error
|
||||
ListAuditEvents(ctx context.Context, clusterID string, limit int) ([]ClusterAuditEvent, error)
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,43 @@
|
||||
package cluster
|
||||
|
||||
import (
|
||||
"crypto/rand"
|
||||
"crypto/sha256"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
"errors"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// joinTokenHashPrefix is prepended to the hex digest produced by
// hashJoinToken and names the hash algorithm used.
const joinTokenHashPrefix = "sha256:"
|
||||
|
||||
// generateJoinToken returns a fresh join token: the literal prefix
// "rap_join_" followed by 32 random bytes in unpadded URL-safe base64.
// A failure to read random bytes is returned unchanged.
func generateJoinToken() (string, error) {
	entropy := make([]byte, 32)
	if _, err := rand.Read(entropy); err != nil {
		return "", err
	}
	encoded := base64.RawURLEncoding.EncodeToString(entropy)
	return "rap_join_" + encoded, nil
}
|
||||
|
||||
func hashJoinToken(token string) (string, error) {
|
||||
trimmed := strings.TrimSpace(token)
|
||||
if trimmed == "" {
|
||||
return "", errors.New("join token is required")
|
||||
}
|
||||
sum := sha256.Sum256([]byte(trimmed))
|
||||
return joinTokenHashPrefix + hex.EncodeToString(sum[:]), nil
|
||||
}
|
||||
|
||||
func isPlatformAdminRole(role string) bool {
|
||||
return role == PlatformRoleAdmin || role == PlatformRoleRecoveryAdmin
|
||||
}
|
||||
|
||||
func isAllowedNodeRole(role string) bool {
|
||||
_, ok := allowedNodeRoles[role]
|
||||
return ok
|
||||
}
|
||||
|
||||
// defaultJoinTokenExpiry returns the instant 30 minutes after now.
func defaultJoinTokenExpiry(now time.Time) time.Time {
	const lifetime = 30 * time.Minute
	return now.Add(lifetime)
}
|
||||
@@ -0,0 +1,344 @@
|
||||
package identitysource
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
"github.com/google/uuid"
|
||||
"github.com/jackc/pgx/v5"
|
||||
"github.com/jackc/pgx/v5/pgxpool"
|
||||
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/httpx"
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/module"
|
||||
)
|
||||
|
||||
type Module struct {
|
||||
db *pgxpool.Pool
|
||||
}
|
||||
|
||||
// IdentitySource is the JSON shape of one identity_sources row as returned by
// the module's handlers.
type IdentitySource struct {
	ID             string `json:"id"`
	OrganizationID string `json:"organization_id"`
	Kind           string `json:"kind"`
	Name           string `json:"name"`
	Status         string `json:"status"`

	// Config is emitted verbatim as raw JSON.
	Config json.RawMessage `json:"config"`

	CreatedAt time.Time `json:"created_at"`
	UpdatedAt time.Time `json:"updated_at"`
}
|
||||
|
||||
// IdentityMapping is the JSON shape of one identity_mappings row, which belongs
// to the identity source named by IdentitySourceID.
type IdentityMapping struct {
	ID               string `json:"id"`
	IdentitySourceID string `json:"identity_source_id"`
	MappingType      string `json:"mapping_type"`

	// ExternalSelector and InternalTarget are emitted verbatim as raw JSON.
	ExternalSelector json.RawMessage `json:"external_selector"`
	InternalTarget   json.RawMessage `json:"internal_target"`

	CreatedAt time.Time `json:"created_at"`
	UpdatedAt time.Time `json:"updated_at"`
}
|
||||
|
||||
// upsertIdentitySourceRequest is the JSON body accepted by both the create and
// the update endpoint.
type upsertIdentitySourceRequest struct {
	ActorUserID    string `json:"actor_user_id"`
	OrganizationID string `json:"organization_id"`
	Kind           string `json:"kind"`
	Name           string `json:"name"`
	Status         string `json:"status"`

	// Config is kept as raw JSON.
	Config json.RawMessage `json:"config"`

	// IdentityMappings uses an anonymous element type that must stay identical
	// to the parameter type of upsertMappings, which receives this slice.
	IdentityMappings []struct {
		MappingType      string          `json:"mapping_type"`
		ExternalSelector json.RawMessage `json:"external_selector"`
		InternalTarget   json.RawMessage `json:"internal_target"`
	} `json:"identity_mappings"`
}
|
||||
|
||||
func NewModule(deps module.Dependencies) *Module {
|
||||
return &Module{db: deps.Infra.DB}
|
||||
}
|
||||
|
||||
func (m *Module) Name() string {
|
||||
return "identitysource"
|
||||
}
|
||||
|
||||
func (m *Module) RegisterRoutes(router chi.Router) {
|
||||
router.Route("/identity-sources", func(r chi.Router) {
|
||||
r.Get("/", m.listIdentitySources)
|
||||
r.Post("/", m.createIdentitySource)
|
||||
r.Get("/{identitySourceID}", m.getIdentitySource)
|
||||
r.Put("/{identitySourceID}", m.updateIdentitySource)
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) listIdentitySources(w http.ResponseWriter, r *http.Request) {
|
||||
orgID := r.URL.Query().Get("organization_id")
|
||||
if orgID == "" {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "organization_id is required")
|
||||
return
|
||||
}
|
||||
rows, err := m.db.Query(r.Context(), `
|
||||
SELECT id, organization_id, kind, name, status, config, created_at, updated_at
|
||||
FROM identity_sources
|
||||
WHERE organization_id = $1
|
||||
ORDER BY created_at DESC
|
||||
`, orgID)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
defer rows.Close()
|
||||
var items []IdentitySource
|
||||
for rows.Next() {
|
||||
item, err := scanIdentitySource(rows)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
items = append(items, item)
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{"identity_sources": items})
|
||||
}
|
||||
|
||||
func (m *Module) getIdentitySource(w http.ResponseWriter, r *http.Request) {
|
||||
id := chi.URLParam(r, "identitySourceID")
|
||||
item, err := m.getByID(r.Context(), id)
|
||||
if err != nil {
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
httpx.WriteError(w, http.StatusNotFound, "identity source not found")
|
||||
return
|
||||
}
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
mappings, err := m.listMappings(r.Context(), id)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{
|
||||
"identity_source": item,
|
||||
"identity_mappings": mappings,
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) createIdentitySource(w http.ResponseWriter, r *http.Request) {
|
||||
req, err := decodeRequest(r)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
item := IdentitySource{
|
||||
ID: uuid.NewString(),
|
||||
OrganizationID: req.OrganizationID,
|
||||
Kind: req.Kind,
|
||||
Name: req.Name,
|
||||
Status: req.Status,
|
||||
Config: req.Config,
|
||||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
}
|
||||
tx, err := m.db.Begin(r.Context())
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
defer tx.Rollback(r.Context())
|
||||
if _, err := tx.Exec(r.Context(), `
|
||||
INSERT INTO identity_sources (id, organization_id, kind, name, status, config, created_at, updated_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6::jsonb, $7, $8)
|
||||
`, item.ID, item.OrganizationID, item.Kind, item.Name, item.Status, []byte(item.Config), item.CreatedAt, item.UpdatedAt); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
mappings, err := upsertMappings(r.Context(), tx, item.ID, req.IdentityMappings)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if err := tx.Commit(r.Context()); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusCreated, map[string]any{
|
||||
"identity_source": item,
|
||||
"identity_mappings": mappings,
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) updateIdentitySource(w http.ResponseWriter, r *http.Request) {
|
||||
req, err := decodeRequest(r)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
id := chi.URLParam(r, "identitySourceID")
|
||||
tx, err := m.db.Begin(r.Context())
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
defer tx.Rollback(r.Context())
|
||||
tag, err := tx.Exec(r.Context(), `
|
||||
UPDATE identity_sources
|
||||
SET organization_id = $2, kind = $3, name = $4, status = $5, config = $6::jsonb, updated_at = $7
|
||||
WHERE id = $1
|
||||
`, id, req.OrganizationID, req.Kind, req.Name, req.Status, []byte(req.Config), time.Now().UTC())
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if tag.RowsAffected() == 0 {
|
||||
httpx.WriteError(w, http.StatusNotFound, "identity source not found")
|
||||
return
|
||||
}
|
||||
if _, err := tx.Exec(r.Context(), `DELETE FROM identity_mappings WHERE identity_source_id = $1`, id); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
mappings, err := upsertMappings(r.Context(), tx, id, req.IdentityMappings)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if err := tx.Commit(r.Context()); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
item, err := m.getByID(r.Context(), id)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{
|
||||
"identity_source": item,
|
||||
"identity_mappings": mappings,
|
||||
})
|
||||
}
|
||||
|
||||
func decodeRequest(r *http.Request) (*upsertIdentitySourceRequest, error) {
|
||||
var req upsertIdentitySourceRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
return nil, errors.New("invalid identity source payload")
|
||||
}
|
||||
if req.ActorUserID == "" || req.OrganizationID == "" || req.Kind == "" || req.Name == "" {
|
||||
return nil, errors.New("actor_user_id, organization_id, kind, and name are required")
|
||||
}
|
||||
if req.Status == "" {
|
||||
req.Status = "active"
|
||||
}
|
||||
if len(req.Config) == 0 {
|
||||
req.Config = json.RawMessage(`{}`)
|
||||
}
|
||||
if !json.Valid(req.Config) {
|
||||
return nil, errors.New("config must be valid json")
|
||||
}
|
||||
for _, mapping := range req.IdentityMappings {
|
||||
if len(mapping.ExternalSelector) == 0 {
|
||||
mapping.ExternalSelector = json.RawMessage(`{}`)
|
||||
}
|
||||
if len(mapping.InternalTarget) == 0 {
|
||||
mapping.InternalTarget = json.RawMessage(`{}`)
|
||||
}
|
||||
}
|
||||
return &req, nil
|
||||
}
|
||||
|
||||
func (m *Module) getByID(ctx context.Context, id string) (IdentitySource, error) {
|
||||
row := m.db.QueryRow(ctx, `
|
||||
SELECT id, organization_id, kind, name, status, config, created_at, updated_at
|
||||
FROM identity_sources
|
||||
WHERE id = $1
|
||||
`, id)
|
||||
return scanIdentitySource(row)
|
||||
}
|
||||
|
||||
func (m *Module) listMappings(ctx context.Context, sourceID string) ([]IdentityMapping, error) {
|
||||
rows, err := m.db.Query(ctx, `
|
||||
SELECT id, identity_source_id, mapping_type, external_selector, internal_target, created_at, updated_at
|
||||
FROM identity_mappings
|
||||
WHERE identity_source_id = $1
|
||||
ORDER BY created_at ASC
|
||||
`, sourceID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var mappings []IdentityMapping
|
||||
for rows.Next() {
|
||||
item, err := scanIdentityMapping(rows)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
mappings = append(mappings, item)
|
||||
}
|
||||
return mappings, rows.Err()
|
||||
}
|
||||
|
||||
func upsertMappings(ctx context.Context, tx pgx.Tx, sourceID string, requested []struct {
|
||||
MappingType string `json:"mapping_type"`
|
||||
ExternalSelector json.RawMessage `json:"external_selector"`
|
||||
InternalTarget json.RawMessage `json:"internal_target"`
|
||||
}) ([]IdentityMapping, error) {
|
||||
now := time.Now().UTC()
|
||||
items := make([]IdentityMapping, 0, len(requested))
|
||||
for _, mapping := range requested {
|
||||
external := mapping.ExternalSelector
|
||||
if len(external) == 0 {
|
||||
external = json.RawMessage(`{}`)
|
||||
}
|
||||
internal := mapping.InternalTarget
|
||||
if len(internal) == 0 {
|
||||
internal = json.RawMessage(`{}`)
|
||||
}
|
||||
item := IdentityMapping{
|
||||
ID: uuid.NewString(),
|
||||
IdentitySourceID: sourceID,
|
||||
MappingType: mapping.MappingType,
|
||||
ExternalSelector: external,
|
||||
InternalTarget: internal,
|
||||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
}
|
||||
if _, err := tx.Exec(ctx, `
|
||||
INSERT INTO identity_mappings (
|
||||
id, identity_source_id, mapping_type, external_selector, internal_target, created_at, updated_at
|
||||
) VALUES ($1, $2, $3, $4::jsonb, $5::jsonb, $6, $7)
|
||||
`, item.ID, item.IdentitySourceID, item.MappingType, []byte(item.ExternalSelector), []byte(item.InternalTarget), item.CreatedAt, item.UpdatedAt); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
items = append(items, item)
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
// rowScanner is the Scan method shared by the single-row and multi-row query
// results in this file, so one scan helper can serve both.
type rowScanner interface {
	// Scan copies the current row's columns into dest.
	Scan(dest ...any) error
}
|
||||
|
||||
func scanIdentitySource(row rowScanner) (IdentitySource, error) {
|
||||
var item IdentitySource
|
||||
if err := row.Scan(&item.ID, &item.OrganizationID, &item.Kind, &item.Name, &item.Status, &item.Config, &item.CreatedAt, &item.UpdatedAt); err != nil {
|
||||
return IdentitySource{}, err
|
||||
}
|
||||
if len(item.Config) == 0 {
|
||||
item.Config = json.RawMessage(`{}`)
|
||||
}
|
||||
return item, nil
|
||||
}
|
||||
|
||||
func scanIdentityMapping(row rowScanner) (IdentityMapping, error) {
|
||||
var item IdentityMapping
|
||||
if err := row.Scan(&item.ID, &item.IdentitySourceID, &item.MappingType, &item.ExternalSelector, &item.InternalTarget, &item.CreatedAt, &item.UpdatedAt); err != nil {
|
||||
return IdentityMapping{}, err
|
||||
}
|
||||
if len(item.ExternalSelector) == 0 {
|
||||
item.ExternalSelector = json.RawMessage(`{}`)
|
||||
}
|
||||
if len(item.InternalTarget) == 0 {
|
||||
item.InternalTarget = json.RawMessage(`{}`)
|
||||
}
|
||||
return item, nil
|
||||
}
|
||||
@@ -0,0 +1,458 @@
|
||||
package node
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
"github.com/google/uuid"
|
||||
"github.com/jackc/pgx/v5"
|
||||
"github.com/jackc/pgx/v5/pgxpool"
|
||||
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/httpx"
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/module"
|
||||
)
|
||||
|
||||
type Module struct {
|
||||
db *pgxpool.Pool
|
||||
}
|
||||
|
||||
// Node is the JSON shape of one nodes row. Pointer fields hold nullable
// columns and are left out of the JSON when nil.
type Node struct {
	ID                  string  `json:"id"`
	OwnerOrganizationID *string `json:"owner_organization_id,omitempty"`
	NodeKey             string  `json:"node_key"`
	Name                string  `json:"name"`
	OwnershipType       string  `json:"ownership_type"`

	// Status columns; createNode seeds them with "pending", "unknown",
	// "unknown" and "healthy" respectively.
	RegistrationStatus string `json:"registration_status"`
	HealthStatus       string `json:"health_status"`
	VersionState       string `json:"version_state"`
	PartitionState     string `json:"partition_state"`

	DesiredVersion  *string    `json:"desired_version,omitempty"`
	ReportedVersion *string    `json:"reported_version,omitempty"`
	LastSeenAt      *time.Time `json:"last_seen_at,omitempty"`

	// Metadata is emitted verbatim as raw JSON.
	Metadata json.RawMessage `json:"metadata"`

	CreatedAt time.Time `json:"created_at"`
	UpdatedAt time.Time `json:"updated_at"`
}
|
||||
|
||||
// NodeCapability is the JSON shape of one node_capabilities row: a named
// capability of a node and its raw JSON value.
type NodeCapability struct {
	NodeID     string `json:"node_id"`
	Capability string `json:"capability"`

	// Value is emitted verbatim as raw JSON.
	Value json.RawMessage `json:"value"`

	UpdatedAt time.Time `json:"updated_at"`
}
|
||||
|
||||
// NodeService is the JSON shape of one node_services row: a service type on a
// node with its desired and reported state.
type NodeService struct {
	NodeID      string `json:"node_id"`
	ServiceType string `json:"service_type"`
	Enabled     bool   `json:"enabled"`

	DesiredState  string `json:"desired_state"`
	ReportedState string `json:"reported_state"`
	// LastReportedAt is nil, and omitted from JSON, when the column is NULL.
	LastReportedAt *time.Time `json:"last_reported_at,omitempty"`

	// Metadata is emitted verbatim as raw JSON.
	Metadata json.RawMessage `json:"metadata"`

	UpdatedAt time.Time `json:"updated_at"`
}
|
||||
|
||||
// NodeUpdatePolicy is the JSON shape of one node_update_policies row.
type NodeUpdatePolicy struct {
	NodeID  string `json:"node_id"`
	Mode    string `json:"mode"`
	Channel string `json:"channel"`

	// MaintenanceWindow is emitted verbatim as raw JSON.
	MaintenanceWindow json.RawMessage `json:"maintenance_window"`

	Canary           bool `json:"canary"`
	AutomaticRollout bool `json:"automatic_rollout"`

	CreatedAt time.Time `json:"created_at"`
	UpdatedAt time.Time `json:"updated_at"`
}
|
||||
|
||||
// NodePartitionState is the JSON shape of one node_partition_states row.
type NodePartitionState struct {
	NodeID       string `json:"node_id"`
	ClusterState string `json:"cluster_state"`
	RecoveryMode string `json:"recovery_mode"`

	// Notes is nil, and omitted from JSON, when the column is NULL.
	Notes *string `json:"notes,omitempty"`

	UpdatedAt time.Time `json:"updated_at"`
}
|
||||
|
||||
// upsertNodeRequest is the JSON body accepted by both the create and the
// update endpoint. decodeNodeRequest fills in defaults for the optional parts.
type upsertNodeRequest struct {
	ActorUserID         string  `json:"actor_user_id"`
	OwnerOrganizationID *string `json:"owner_organization_id"`
	NodeKey             string  `json:"node_key"`
	Name                string  `json:"name"`
	OwnershipType       string  `json:"ownership_type"`
	DesiredVersion      *string `json:"desired_version"`

	// Metadata is kept as raw JSON.
	Metadata json.RawMessage `json:"metadata"`

	// Capabilities is the full capability set to store for the node.
	Capabilities []struct {
		Capability string          `json:"capability"`
		Value      json.RawMessage `json:"value"`
	} `json:"capabilities"`

	// Services is the full service set to store for the node.
	Services []struct {
		ServiceType  string          `json:"service_type"`
		Enabled      bool            `json:"enabled"`
		DesiredState string          `json:"desired_state"`
		Metadata     json.RawMessage `json:"metadata"`
	} `json:"services"`

	// UpdatePolicy is written to node_update_policies.
	UpdatePolicy struct {
		Mode              string          `json:"mode"`
		Channel           string          `json:"channel"`
		MaintenanceWindow json.RawMessage `json:"maintenance_window"`
		Canary            bool            `json:"canary"`
		AutomaticRollout  bool            `json:"automatic_rollout"`
	} `json:"update_policy"`

	// PartitionState is written to node_partition_states.
	PartitionState struct {
		ClusterState string  `json:"cluster_state"`
		RecoveryMode string  `json:"recovery_mode"`
		Notes        *string `json:"notes"`
	} `json:"partition_state"`
}
|
||||
|
||||
func NewModule(deps module.Dependencies) *Module {
|
||||
return &Module{db: deps.Infra.DB}
|
||||
}
|
||||
|
||||
func (m *Module) Name() string {
|
||||
return "node"
|
||||
}
|
||||
|
||||
func (m *Module) RegisterRoutes(router chi.Router) {
|
||||
router.Route("/nodes", func(r chi.Router) {
|
||||
r.Get("/", m.listNodes)
|
||||
r.Post("/", m.createNode)
|
||||
r.Get("/{nodeID}", m.getNode)
|
||||
r.Put("/{nodeID}", m.updateNode)
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) listNodes(w http.ResponseWriter, r *http.Request) {
|
||||
rows, err := m.db.Query(r.Context(), `
|
||||
SELECT id, owner_organization_id, node_key, name, ownership_type, registration_status, health_status,
|
||||
version_state, partition_state, desired_version, reported_version, last_seen_at, metadata, created_at, updated_at
|
||||
FROM nodes
|
||||
ORDER BY created_at DESC
|
||||
`)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
defer rows.Close()
|
||||
var items []Node
|
||||
for rows.Next() {
|
||||
item, err := scanNode(rows)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
items = append(items, item)
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{"nodes": items})
|
||||
}
|
||||
|
||||
func (m *Module) getNode(w http.ResponseWriter, r *http.Request) {
|
||||
nodeID := chi.URLParam(r, "nodeID")
|
||||
item, err := m.getNodeByID(r.Context(), nodeID)
|
||||
if err != nil {
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
httpx.WriteError(w, http.StatusNotFound, "node not found")
|
||||
return
|
||||
}
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
caps, _ := m.listCapabilities(r.Context(), nodeID)
|
||||
services, _ := m.listServices(r.Context(), nodeID)
|
||||
updatePolicy, _ := m.getUpdatePolicy(r.Context(), nodeID)
|
||||
partitionState, _ := m.getPartitionState(r.Context(), nodeID)
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{
|
||||
"node": item,
|
||||
"capabilities": caps,
|
||||
"services": services,
|
||||
"update_policy": updatePolicy,
|
||||
"partition_state": partitionState,
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) createNode(w http.ResponseWriter, r *http.Request) {
|
||||
req, err := decodeNodeRequest(r)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
item := Node{
|
||||
ID: uuid.NewString(),
|
||||
OwnerOrganizationID: req.OwnerOrganizationID,
|
||||
NodeKey: req.NodeKey,
|
||||
Name: req.Name,
|
||||
OwnershipType: req.OwnershipType,
|
||||
RegistrationStatus: "pending",
|
||||
HealthStatus: "unknown",
|
||||
VersionState: "unknown",
|
||||
PartitionState: "healthy",
|
||||
DesiredVersion: req.DesiredVersion,
|
||||
Metadata: req.Metadata,
|
||||
CreatedAt: time.Now().UTC(),
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
}
|
||||
if err := m.persistNode(r.Context(), item, req, true); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusCreated, map[string]any{"node": item})
|
||||
}
|
||||
|
||||
func (m *Module) updateNode(w http.ResponseWriter, r *http.Request) {
|
||||
req, err := decodeNodeRequest(r)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
nodeID := chi.URLParam(r, "nodeID")
|
||||
item, err := m.getNodeByID(r.Context(), nodeID)
|
||||
if err != nil {
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
httpx.WriteError(w, http.StatusNotFound, "node not found")
|
||||
return
|
||||
}
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
item.OwnerOrganizationID = req.OwnerOrganizationID
|
||||
item.NodeKey = req.NodeKey
|
||||
item.Name = req.Name
|
||||
item.OwnershipType = req.OwnershipType
|
||||
item.DesiredVersion = req.DesiredVersion
|
||||
item.Metadata = req.Metadata
|
||||
item.UpdatedAt = time.Now().UTC()
|
||||
if err := m.persistNode(r.Context(), item, req, false); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{"node": item})
|
||||
}
|
||||
|
||||
func decodeNodeRequest(r *http.Request) (*upsertNodeRequest, error) {
|
||||
var req upsertNodeRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
return nil, errors.New("invalid node payload")
|
||||
}
|
||||
if req.ActorUserID == "" || req.NodeKey == "" || req.Name == "" || req.OwnershipType == "" {
|
||||
return nil, errors.New("actor_user_id, node_key, name, and ownership_type are required")
|
||||
}
|
||||
if len(req.Metadata) == 0 {
|
||||
req.Metadata = json.RawMessage(`{}`)
|
||||
}
|
||||
if !json.Valid(req.Metadata) {
|
||||
return nil, errors.New("metadata must be valid json")
|
||||
}
|
||||
if req.UpdatePolicy.Mode == "" {
|
||||
req.UpdatePolicy.Mode = "manual"
|
||||
}
|
||||
if req.UpdatePolicy.Channel == "" {
|
||||
req.UpdatePolicy.Channel = "stable"
|
||||
}
|
||||
if len(req.UpdatePolicy.MaintenanceWindow) == 0 {
|
||||
req.UpdatePolicy.MaintenanceWindow = json.RawMessage(`{}`)
|
||||
}
|
||||
if req.PartitionState.ClusterState == "" {
|
||||
req.PartitionState.ClusterState = "healthy"
|
||||
}
|
||||
if req.PartitionState.RecoveryMode == "" {
|
||||
req.PartitionState.RecoveryMode = "normal"
|
||||
}
|
||||
for i := range req.Capabilities {
|
||||
if len(req.Capabilities[i].Value) == 0 {
|
||||
req.Capabilities[i].Value = json.RawMessage(`{}`)
|
||||
}
|
||||
}
|
||||
for i := range req.Services {
|
||||
if req.Services[i].DesiredState == "" {
|
||||
req.Services[i].DesiredState = "disabled"
|
||||
}
|
||||
if len(req.Services[i].Metadata) == 0 {
|
||||
req.Services[i].Metadata = json.RawMessage(`{}`)
|
||||
}
|
||||
}
|
||||
return &req, nil
|
||||
}
|
||||
|
||||
func (m *Module) persistNode(ctx context.Context, item Node, req *upsertNodeRequest, create bool) error {
|
||||
tx, err := m.db.Begin(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer tx.Rollback(ctx)
|
||||
if create {
|
||||
_, err = tx.Exec(ctx, `
|
||||
INSERT INTO nodes (
|
||||
id, owner_organization_id, node_key, name, ownership_type, registration_status, health_status,
|
||||
version_state, partition_state, desired_version, reported_version, last_seen_at, metadata, created_at, updated_at
|
||||
) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13::jsonb,$14,$15)
|
||||
`, item.ID, item.OwnerOrganizationID, item.NodeKey, item.Name, item.OwnershipType, item.RegistrationStatus, item.HealthStatus, item.VersionState, item.PartitionState, item.DesiredVersion, item.ReportedVersion, item.LastSeenAt, []byte(item.Metadata), item.CreatedAt, item.UpdatedAt)
|
||||
} else {
|
||||
_, err = tx.Exec(ctx, `
|
||||
UPDATE nodes
|
||||
SET owner_organization_id=$2, node_key=$3, name=$4, ownership_type=$5, desired_version=$6, metadata=$7::jsonb, updated_at=$8
|
||||
WHERE id=$1
|
||||
`, item.ID, item.OwnerOrganizationID, item.NodeKey, item.Name, item.OwnershipType, item.DesiredVersion, []byte(item.Metadata), item.UpdatedAt)
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := tx.Exec(ctx, `DELETE FROM node_capabilities WHERE node_id = $1`, item.ID); err != nil {
|
||||
return err
|
||||
}
|
||||
for _, capability := range req.Capabilities {
|
||||
if _, err := tx.Exec(ctx, `
|
||||
INSERT INTO node_capabilities (node_id, capability, value, updated_at)
|
||||
VALUES ($1, $2, $3::jsonb, $4)
|
||||
`, item.ID, capability.Capability, []byte(capability.Value), time.Now().UTC()); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if _, err := tx.Exec(ctx, `DELETE FROM node_services WHERE node_id = $1`, item.ID); err != nil {
|
||||
return err
|
||||
}
|
||||
for _, service := range req.Services {
|
||||
if _, err := tx.Exec(ctx, `
|
||||
INSERT INTO node_services (
|
||||
node_id, service_type, enabled, desired_state, reported_state, last_reported_at, metadata, updated_at
|
||||
) VALUES ($1, $2, $3, $4, 'unknown', NULL, $5::jsonb, $6)
|
||||
`, item.ID, service.ServiceType, service.Enabled, service.DesiredState, []byte(service.Metadata), time.Now().UTC()); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if _, err := tx.Exec(ctx, `
|
||||
INSERT INTO node_update_policies (
|
||||
node_id, mode, channel, maintenance_window, canary, automatic_rollout, created_at, updated_at
|
||||
) VALUES ($1,$2,$3,$4::jsonb,$5,$6,$7,$8)
|
||||
ON CONFLICT (node_id) DO UPDATE SET
|
||||
mode = EXCLUDED.mode,
|
||||
channel = EXCLUDED.channel,
|
||||
maintenance_window = EXCLUDED.maintenance_window,
|
||||
canary = EXCLUDED.canary,
|
||||
automatic_rollout = EXCLUDED.automatic_rollout,
|
||||
updated_at = EXCLUDED.updated_at
|
||||
`, item.ID, req.UpdatePolicy.Mode, req.UpdatePolicy.Channel, []byte(req.UpdatePolicy.MaintenanceWindow), req.UpdatePolicy.Canary, req.UpdatePolicy.AutomaticRollout, time.Now().UTC(), time.Now().UTC()); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := tx.Exec(ctx, `
|
||||
INSERT INTO node_partition_states (node_id, cluster_state, recovery_mode, notes, updated_at)
|
||||
VALUES ($1,$2,$3,$4,$5)
|
||||
ON CONFLICT (node_id) DO UPDATE SET
|
||||
cluster_state = EXCLUDED.cluster_state,
|
||||
recovery_mode = EXCLUDED.recovery_mode,
|
||||
notes = EXCLUDED.notes,
|
||||
updated_at = EXCLUDED.updated_at
|
||||
`, item.ID, req.PartitionState.ClusterState, req.PartitionState.RecoveryMode, req.PartitionState.Notes, time.Now().UTC()); err != nil {
|
||||
return err
|
||||
}
|
||||
return tx.Commit(ctx)
|
||||
}
|
||||
|
||||
func (m *Module) getNodeByID(ctx context.Context, nodeID string) (Node, error) {
|
||||
row := m.db.QueryRow(ctx, `
|
||||
SELECT id, owner_organization_id, node_key, name, ownership_type, registration_status, health_status,
|
||||
version_state, partition_state, desired_version, reported_version, last_seen_at, metadata, created_at, updated_at
|
||||
FROM nodes
|
||||
WHERE id = $1
|
||||
`, nodeID)
|
||||
return scanNode(row)
|
||||
}
|
||||
|
||||
func (m *Module) listCapabilities(ctx context.Context, nodeID string) ([]NodeCapability, error) {
|
||||
rows, err := m.db.Query(ctx, `SELECT node_id, capability, value, updated_at FROM node_capabilities WHERE node_id = $1 ORDER BY capability`, nodeID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var out []NodeCapability
|
||||
for rows.Next() {
|
||||
var item NodeCapability
|
||||
if err := rows.Scan(&item.NodeID, &item.Capability, &item.Value, &item.UpdatedAt); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out = append(out, item)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
func (m *Module) listServices(ctx context.Context, nodeID string) ([]NodeService, error) {
|
||||
rows, err := m.db.Query(ctx, `
|
||||
SELECT node_id, service_type, enabled, desired_state, reported_state, last_reported_at, metadata, updated_at
|
||||
FROM node_services WHERE node_id = $1 ORDER BY service_type
|
||||
`, nodeID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var out []NodeService
|
||||
for rows.Next() {
|
||||
var item NodeService
|
||||
if err := rows.Scan(&item.NodeID, &item.ServiceType, &item.Enabled, &item.DesiredState, &item.ReportedState, &item.LastReportedAt, &item.Metadata, &item.UpdatedAt); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out = append(out, item)
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
func (m *Module) getUpdatePolicy(ctx context.Context, nodeID string) (*NodeUpdatePolicy, error) {
|
||||
row := m.db.QueryRow(ctx, `
|
||||
SELECT node_id, mode, channel, maintenance_window, canary, automatic_rollout, created_at, updated_at
|
||||
FROM node_update_policies WHERE node_id = $1
|
||||
`, nodeID)
|
||||
var item NodeUpdatePolicy
|
||||
if err := row.Scan(&item.NodeID, &item.Mode, &item.Channel, &item.MaintenanceWindow, &item.Canary, &item.AutomaticRollout, &item.CreatedAt, &item.UpdatedAt); err != nil {
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
return &item, nil
|
||||
}
|
||||
|
||||
func (m *Module) getPartitionState(ctx context.Context, nodeID string) (*NodePartitionState, error) {
|
||||
row := m.db.QueryRow(ctx, `
|
||||
SELECT node_id, cluster_state, recovery_mode, notes, updated_at
|
||||
FROM node_partition_states WHERE node_id = $1
|
||||
`, nodeID)
|
||||
var item NodePartitionState
|
||||
if err := row.Scan(&item.NodeID, &item.ClusterState, &item.RecoveryMode, &item.Notes, &item.UpdatedAt); err != nil {
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
return &item, nil
|
||||
}
|
||||
|
||||
// rowScanner is the Scan method shared by the single-row and multi-row query
// results in this file, so scanNode can serve both.
type rowScanner interface {
	// Scan copies the current row's columns into dest.
	Scan(dest ...any) error
}
|
||||
|
||||
func scanNode(row rowScanner) (Node, error) {
|
||||
var item Node
|
||||
if err := row.Scan(
|
||||
&item.ID,
|
||||
&item.OwnerOrganizationID,
|
||||
&item.NodeKey,
|
||||
&item.Name,
|
||||
&item.OwnershipType,
|
||||
&item.RegistrationStatus,
|
||||
&item.HealthStatus,
|
||||
&item.VersionState,
|
||||
&item.PartitionState,
|
||||
&item.DesiredVersion,
|
||||
&item.ReportedVersion,
|
||||
&item.LastSeenAt,
|
||||
&item.Metadata,
|
||||
&item.CreatedAt,
|
||||
&item.UpdatedAt,
|
||||
); err != nil {
|
||||
return Node{}, err
|
||||
}
|
||||
if len(item.Metadata) == 0 {
|
||||
item.Metadata = json.RawMessage(`{}`)
|
||||
}
|
||||
return item, nil
|
||||
}
|
||||
@@ -0,0 +1,356 @@
|
||||
package nodeagent
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
"github.com/google/uuid"
|
||||
"github.com/jackc/pgx/v5/pgxpool"
|
||||
|
||||
clustermodule "github.com/example/remote-access-platform/backend/internal/modules/cluster"
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/httpx"
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/module"
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/secrets"
|
||||
)
|
||||
|
||||
type Module struct {
|
||||
db *pgxpool.Pool
|
||||
cluster *clustermodule.Service
|
||||
}
|
||||
|
||||
func NewModule(deps module.Dependencies) *Module {
|
||||
clusterStore := clustermodule.NewPostgresStore(deps.Infra.DB)
|
||||
if deps.Config.Secret.EncryptionKeyBase64 != "" {
|
||||
if encryptor, err := secrets.NewEncryptor(deps.Config.Secret.EncryptionKeyBase64, deps.Config.Secret.EncryptionKeyID); err == nil {
|
||||
clusterStore.WithClusterKeyEncryptor(encryptor)
|
||||
}
|
||||
}
|
||||
return &Module{
|
||||
db: deps.Infra.DB,
|
||||
cluster: clustermodule.NewService(clusterStore),
|
||||
}
|
||||
}
|
||||
|
||||
func (m *Module) Name() string {
|
||||
return "nodeagent"
|
||||
}
|
||||
|
||||
func (m *Module) RegisterRoutes(router chi.Router) {
|
||||
router.Route("/node-agents", func(r chi.Router) {
|
||||
r.Post("/enroll", m.enrollAgent)
|
||||
r.Post("/enrollments/{requestID}/bootstrap", m.bootstrapEnrollment)
|
||||
r.Post("/register", m.registerAgent)
|
||||
r.Post("/{nodeID}/health", m.reportHealth)
|
||||
r.Post("/{nodeID}/services/status", m.reportServiceStatus)
|
||||
r.Post("/{nodeID}/update-manifest/request", m.requestUpdateManifest)
|
||||
r.Post("/{nodeID}/update-result", m.acknowledgeUpdateResult)
|
||||
r.Post("/{nodeID}/rollback-result", m.reportRollbackResult)
|
||||
r.Get("/{nodeID}/clusters/{clusterID}/vpn-assignments/desired", m.listVPNAssignments)
|
||||
r.Post("/{nodeID}/clusters/{clusterID}/vpn-assignments/{vpnConnectionID}/status", m.reportVPNAssignmentStatus)
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) enrollAgent(w http.ResponseWriter, r *http.Request) {
|
||||
var payload struct {
|
||||
ClusterID string `json:"cluster_id"`
|
||||
JoinToken string `json:"join_token"`
|
||||
NodeName string `json:"node_name"`
|
||||
NodeFingerprint string `json:"node_fingerprint"`
|
||||
PublicKey string `json:"public_key"`
|
||||
ReportedCapabilities json.RawMessage `json:"reported_capabilities"`
|
||||
ReportedFacts json.RawMessage `json:"reported_facts"`
|
||||
RequestedRoles json.RawMessage `json:"requested_roles"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid agent enrollment payload")
|
||||
return
|
||||
}
|
||||
joinRequest, err := m.cluster.CreateJoinRequest(r.Context(), clustermodule.CreateJoinRequestInput{
|
||||
ClusterID: payload.ClusterID,
|
||||
JoinToken: payload.JoinToken,
|
||||
NodeName: payload.NodeName,
|
||||
NodeFingerprint: payload.NodeFingerprint,
|
||||
PublicKey: payload.PublicKey,
|
||||
ReportedCapabilities: payload.ReportedCapabilities,
|
||||
ReportedFacts: payload.ReportedFacts,
|
||||
RequestedRoles: payload.RequestedRoles,
|
||||
})
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusAccepted, map[string]any{
|
||||
"status": "pending_approval",
|
||||
"join_request": joinRequest,
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) bootstrapEnrollment(w http.ResponseWriter, r *http.Request) {
|
||||
var payload struct {
|
||||
ClusterID string `json:"cluster_id"`
|
||||
NodeFingerprint string `json:"node_fingerprint"`
|
||||
PublicKey string `json:"public_key"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid enrollment bootstrap payload")
|
||||
return
|
||||
}
|
||||
result, err := m.cluster.GetJoinRequestBootstrap(r.Context(), clustermodule.GetJoinRequestBootstrapInput{
|
||||
ClusterID: payload.ClusterID,
|
||||
JoinRequestID: chi.URLParam(r, "requestID"),
|
||||
NodeFingerprint: payload.NodeFingerprint,
|
||||
PublicKey: payload.PublicKey,
|
||||
})
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, result)
|
||||
}
|
||||
|
||||
func (m *Module) registerAgent(w http.ResponseWriter, r *http.Request) {
|
||||
var payload struct {
|
||||
NodeKey string `json:"node_key"`
|
||||
Name string `json:"name"`
|
||||
OwnershipType string `json:"ownership_type"`
|
||||
OwnerOrganizationID *string `json:"owner_organization_id"`
|
||||
ReportedVersion *string `json:"reported_version"`
|
||||
Metadata json.RawMessage `json:"metadata"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid agent registration payload")
|
||||
return
|
||||
}
|
||||
if payload.NodeKey == "" || payload.Name == "" || payload.OwnershipType == "" {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "node_key, name, and ownership_type are required")
|
||||
return
|
||||
}
|
||||
if len(payload.Metadata) == 0 {
|
||||
payload.Metadata = json.RawMessage(`{}`)
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
nodeID := uuid.NewString()
|
||||
if err := m.db.QueryRow(r.Context(), `
|
||||
INSERT INTO nodes (
|
||||
id, owner_organization_id, node_key, name, ownership_type, registration_status, health_status,
|
||||
version_state, partition_state, desired_version, reported_version, last_seen_at, metadata, created_at, updated_at
|
||||
) VALUES ($1, $2, $3, $4, $5, 'active', 'unknown', 'unknown', 'healthy', NULL, $6, $7, $8::jsonb, $9, $10)
|
||||
ON CONFLICT (node_key) DO UPDATE SET
|
||||
name = EXCLUDED.name,
|
||||
ownership_type = EXCLUDED.ownership_type,
|
||||
owner_organization_id = EXCLUDED.owner_organization_id,
|
||||
registration_status = 'active',
|
||||
reported_version = EXCLUDED.reported_version,
|
||||
last_seen_at = EXCLUDED.last_seen_at,
|
||||
metadata = EXCLUDED.metadata,
|
||||
updated_at = EXCLUDED.updated_at
|
||||
RETURNING id
|
||||
`, nodeID, payload.OwnerOrganizationID, payload.NodeKey, payload.Name, payload.OwnershipType, payload.ReportedVersion, now, []byte(payload.Metadata), now, now).Scan(&nodeID); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{
|
||||
"node_id": nodeID,
|
||||
"status": "registered",
|
||||
"legacy": true,
|
||||
"warning": "direct node-agent registration is retained for compatibility; production enrollment must use /node-agents/enroll",
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) reportHealth(w http.ResponseWriter, r *http.Request) {
|
||||
var payload struct {
|
||||
HealthStatus string `json:"health_status"`
|
||||
ReportedVersion *string `json:"reported_version"`
|
||||
Metadata json.RawMessage `json:"metadata"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid node health payload")
|
||||
return
|
||||
}
|
||||
if payload.HealthStatus == "" {
|
||||
payload.HealthStatus = "unknown"
|
||||
}
|
||||
if len(payload.Metadata) == 0 {
|
||||
payload.Metadata = json.RawMessage(`{}`)
|
||||
}
|
||||
if _, err := m.db.Exec(r.Context(), `
|
||||
UPDATE nodes
|
||||
SET health_status = $2, reported_version = COALESCE($3, reported_version), last_seen_at = $4, metadata = $5::jsonb, updated_at = $4
|
||||
WHERE id = $1
|
||||
`, chi.URLParam(r, "nodeID"), payload.HealthStatus, payload.ReportedVersion, time.Now().UTC(), []byte(payload.Metadata)); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusAccepted, map[string]any{"status": "accepted"})
|
||||
}
|
||||
|
||||
func (m *Module) reportServiceStatus(w http.ResponseWriter, r *http.Request) {
|
||||
var payload struct {
|
||||
Services []struct {
|
||||
ServiceType string `json:"service_type"`
|
||||
ReportedState string `json:"reported_state"`
|
||||
Metadata json.RawMessage `json:"metadata"`
|
||||
} `json:"services"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid node service status payload")
|
||||
return
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
for _, service := range payload.Services {
|
||||
if len(service.Metadata) == 0 {
|
||||
service.Metadata = json.RawMessage(`{}`)
|
||||
}
|
||||
if _, err := m.db.Exec(r.Context(), `
|
||||
INSERT INTO node_services (
|
||||
node_id, service_type, enabled, desired_state, reported_state, last_reported_at, metadata, updated_at
|
||||
) VALUES ($1, $2, FALSE, 'disabled', $3, $4, $5::jsonb, $4)
|
||||
ON CONFLICT (node_id, service_type) DO UPDATE SET
|
||||
reported_state = EXCLUDED.reported_state,
|
||||
last_reported_at = EXCLUDED.last_reported_at,
|
||||
metadata = EXCLUDED.metadata,
|
||||
updated_at = EXCLUDED.updated_at
|
||||
`, chi.URLParam(r, "nodeID"), service.ServiceType, service.ReportedState, now, []byte(service.Metadata)); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusAccepted, map[string]any{"status": "accepted"})
|
||||
}
|
||||
|
||||
func (m *Module) listVPNAssignments(w http.ResponseWriter, r *http.Request) {
|
||||
items, err := m.cluster.ListNodeVPNAssignments(r.Context(), chi.URLParam(r, "clusterID"), chi.URLParam(r, "nodeID"))
|
||||
if writeClusterServiceError(w, err) {
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{
|
||||
"vpn_assignments": items,
|
||||
"runtime_execution_enabled": false,
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) reportVPNAssignmentStatus(w http.ResponseWriter, r *http.Request) {
|
||||
var payload struct {
|
||||
ObservedStatus string `json:"observed_status"`
|
||||
StatusPayload json.RawMessage `json:"status_payload"`
|
||||
ObservedAt *time.Time `json:"observed_at"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid vpn assignment status payload")
|
||||
return
|
||||
}
|
||||
observedAt := time.Time{}
|
||||
if payload.ObservedAt != nil {
|
||||
observedAt = *payload.ObservedAt
|
||||
}
|
||||
item, err := m.cluster.ReportNodeVPNAssignmentStatus(r.Context(), clustermodule.ReportNodeVPNAssignmentStatusInput{
|
||||
ClusterID: chi.URLParam(r, "clusterID"),
|
||||
NodeID: chi.URLParam(r, "nodeID"),
|
||||
VPNConnectionID: chi.URLParam(r, "vpnConnectionID"),
|
||||
ObservedStatus: payload.ObservedStatus,
|
||||
StatusPayload: payload.StatusPayload,
|
||||
ObservedAt: observedAt,
|
||||
})
|
||||
if writeClusterServiceError(w, err) {
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusAccepted, map[string]any{
|
||||
"vpn_assignment_status": item,
|
||||
"runtime_execution_enabled": false,
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) requestUpdateManifest(w http.ResponseWriter, r *http.Request) {
|
||||
nodeID := chi.URLParam(r, "nodeID")
|
||||
var mode, channel string
|
||||
var canary, automatic bool
|
||||
var desiredVersion *string
|
||||
if err := m.db.QueryRow(r.Context(), `
|
||||
SELECT n.desired_version, p.mode, p.channel, p.canary, p.automatic_rollout
|
||||
FROM nodes n
|
||||
LEFT JOIN node_update_policies p ON p.node_id = n.id
|
||||
WHERE n.id = $1
|
||||
`, nodeID).Scan(&desiredVersion, &mode, &channel, &canary, &automatic); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{
|
||||
"manifest": map[string]any{
|
||||
"node_id": nodeID,
|
||||
"desired_version": desiredVersion,
|
||||
"mode": mode,
|
||||
"channel": channel,
|
||||
"canary": canary,
|
||||
"automatic_rollout": automatic,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) acknowledgeUpdateResult(w http.ResponseWriter, r *http.Request) {
|
||||
m.recordUpdateRun(w, r, "update")
|
||||
}
|
||||
|
||||
func (m *Module) reportRollbackResult(w http.ResponseWriter, r *http.Request) {
|
||||
m.recordUpdateRun(w, r, "rollback")
|
||||
}
|
||||
|
||||
func (m *Module) recordUpdateRun(w http.ResponseWriter, r *http.Request, action string) {
|
||||
var payload struct {
|
||||
TargetVersion string `json:"target_version"`
|
||||
Status string `json:"status"`
|
||||
Payload json.RawMessage `json:"payload"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid update result payload")
|
||||
return
|
||||
}
|
||||
if payload.Status == "" {
|
||||
payload.Status = "acknowledged"
|
||||
}
|
||||
if len(payload.Payload) == 0 {
|
||||
payload.Payload = json.RawMessage(`{}`)
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
runID := uuid.NewString()
|
||||
if _, err := m.db.Exec(r.Context(), `
|
||||
INSERT INTO node_agent_update_runs (
|
||||
id, node_id, action, target_version, status, requested_at, acknowledged_at, completed_at, payload
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $6, CASE WHEN $5 IN ('succeeded', 'failed') THEN $6 ELSE NULL END, $7::jsonb)
|
||||
`, runID, chi.URLParam(r, "nodeID"), action, payload.TargetVersion, payload.Status, now, []byte(payload.Payload)); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if action == "update" && payload.Status == "succeeded" {
|
||||
_, _ = m.db.Exec(r.Context(), `
|
||||
UPDATE nodes
|
||||
SET reported_version = $2, version_state = 'current', updated_at = $3
|
||||
WHERE id = $1
|
||||
`, chi.URLParam(r, "nodeID"), payload.TargetVersion, now)
|
||||
}
|
||||
if action == "rollback" && payload.Status == "succeeded" {
|
||||
_, _ = m.db.Exec(r.Context(), `
|
||||
UPDATE nodes
|
||||
SET reported_version = $2, version_state = 'rollback', updated_at = $3
|
||||
WHERE id = $1
|
||||
`, chi.URLParam(r, "nodeID"), payload.TargetVersion, now)
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusAccepted, map[string]any{"status": "accepted", "run_id": runID})
|
||||
}
|
||||
|
||||
func writeClusterServiceError(w http.ResponseWriter, err error) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
switch {
|
||||
case errors.Is(err, clustermodule.ErrVPNLeaseOwnerNotAllowed), errors.Is(err, clustermodule.ErrVPNLeaseOwnerRoleRequired):
|
||||
httpx.WriteError(w, http.StatusForbidden, err.Error())
|
||||
case errors.Is(err, clustermodule.ErrInvalidPayload):
|
||||
httpx.WriteError(w, http.StatusBadRequest, err.Error())
|
||||
default:
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
}
|
||||
return true
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
package organization
|
||||
|
||||
import (
	"strings"
	"testing"
)
|
||||
|
||||
func TestTenantSafeTopologyExposureDoesNotExposeCoreMesh(t *testing.T) {
|
||||
value := tenantSafeTopologyExposure()
|
||||
forbidden := []string{
|
||||
"core_node_id",
|
||||
"mesh_route",
|
||||
"cluster_private_topology",
|
||||
"certificate_serial",
|
||||
}
|
||||
for _, token := range forbidden {
|
||||
if value == token {
|
||||
t.Fatalf("topology exposure leaked forbidden token %q", token)
|
||||
}
|
||||
}
|
||||
if value != "tenant_safe_no_core_mesh_topology" {
|
||||
t.Fatalf("unexpected topology exposure marker: %q", value)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,518 @@
|
||||
package organization
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
"github.com/google/uuid"
|
||||
"github.com/jackc/pgx/v5"
|
||||
"github.com/jackc/pgx/v5/pgxpool"
|
||||
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/authority"
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/httpx"
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/module"
|
||||
)
|
||||
|
||||
// Organization membership role identifiers, as stored in
// organization_memberships.role_id.
const (
	RoleOrgOwner    = "org_owner"
	RoleOrgAdmin    = "org_admin"
	RoleOrgOperator = "org_operator"
	RoleOrgMember   = "org_member"
	RoleOrgViewer   = "org_viewer"
)
|
||||
|
||||
type Module struct {
|
||||
db *pgxpool.Pool
|
||||
authority *authority.Verifier
|
||||
}
|
||||
|
||||
// Organization is the JSON representation of a row in the organizations table.
type Organization struct {
	ID        string          `json:"id"`
	Slug      string          `json:"slug"`
	Name      string          `json:"name"`
	Status    string          `json:"status"`
	Metadata  json.RawMessage `json:"metadata"`
	CreatedAt time.Time       `json:"created_at"`
	UpdatedAt time.Time       `json:"updated_at"`
}
|
||||
|
||||
// Membership is the JSON representation of a row in organization_memberships.
// InvitedByUser is omitted from the encoding when nil.
type Membership struct {
	ID             string    `json:"id"`
	OrganizationID string    `json:"organization_id"`
	UserID         string    `json:"user_id"`
	RoleID         string    `json:"role_id"`
	Status         string    `json:"status"`
	InvitedByUser  *string   `json:"invited_by_user_id,omitempty"`
	CreatedAt      time.Time `json:"created_at"`
	UpdatedAt      time.Time `json:"updated_at"`
}
|
||||
|
||||
type AdminSummary struct {
|
||||
OrganizationID string `json:"organization_id"`
|
||||
ResourceCount int64 `json:"resource_count"`
|
||||
ActiveSessionCount int64 `json:"active_session_count"`
|
||||
ServiceEndpoints []ServiceSummary `json:"service_endpoints"`
|
||||
ConnectorStatus map[string]any `json:"connector_status"`
|
||||
RecentAudit []OrgAuditEvent `json:"recent_audit"`
|
||||
TopologyExposure string `json:"topology_exposure"`
|
||||
}
|
||||
|
||||
// ServiceSummary is the number of resources for one protocol.
type ServiceSummary struct {
	Protocol string `json:"protocol"`
	Count    int64  `json:"count"`
}
|
||||
|
||||
// OrgAuditEvent is one audit_events row as exposed in the admin summary.
type OrgAuditEvent struct {
	ID         string          `json:"id"`
	EventType  string          `json:"event_type"`
	TargetType string          `json:"target_type"`
	TargetID   string          `json:"target_id"`
	Payload    json.RawMessage `json:"payload"`
	CreatedAt  time.Time       `json:"created_at"`
}
|
||||
|
||||
// createOrganizationRequest is the JSON body accepted by POST /organizations.
type createOrganizationRequest struct {
	ActorUserID string          `json:"actor_user_id"`
	Slug        string          `json:"slug"`
	Name        string          `json:"name"`
	Metadata    json.RawMessage `json:"metadata"`
}
|
||||
|
||||
// addMembershipRequest is the JSON body accepted by
// POST /organizations/{organizationID}/memberships.
type addMembershipRequest struct {
	ActorUserID string `json:"actor_user_id"`
	UserID      string `json:"user_id"`
	RoleID      string `json:"role_id"`
}
|
||||
|
||||
func NewModule(deps module.Dependencies) *Module {
|
||||
authorityVerifier, _ := authority.NewVerifier(deps.Config.Installation)
|
||||
return &Module{db: deps.Infra.DB, authority: authorityVerifier}
|
||||
}
|
||||
|
||||
func (m *Module) Name() string {
|
||||
return "organization"
|
||||
}
|
||||
|
||||
func (m *Module) RegisterRoutes(router chi.Router) {
|
||||
router.Route("/organizations", func(r chi.Router) {
|
||||
r.Get("/", m.listOrganizations)
|
||||
r.Post("/", m.createOrganization)
|
||||
r.Get("/{organizationID}", m.getOrganization)
|
||||
r.Get("/{organizationID}/admin-summary", m.getAdminSummary)
|
||||
r.Get("/{organizationID}/memberships", m.listMemberships)
|
||||
r.Post("/{organizationID}/memberships", m.addMembership)
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) listOrganizations(w http.ResponseWriter, r *http.Request) {
|
||||
userID := r.URL.Query().Get("user_id")
|
||||
if userID == "" {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "user_id is required")
|
||||
return
|
||||
}
|
||||
platformRole, err := m.getPlatformRole(r.Context(), userID)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
var rows pgx.Rows
|
||||
if isPlatformAdmin(platformRole) {
|
||||
rows, err = m.db.Query(r.Context(), `
|
||||
SELECT id, slug, name, status, metadata, created_at, updated_at
|
||||
FROM organizations
|
||||
ORDER BY created_at DESC
|
||||
`)
|
||||
} else {
|
||||
rows, err = m.db.Query(r.Context(), `
|
||||
SELECT o.id, o.slug, o.name, o.status, o.metadata, o.created_at, o.updated_at
|
||||
FROM organizations o
|
||||
INNER JOIN organization_memberships om ON om.organization_id = o.id
|
||||
WHERE om.user_id = $1 AND om.status = 'active'
|
||||
ORDER BY o.created_at DESC
|
||||
`, userID)
|
||||
}
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
defer rows.Close()
|
||||
var organizations []Organization
|
||||
for rows.Next() {
|
||||
org, err := scanOrganization(rows)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
organizations = append(organizations, org)
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{"organizations": organizations})
|
||||
}
|
||||
|
||||
func (m *Module) getOrganization(w http.ResponseWriter, r *http.Request) {
|
||||
orgID := chi.URLParam(r, "organizationID")
|
||||
userID := r.URL.Query().Get("user_id")
|
||||
if userID == "" {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "user_id is required")
|
||||
return
|
||||
}
|
||||
if err := m.ensureOrgAccess(r.Context(), orgID, userID, false); err != nil {
|
||||
status := http.StatusInternalServerError
|
||||
if errors.Is(err, pgx.ErrNoRows) || errors.Is(err, errForbidden) {
|
||||
status = http.StatusForbidden
|
||||
}
|
||||
httpx.WriteError(w, status, err.Error())
|
||||
return
|
||||
}
|
||||
org, err := m.getOrganizationByID(r.Context(), orgID)
|
||||
if err != nil {
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
httpx.WriteError(w, http.StatusNotFound, "organization not found")
|
||||
return
|
||||
}
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{"organization": org})
|
||||
}
|
||||
|
||||
func (m *Module) getAdminSummary(w http.ResponseWriter, r *http.Request) {
|
||||
orgID := chi.URLParam(r, "organizationID")
|
||||
actorUserID := r.URL.Query().Get("actor_user_id")
|
||||
if actorUserID == "" {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "actor_user_id is required")
|
||||
return
|
||||
}
|
||||
if err := m.ensureOrgAccess(r.Context(), orgID, actorUserID, true); err != nil {
|
||||
httpx.WriteError(w, http.StatusForbidden, err.Error())
|
||||
return
|
||||
}
|
||||
summary, err := m.loadAdminSummary(r.Context(), orgID)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{"admin_summary": summary})
|
||||
}
|
||||
|
||||
func (m *Module) loadAdminSummary(ctx context.Context, orgID string) (AdminSummary, error) {
|
||||
var resourceCount int64
|
||||
if err := m.db.QueryRow(ctx, `
|
||||
SELECT COUNT(*)
|
||||
FROM resources
|
||||
WHERE organization_id = $1::uuid
|
||||
`, orgID).Scan(&resourceCount); err != nil {
|
||||
return AdminSummary{}, err
|
||||
}
|
||||
|
||||
var activeSessionCount int64
|
||||
if err := m.db.QueryRow(ctx, `
|
||||
SELECT COUNT(*)
|
||||
FROM remote_sessions
|
||||
WHERE organization_id = $1::uuid
|
||||
AND state = 'active'
|
||||
`, orgID).Scan(&activeSessionCount); err != nil {
|
||||
return AdminSummary{}, err
|
||||
}
|
||||
|
||||
rows, err := m.db.Query(ctx, `
|
||||
SELECT protocol, COUNT(*)
|
||||
FROM resources
|
||||
WHERE organization_id = $1::uuid
|
||||
GROUP BY protocol
|
||||
ORDER BY protocol
|
||||
`, orgID)
|
||||
if err != nil {
|
||||
return AdminSummary{}, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var services []ServiceSummary
|
||||
for rows.Next() {
|
||||
var item ServiceSummary
|
||||
if err := rows.Scan(&item.Protocol, &item.Count); err != nil {
|
||||
return AdminSummary{}, err
|
||||
}
|
||||
services = append(services, item)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
return AdminSummary{}, err
|
||||
}
|
||||
|
||||
auditRows, err := m.db.Query(ctx, `
|
||||
SELECT ae.id::text, ae.event_type, ae.target_type, ae.target_id, ae.payload, ae.created_at
|
||||
FROM audit_events ae
|
||||
LEFT JOIN remote_sessions rs ON rs.id = ae.remote_session_id
|
||||
WHERE rs.organization_id = $1::uuid
|
||||
ORDER BY ae.created_at DESC
|
||||
LIMIT 20
|
||||
`, orgID)
|
||||
if err != nil {
|
||||
return AdminSummary{}, err
|
||||
}
|
||||
defer auditRows.Close()
|
||||
var audit []OrgAuditEvent
|
||||
for auditRows.Next() {
|
||||
var item OrgAuditEvent
|
||||
if err := auditRows.Scan(&item.ID, &item.EventType, &item.TargetType, &item.TargetID, &item.Payload, &item.CreatedAt); err != nil {
|
||||
return AdminSummary{}, err
|
||||
}
|
||||
audit = append(audit, item)
|
||||
}
|
||||
if err := auditRows.Err(); err != nil {
|
||||
return AdminSummary{}, err
|
||||
}
|
||||
|
||||
return AdminSummary{
|
||||
OrganizationID: orgID,
|
||||
ResourceCount: resourceCount,
|
||||
ActiveSessionCount: activeSessionCount,
|
||||
ServiceEndpoints: services,
|
||||
ConnectorStatus: map[string]any{
|
||||
"vpn": "not_implemented",
|
||||
"connector": "not_implemented",
|
||||
},
|
||||
RecentAudit: audit,
|
||||
TopologyExposure: tenantSafeTopologyExposure(),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// tenantSafeTopologyExposure returns the fixed marker placed in
// AdminSummary.TopologyExposure.
func tenantSafeTopologyExposure() string {
	const marker = "tenant_safe_no_core_mesh_topology"
	return marker
}
|
||||
|
||||
func (m *Module) createOrganization(w http.ResponseWriter, r *http.Request) {
|
||||
var req createOrganizationRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid organization payload")
|
||||
return
|
||||
}
|
||||
if req.ActorUserID == "" || req.Name == "" || req.Slug == "" {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "actor_user_id, slug, and name are required")
|
||||
return
|
||||
}
|
||||
role, err := m.getPlatformRole(r.Context(), req.ActorUserID)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if !isPlatformAdmin(role) {
|
||||
httpx.WriteError(w, http.StatusForbidden, "platform admin role is required")
|
||||
return
|
||||
}
|
||||
if len(req.Metadata) == 0 {
|
||||
req.Metadata = json.RawMessage(`{}`)
|
||||
}
|
||||
if !json.Valid(req.Metadata) {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "metadata must be valid json")
|
||||
return
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
org := Organization{
|
||||
ID: uuid.NewString(),
|
||||
Slug: normalizeSlug(req.Slug),
|
||||
Name: req.Name,
|
||||
Status: "active",
|
||||
Metadata: req.Metadata,
|
||||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
}
|
||||
membership := Membership{
|
||||
ID: uuid.NewString(),
|
||||
OrganizationID: org.ID,
|
||||
UserID: req.ActorUserID,
|
||||
RoleID: RoleOrgOwner,
|
||||
Status: "active",
|
||||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
}
|
||||
tx, err := m.db.Begin(r.Context())
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
defer tx.Rollback(r.Context())
|
||||
if _, err := tx.Exec(r.Context(), `
|
||||
INSERT INTO organizations (id, slug, name, status, metadata, created_at, updated_at)
|
||||
VALUES ($1, $2, $3, $4, $5::jsonb, $6, $7)
|
||||
`, org.ID, org.Slug, org.Name, org.Status, []byte(org.Metadata), org.CreatedAt, org.UpdatedAt); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if _, err := tx.Exec(r.Context(), `
|
||||
INSERT INTO organization_memberships (
|
||||
id, organization_id, user_id, role_id, status, invited_by_user_id, created_at, updated_at
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
|
||||
`, membership.ID, membership.OrganizationID, membership.UserID, membership.RoleID, membership.Status, req.ActorUserID, membership.CreatedAt, membership.UpdatedAt); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if err := tx.Commit(r.Context()); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusCreated, map[string]any{
|
||||
"organization": org,
|
||||
"membership": membership,
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) listMemberships(w http.ResponseWriter, r *http.Request) {
|
||||
orgID := chi.URLParam(r, "organizationID")
|
||||
userID := r.URL.Query().Get("user_id")
|
||||
if userID == "" {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "user_id is required")
|
||||
return
|
||||
}
|
||||
if err := m.ensureOrgAccess(r.Context(), orgID, userID, true); err != nil {
|
||||
httpx.WriteError(w, http.StatusForbidden, err.Error())
|
||||
return
|
||||
}
|
||||
rows, err := m.db.Query(r.Context(), `
|
||||
SELECT id, organization_id, user_id, role_id, status, invited_by_user_id, created_at, updated_at
|
||||
FROM organization_memberships
|
||||
WHERE organization_id = $1
|
||||
ORDER BY created_at DESC
|
||||
`, orgID)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
defer rows.Close()
|
||||
var memberships []Membership
|
||||
for rows.Next() {
|
||||
membership, err := scanMembership(rows)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
memberships = append(memberships, membership)
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{"memberships": memberships})
|
||||
}
|
||||
|
||||
func (m *Module) addMembership(w http.ResponseWriter, r *http.Request) {
|
||||
orgID := chi.URLParam(r, "organizationID")
|
||||
var req addMembershipRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid membership payload")
|
||||
return
|
||||
}
|
||||
if req.ActorUserID == "" || req.UserID == "" || req.RoleID == "" {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "actor_user_id, user_id, and role_id are required")
|
||||
return
|
||||
}
|
||||
if err := m.ensureOrgAccess(r.Context(), orgID, req.ActorUserID, true); err != nil {
|
||||
httpx.WriteError(w, http.StatusForbidden, err.Error())
|
||||
return
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
membership := Membership{
|
||||
ID: uuid.NewString(),
|
||||
OrganizationID: orgID,
|
||||
UserID: req.UserID,
|
||||
RoleID: req.RoleID,
|
||||
Status: "active",
|
||||
InvitedByUser: &req.ActorUserID,
|
||||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
}
|
||||
if _, err := m.db.Exec(r.Context(), `
|
||||
INSERT INTO organization_memberships (
|
||||
id, organization_id, user_id, role_id, status, invited_by_user_id, created_at, updated_at
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
|
||||
ON CONFLICT (organization_id, user_id) DO UPDATE SET
|
||||
role_id = EXCLUDED.role_id,
|
||||
status = 'active',
|
||||
invited_by_user_id = EXCLUDED.invited_by_user_id,
|
||||
updated_at = EXCLUDED.updated_at
|
||||
`, membership.ID, membership.OrganizationID, membership.UserID, membership.RoleID, membership.Status, membership.InvitedByUser, membership.CreatedAt, membership.UpdatedAt); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusCreated, map[string]any{"membership": membership})
|
||||
}
|
||||
|
||||
// errForbidden is returned by ensureOrgAccess when the user has no active
// membership in the organization or lacks the required admin role.
var errForbidden = errors.New("forbidden")
|
||||
|
||||
func (m *Module) ensureOrgAccess(ctx context.Context, orgID, userID string, adminRequired bool) error {
|
||||
role, err := m.getPlatformRole(ctx, userID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if isPlatformAdmin(role) {
|
||||
return nil
|
||||
}
|
||||
query := `
|
||||
SELECT role_id
|
||||
FROM organization_memberships
|
||||
WHERE organization_id = $1 AND user_id = $2 AND status = 'active'
|
||||
`
|
||||
var roleID string
|
||||
if err := m.db.QueryRow(ctx, query, orgID, userID).Scan(&roleID); err != nil {
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
return errForbidden
|
||||
}
|
||||
return err
|
||||
}
|
||||
if adminRequired && roleID != RoleOrgOwner && roleID != RoleOrgAdmin {
|
||||
return errForbidden
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *Module) getPlatformRole(ctx context.Context, userID string) (string, error) {
|
||||
return authority.EffectivePlatformRole(ctx, m.db, m.authority, userID)
|
||||
}
|
||||
|
||||
// isPlatformAdmin reports whether role is "platform_admin" or
// "platform_recovery_admin".
func isPlatformAdmin(role string) bool {
	switch role {
	case "platform_admin", "platform_recovery_admin":
		return true
	default:
		return false
	}
}
|
||||
|
||||
func (m *Module) getOrganizationByID(ctx context.Context, orgID string) (Organization, error) {
|
||||
row := m.db.QueryRow(ctx, `
|
||||
SELECT id, slug, name, status, metadata, created_at, updated_at
|
||||
FROM organizations
|
||||
WHERE id = $1
|
||||
`, orgID)
|
||||
return scanOrganization(row)
|
||||
}
|
||||
|
||||
// rowScanner is the Scan method shared by single-row and multi-row query
// results, so the scan helpers accept either.
type rowScanner interface {
	// Scan copies the current row's columns into dest.
	Scan(dest ...any) error
}
|
||||
|
||||
func scanOrganization(row rowScanner) (Organization, error) {
|
||||
var org Organization
|
||||
if err := row.Scan(&org.ID, &org.Slug, &org.Name, &org.Status, &org.Metadata, &org.CreatedAt, &org.UpdatedAt); err != nil {
|
||||
return Organization{}, err
|
||||
}
|
||||
if len(org.Metadata) == 0 {
|
||||
org.Metadata = json.RawMessage(`{}`)
|
||||
}
|
||||
return org, nil
|
||||
}
|
||||
|
||||
func scanMembership(row rowScanner) (Membership, error) {
|
||||
var membership Membership
|
||||
if err := row.Scan(
|
||||
&membership.ID,
|
||||
&membership.OrganizationID,
|
||||
&membership.UserID,
|
||||
&membership.RoleID,
|
||||
&membership.Status,
|
||||
&membership.InvitedByUser,
|
||||
&membership.CreatedAt,
|
||||
&membership.UpdatedAt,
|
||||
); err != nil {
|
||||
return Membership{}, err
|
||||
}
|
||||
return membership, nil
|
||||
}
|
||||
|
||||
// normalizeSlug trims surrounding whitespace from in and lower-cases the result.
func normalizeSlug(in string) string {
	trimmed := strings.TrimSpace(in)
	return strings.ToLower(trimmed)
}
|
||||
@@ -0,0 +1,639 @@
|
||||
package resource
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
"github.com/google/uuid"
|
||||
"github.com/jackc/pgx/v5"
|
||||
"github.com/jackc/pgx/v5/pgxpool"
|
||||
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/authority"
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/httpx"
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/module"
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/secrets"
|
||||
)
|
||||
|
||||
// Accepted values for the per-resource settings carried by Resource and
// upsertResourceRequest.
const (
	// Certificate verification modes.
	CertificateVerificationModeStrict = "strict"
	CertificateVerificationModeIgnore = "ignore"

	// Render quality profiles.
	RenderQualityProfileLowBandwidth = "low_bandwidth"
	RenderQualityProfileBalanced     = "balanced"
	RenderQualityProfileHighQuality  = "high_quality"
	RenderQualityProfileTextPriority = "text_priority"

	// Clipboard modes.
	ClipboardModeDisabled       = "disabled"
	ClipboardModeClientToServer = "client_to_server"
	ClipboardModeServerToClient = "server_to_client"
	ClipboardModeBidirectional  = "bidirectional"

	// File transfer modes.
	FileTransferModeDisabled       = "disabled"
	FileTransferModeClientToServer = "client_to_server"
	FileTransferModeServerToClient = "server_to_client"
	FileTransferModeBidirectional  = "bidirectional"
)
|
||||
|
||||
type Module struct {
|
||||
db *pgxpool.Pool
|
||||
appEnv string
|
||||
secretStore *secrets.ResourceSecretStore
|
||||
authority *authority.Verifier
|
||||
}
|
||||
|
||||
// Resource is the JSON representation of a remote-access resource returned
// by the /resources endpoints.
type Resource struct {
	// Identity and ownership.
	ID             string `json:"id"`
	OrganizationID string `json:"organization_id"`

	// Connection details.
	Name      string  `json:"name"`
	Address   string  `json:"address"`
	Protocol  string  `json:"protocol"`
	SecretRef *string `json:"secret_ref,omitempty"`

	// Per-resource settings; see the constants of the matching name prefix.
	CertificateVerificationMode string `json:"certificate_verification_mode"`
	RenderQualityProfile        string `json:"render_quality_profile"`
	ClipboardMode               string `json:"clipboard_mode"`
	FileTransferMode            string `json:"file_transfer_mode"`

	// Metadata is kept as raw JSON.
	Metadata json.RawMessage `json:"metadata"`

	CreatedAt time.Time `json:"created_at"`
	UpdatedAt time.Time `json:"updated_at"`
}
|
||||
|
||||
// upsertResourceRequest is the JSON body accepted by the create and update
// resource handlers; it is decoded and validated by decodeUpsertRequest.
type upsertResourceRequest struct {
	// ActorUserID identifies the user on whose behalf the change is made.
	ActorUserID    string `json:"actor_user_id"`
	OrganizationID string `json:"organization_id"`

	Name      string  `json:"name"`
	Address   string  `json:"address"`
	Protocol  string  `json:"protocol"`
	SecretRef *string `json:"secret_ref"`

	CertificateVerificationMode string `json:"certificate_verification_mode"`
	RenderQualityProfile        string `json:"render_quality_profile"`
	ClipboardMode               string `json:"clipboard_mode"`
	FileTransferMode            string `json:"file_transfer_mode"`

	Metadata json.RawMessage `json:"metadata"`
}
|
||||
|
||||
// upsertResourceSecretRequest is the JSON body accepted by the
// upsertResourceSecret handler.
type upsertResourceSecretRequest struct {
	// ActorUserID identifies the user on whose behalf the secret is written.
	ActorUserID string `json:"actor_user_id"`
	// Payload and Metadata are forwarded as raw JSON to the secret store.
	Payload  json.RawMessage `json:"payload"`
	Metadata json.RawMessage `json:"metadata"`
}
|
||||
|
||||
func NewModule(deps module.Dependencies, secretStores ...*secrets.ResourceSecretStore) *Module {
|
||||
var secretStore *secrets.ResourceSecretStore
|
||||
if len(secretStores) > 0 {
|
||||
secretStore = secretStores[0]
|
||||
}
|
||||
authorityVerifier, _ := authority.NewVerifier(deps.Config.Installation)
|
||||
return &Module{db: deps.Infra.DB, appEnv: deps.Config.App.Env, secretStore: secretStore, authority: authorityVerifier}
|
||||
}
|
||||
|
||||
func (m *Module) Name() string {
|
||||
return "resource"
|
||||
}
|
||||
|
||||
func (m *Module) RegisterRoutes(router chi.Router) {
|
||||
router.Route("/resources", func(r chi.Router) {
|
||||
r.Get("/", m.listResources)
|
||||
r.Post("/", m.createResource)
|
||||
r.Get("/{resourceID}", m.getResource)
|
||||
r.Put("/{resourceID}", m.updateResource)
|
||||
r.Put("/{resourceID}/secret", m.upsertResourceSecret)
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) listResources(w http.ResponseWriter, r *http.Request) {
|
||||
userID := r.URL.Query().Get("user_id")
|
||||
orgID := r.URL.Query().Get("organization_id")
|
||||
if userID == "" {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "user_id is required")
|
||||
return
|
||||
}
|
||||
platformRole, err := m.getPlatformRole(r.Context(), userID)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
query := `
|
||||
SELECT r.id, r.organization_id, r.name, r.address, r.protocol, r.secret_ref,
|
||||
r.certificate_verification_mode, r.metadata, r.created_at, r.updated_at,
|
||||
COALESCE(rp.clipboard_mode, 'disabled') AS clipboard_mode,
|
||||
COALESCE(rp.file_transfer_mode, 'disabled') AS file_transfer_mode
|
||||
FROM resources r
|
||||
LEFT JOIN resource_policies rp ON rp.resource_id = r.id
|
||||
`
|
||||
args := make([]any, 0, 2)
|
||||
if platformRole == "platform_admin" || platformRole == "platform_recovery_admin" {
|
||||
if orgID != "" {
|
||||
query += ` WHERE r.organization_id = $1`
|
||||
args = append(args, orgID)
|
||||
}
|
||||
query += ` ORDER BY r.created_at DESC`
|
||||
} else {
|
||||
query += `
|
||||
INNER JOIN organization_memberships om ON om.organization_id = r.organization_id
|
||||
WHERE om.user_id = $1 AND om.status = 'active'
|
||||
`
|
||||
args = append(args, userID)
|
||||
if orgID != "" {
|
||||
query += ` AND r.organization_id = $2`
|
||||
args = append(args, orgID)
|
||||
}
|
||||
query += ` ORDER BY r.created_at DESC`
|
||||
}
|
||||
rows, err := m.db.Query(r.Context(), query, args...)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
resources := make([]Resource, 0)
|
||||
for rows.Next() {
|
||||
resource, err := scanResource(rows)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
resources = append(resources, resource)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{"resources": resources})
|
||||
}
|
||||
|
||||
func (m *Module) getResource(w http.ResponseWriter, r *http.Request) {
|
||||
userID := r.URL.Query().Get("user_id")
|
||||
if userID == "" {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "user_id is required")
|
||||
return
|
||||
}
|
||||
resource, err := m.getByID(r.Context(), chi.URLParam(r, "resourceID"))
|
||||
if err != nil {
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
httpx.WriteError(w, http.StatusNotFound, "resource not found")
|
||||
return
|
||||
}
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
if err := m.ensureResourceAccess(r.Context(), resource.OrganizationID, userID, false); err != nil {
|
||||
httpx.WriteError(w, http.StatusForbidden, err.Error())
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{"resource": resource})
|
||||
}
|
||||
|
||||
func (m *Module) createResource(w http.ResponseWriter, r *http.Request) {
|
||||
req, err := decodeUpsertRequest(r)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
if err := secrets.ValidateResourceSecretReadiness(req.Protocol, req.SecretRef, req.Metadata, m.appEnv); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
now := time.Now().UTC()
|
||||
resource := Resource{
|
||||
ID: uuid.NewString(),
|
||||
OrganizationID: req.OrganizationID,
|
||||
Name: req.Name,
|
||||
Address: req.Address,
|
||||
Protocol: req.Protocol,
|
||||
SecretRef: req.SecretRef,
|
||||
CertificateVerificationMode: req.CertificateVerificationMode,
|
||||
RenderQualityProfile: req.RenderQualityProfile,
|
||||
ClipboardMode: req.ClipboardMode,
|
||||
FileTransferMode: req.FileTransferMode,
|
||||
Metadata: req.Metadata,
|
||||
CreatedAt: now,
|
||||
UpdatedAt: now,
|
||||
}
|
||||
if err := m.ensureResourceAccess(r.Context(), req.OrganizationID, req.ActorUserID, true); err != nil {
|
||||
httpx.WriteError(w, http.StatusForbidden, err.Error())
|
||||
return
|
||||
}
|
||||
tx, err := m.db.Begin(r.Context())
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
defer tx.Rollback(r.Context())
|
||||
if _, err := tx.Exec(r.Context(), `
|
||||
INSERT INTO resources (
|
||||
id, organization_id, name, address, protocol, secret_ref, certificate_verification_mode, metadata, created_at, updated_at
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8::jsonb, $9, $10)
|
||||
`, resource.ID, resource.OrganizationID, resource.Name, resource.Address, resource.Protocol, resource.SecretRef, resource.CertificateVerificationMode, []byte(resource.Metadata), resource.CreatedAt, resource.UpdatedAt); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if err := upsertResourcePolicy(r.Context(), tx, resource.ID, resource.ClipboardMode, resource.FileTransferMode, now); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if err := tx.Commit(r.Context()); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
httpx.WriteJSON(w, http.StatusCreated, map[string]any{"resource": resource})
|
||||
}
|
||||
|
||||
func (m *Module) updateResource(w http.ResponseWriter, r *http.Request) {
|
||||
req, err := decodeUpsertRequest(r)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
if err := secrets.ValidateResourceSecretReadiness(req.Protocol, req.SecretRef, req.Metadata, m.appEnv); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
resourceID := chi.URLParam(r, "resourceID")
|
||||
existing, err := m.getByID(r.Context(), resourceID)
|
||||
if err != nil {
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
httpx.WriteError(w, http.StatusNotFound, "resource not found")
|
||||
return
|
||||
}
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if err := m.ensureResourceAccess(r.Context(), existing.OrganizationID, req.ActorUserID, true); err != nil {
|
||||
httpx.WriteError(w, http.StatusForbidden, err.Error())
|
||||
return
|
||||
}
|
||||
if req.OrganizationID != existing.OrganizationID {
|
||||
if err := m.ensureResourceAccess(r.Context(), req.OrganizationID, req.ActorUserID, true); err != nil {
|
||||
httpx.WriteError(w, http.StatusForbidden, err.Error())
|
||||
return
|
||||
}
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
tx, err := m.db.Begin(r.Context())
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
defer tx.Rollback(r.Context())
|
||||
tag, err := tx.Exec(r.Context(), `
|
||||
UPDATE resources
|
||||
SET
|
||||
organization_id = $2,
|
||||
name = $3,
|
||||
address = $4,
|
||||
protocol = $5,
|
||||
secret_ref = $6,
|
||||
certificate_verification_mode = $7,
|
||||
metadata = $8::jsonb,
|
||||
updated_at = $9
|
||||
WHERE id = $1
|
||||
`, resourceID, req.OrganizationID, req.Name, req.Address, req.Protocol, req.SecretRef, req.CertificateVerificationMode, []byte(req.Metadata), now)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if tag.RowsAffected() == 0 {
|
||||
httpx.WriteError(w, http.StatusNotFound, "resource not found")
|
||||
return
|
||||
}
|
||||
if err := upsertResourcePolicy(r.Context(), tx, resourceID, req.ClipboardMode, req.FileTransferMode, now); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if err := tx.Commit(r.Context()); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
resource, err := m.getByID(r.Context(), resourceID)
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{"resource": resource})
|
||||
}
|
||||
|
||||
func (m *Module) upsertResourceSecret(w http.ResponseWriter, r *http.Request) {
|
||||
if m.secretStore == nil {
|
||||
httpx.WriteError(w, http.StatusServiceUnavailable, "resource secret encryption is not configured")
|
||||
return
|
||||
}
|
||||
var req upsertResourceSecretRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid resource secret payload")
|
||||
return
|
||||
}
|
||||
if req.ActorUserID == "" {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "actor_user_id is required")
|
||||
return
|
||||
}
|
||||
resourceID := chi.URLParam(r, "resourceID")
|
||||
resource, err := m.getByID(r.Context(), resourceID)
|
||||
if err != nil {
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
httpx.WriteError(w, http.StatusNotFound, "resource not found")
|
||||
return
|
||||
}
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if err := m.ensureResourceAccess(r.Context(), resource.OrganizationID, req.ActorUserID, true); err != nil {
|
||||
httpx.WriteError(w, http.StatusForbidden, err.Error())
|
||||
return
|
||||
}
|
||||
tx, err := m.db.Begin(r.Context())
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
defer tx.Rollback(r.Context())
|
||||
secretStore := m.secretStore.WithDB(tx)
|
||||
secretRef := secrets.DefaultResourceSecretRef(resource.OrganizationID, resource.ID)
|
||||
descriptor, err := secretStore.Upsert(r.Context(), secrets.UpsertResourceSecretCommand{
|
||||
OrganizationID: resource.OrganizationID,
|
||||
ResourceID: resource.ID,
|
||||
Protocol: resource.Protocol,
|
||||
SecretRef: secretRef,
|
||||
Payload: req.Payload,
|
||||
Metadata: req.Metadata,
|
||||
ActorUserID: req.ActorUserID,
|
||||
})
|
||||
if err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
if _, err := tx.Exec(r.Context(), `
|
||||
UPDATE resources
|
||||
SET secret_ref = $2, updated_at = $3
|
||||
WHERE id = $1::uuid
|
||||
`, resource.ID, descriptor.SecretRef, time.Now().UTC()); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if err := writeAuditEvent(r.Context(), tx, "resource_secret_rotated", req.ActorUserID, "resource_secret", descriptor.SecretRef, map[string]any{
|
||||
"resource_id": resource.ID,
|
||||
"organization_id": resource.OrganizationID,
|
||||
"protocol": resource.Protocol,
|
||||
"version": descriptor.Version,
|
||||
"secret_ref": descriptor.SecretRef,
|
||||
}); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if err := tx.Commit(r.Context()); err != nil {
|
||||
httpx.WriteError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{"secret": descriptor})
|
||||
}
|
||||
|
||||
func decodeUpsertRequest(r *http.Request) (*upsertResourceRequest, error) {
|
||||
var req upsertResourceRequest
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
return nil, errors.New("invalid resource payload")
|
||||
}
|
||||
if req.Name == "" {
|
||||
return nil, errors.New("name is required")
|
||||
}
|
||||
if req.ActorUserID == "" {
|
||||
return nil, errors.New("actor_user_id is required")
|
||||
}
|
||||
if req.OrganizationID == "" {
|
||||
return nil, errors.New("organization_id is required")
|
||||
}
|
||||
if req.Address == "" {
|
||||
return nil, errors.New("address is required")
|
||||
}
|
||||
if req.Protocol == "" {
|
||||
req.Protocol = "rdp"
|
||||
}
|
||||
mode, err := normalizeCertificateVerificationMode(req.CertificateVerificationMode)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.CertificateVerificationMode = mode
|
||||
renderQualityProfile, err := normalizeRenderQualityProfile(req.RenderQualityProfile)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.RenderQualityProfile = renderQualityProfile
|
||||
clipboardMode, err := normalizeClipboardMode(req.ClipboardMode)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.ClipboardMode = clipboardMode
|
||||
fileTransferMode, err := normalizeFileTransferMode(req.FileTransferMode)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.FileTransferMode = fileTransferMode
|
||||
metadata, err := normalizeMetadata(req.Metadata, req.CertificateVerificationMode, req.RenderQualityProfile)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Metadata = metadata
|
||||
return &req, nil
|
||||
}
|
||||
|
||||
func normalizeCertificateVerificationMode(mode string) (string, error) {
|
||||
switch mode {
|
||||
case "", CertificateVerificationModeStrict:
|
||||
return CertificateVerificationModeStrict, nil
|
||||
case CertificateVerificationModeIgnore:
|
||||
return CertificateVerificationModeIgnore, nil
|
||||
default:
|
||||
return "", errors.New("certificate_verification_mode must be one of: strict, ignore")
|
||||
}
|
||||
}
|
||||
|
||||
func normalizeClipboardMode(mode string) (string, error) {
|
||||
switch mode {
|
||||
case "", ClipboardModeDisabled:
|
||||
return ClipboardModeDisabled, nil
|
||||
case ClipboardModeClientToServer, ClipboardModeServerToClient, ClipboardModeBidirectional:
|
||||
return mode, nil
|
||||
default:
|
||||
return "", errors.New("clipboard_mode must be one of: disabled, client_to_server, server_to_client, bidirectional")
|
||||
}
|
||||
}
|
||||
|
||||
func normalizeFileTransferMode(mode string) (string, error) {
|
||||
switch mode {
|
||||
case "", FileTransferModeDisabled:
|
||||
return FileTransferModeDisabled, nil
|
||||
case FileTransferModeClientToServer, FileTransferModeServerToClient, FileTransferModeBidirectional:
|
||||
return mode, nil
|
||||
default:
|
||||
return "", errors.New("file_transfer_mode must be one of: disabled, client_to_server, server_to_client, bidirectional")
|
||||
}
|
||||
}
|
||||
|
||||
// normalizeMetadata validates the client-supplied metadata and returns it
// re-encoded with the "certificate_verification_mode" and
// "render_quality_profile" keys set to the given (already normalized) values,
// overwriting any values the client supplied for those keys.
//
// Absent metadata (empty raw) and an explicit JSON null are both treated as
// an empty object. Input that is not valid JSON, or valid JSON that is not an
// object (array, string, number, ...), is rejected with an error.
func normalizeMetadata(raw json.RawMessage, certificateVerificationMode, renderQualityProfile string) (json.RawMessage, error) {
	if len(raw) == 0 {
		raw = json.RawMessage(`{}`)
	}
	if !json.Valid(raw) {
		return nil, errors.New("metadata must be valid json")
	}
	var metadata map[string]any
	if err := json.Unmarshal(raw, &metadata); err != nil {
		return nil, errors.New("metadata must be a json object")
	}
	// A JSON null unmarshals into a nil map without an error; assigning into
	// a nil map panics, so allocate one before writing the two keys.
	if metadata == nil {
		metadata = make(map[string]any, 2)
	}
	metadata["certificate_verification_mode"] = certificateVerificationMode
	metadata["render_quality_profile"] = renderQualityProfile
	encoded, err := json.Marshal(metadata)
	if err != nil {
		return nil, err
	}
	return json.RawMessage(encoded), nil
}
|
||||
|
||||
func (m *Module) getByID(ctx context.Context, resourceID string) (Resource, error) {
|
||||
row := m.db.QueryRow(ctx, `
|
||||
SELECT r.id, r.organization_id, r.name, r.address, r.protocol, r.secret_ref,
|
||||
r.certificate_verification_mode, r.metadata, r.created_at, r.updated_at,
|
||||
COALESCE(rp.clipboard_mode, 'disabled') AS clipboard_mode,
|
||||
COALESCE(rp.file_transfer_mode, 'disabled') AS file_transfer_mode
|
||||
FROM resources r
|
||||
LEFT JOIN resource_policies rp ON rp.resource_id = r.id
|
||||
WHERE r.id = $1
|
||||
`, resourceID)
|
||||
return scanResource(row)
|
||||
}
|
||||
|
||||
func (m *Module) ensureResourceAccess(ctx context.Context, orgID, userID string, adminRequired bool) error {
|
||||
role, err := m.getPlatformRole(ctx, userID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if role == "platform_admin" || role == "platform_recovery_admin" {
|
||||
return nil
|
||||
}
|
||||
var membershipRole string
|
||||
if err := m.db.QueryRow(ctx, `
|
||||
SELECT role_id
|
||||
FROM organization_memberships
|
||||
WHERE organization_id = $1 AND user_id = $2 AND status = 'active'
|
||||
`, orgID, userID).Scan(&membershipRole); err != nil {
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
return errors.New("forbidden")
|
||||
}
|
||||
return err
|
||||
}
|
||||
if adminRequired && membershipRole != "org_owner" && membershipRole != "org_admin" {
|
||||
return errors.New("forbidden")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *Module) getPlatformRole(ctx context.Context, userID string) (string, error) {
|
||||
return authority.EffectivePlatformRole(ctx, m.db, m.authority, userID)
|
||||
}
|
||||
|
||||
// rowScanner is the minimal Scan contract scanResource depends on; it is
// satisfied both by the single row returned from QueryRow and by the rows
// cursor iterated in listResources.
type rowScanner interface {
	// Scan copies the current row's columns into dest, in order.
	Scan(dest ...any) error
}
|
||||
|
||||
func scanResource(row rowScanner) (Resource, error) {
|
||||
var resource Resource
|
||||
if err := row.Scan(
|
||||
&resource.ID,
|
||||
&resource.OrganizationID,
|
||||
&resource.Name,
|
||||
&resource.Address,
|
||||
&resource.Protocol,
|
||||
&resource.SecretRef,
|
||||
&resource.CertificateVerificationMode,
|
||||
&resource.Metadata,
|
||||
&resource.CreatedAt,
|
||||
&resource.UpdatedAt,
|
||||
&resource.ClipboardMode,
|
||||
&resource.FileTransferMode,
|
||||
); err != nil {
|
||||
return Resource{}, err
|
||||
}
|
||||
if len(resource.Metadata) == 0 {
|
||||
resource.Metadata = json.RawMessage(`{}`)
|
||||
}
|
||||
if resource.CertificateVerificationMode == "" {
|
||||
resource.CertificateVerificationMode = CertificateVerificationModeStrict
|
||||
}
|
||||
if resource.RenderQualityProfile == "" {
|
||||
resource.RenderQualityProfile = renderQualityProfileFromMetadata(resource.Metadata)
|
||||
}
|
||||
if resource.ClipboardMode == "" {
|
||||
resource.ClipboardMode = ClipboardModeDisabled
|
||||
}
|
||||
if resource.FileTransferMode == "" {
|
||||
resource.FileTransferMode = FileTransferModeDisabled
|
||||
}
|
||||
return resource, nil
|
||||
}
|
||||
|
||||
func upsertResourcePolicy(ctx context.Context, tx pgx.Tx, resourceID, clipboardMode, fileTransferMode string, now time.Time) error {
|
||||
clipboardEnabled := clipboardMode != ClipboardModeDisabled
|
||||
fileTransferEnabled := fileTransferMode == FileTransferModeClientToServer || fileTransferMode == FileTransferModeBidirectional
|
||||
_, err := tx.Exec(ctx, `
|
||||
INSERT INTO resource_policies (
|
||||
resource_id, clipboard_enabled, clipboard_mode, file_transfer_enabled, file_transfer_mode, created_at, updated_at
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $6)
|
||||
ON CONFLICT (resource_id) DO UPDATE SET
|
||||
clipboard_enabled = EXCLUDED.clipboard_enabled,
|
||||
clipboard_mode = EXCLUDED.clipboard_mode,
|
||||
file_transfer_enabled = EXCLUDED.file_transfer_enabled,
|
||||
file_transfer_mode = EXCLUDED.file_transfer_mode,
|
||||
updated_at = EXCLUDED.updated_at
|
||||
`, resourceID, clipboardEnabled, clipboardMode, fileTransferEnabled, fileTransferMode, now)
|
||||
return err
|
||||
}
|
||||
|
||||
func writeAuditEvent(ctx context.Context, tx pgx.Tx, eventType, actorUserID, targetType, targetID string, payload map[string]any) error {
|
||||
encoded, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = tx.Exec(ctx, `
|
||||
INSERT INTO audit_events (
|
||||
id, actor_user_id, event_type, target_type, target_id, payload, created_at
|
||||
) VALUES (
|
||||
$1::uuid, NULLIF($2, '')::uuid, $3, $4, $5, $6::jsonb, $7
|
||||
)
|
||||
`, uuid.NewString(), actorUserID, eventType, targetType, targetID, encoded, time.Now().UTC())
|
||||
return err
|
||||
}
|
||||
|
||||
func normalizeRenderQualityProfile(profile string) (string, error) {
|
||||
switch profile {
|
||||
case "", RenderQualityProfileBalanced:
|
||||
return RenderQualityProfileBalanced, nil
|
||||
case RenderQualityProfileLowBandwidth, RenderQualityProfileHighQuality, RenderQualityProfileTextPriority:
|
||||
return profile, nil
|
||||
default:
|
||||
return "", errors.New("render_quality_profile must be one of: low_bandwidth, balanced, high_quality, text_priority")
|
||||
}
|
||||
}
|
||||
|
||||
func renderQualityProfileFromMetadata(raw json.RawMessage) string {
|
||||
if len(raw) == 0 {
|
||||
return RenderQualityProfileBalanced
|
||||
}
|
||||
var metadata map[string]any
|
||||
if err := json.Unmarshal(raw, &metadata); err != nil {
|
||||
return RenderQualityProfileBalanced
|
||||
}
|
||||
if profile, ok := metadata["render_quality_profile"].(string); ok {
|
||||
switch profile {
|
||||
case RenderQualityProfileLowBandwidth, RenderQualityProfileBalanced, RenderQualityProfileHighQuality, RenderQualityProfileTextPriority:
|
||||
return profile
|
||||
}
|
||||
}
|
||||
return RenderQualityProfileBalanced
|
||||
}
|
||||
@@ -0,0 +1,219 @@
|
||||
package sessionbroker
|
||||
|
||||
import (
|
||||
"crypto/rsa"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/golang-jwt/jwt/v5"
|
||||
"github.com/google/uuid"
|
||||
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/secrets"
|
||||
sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
|
||||
)
|
||||
|
||||
// TLS trust modes for the direct worker WSS data-plane candidate.
// normalizeDirectWorkerTLSTrustMode falls back to smoke_insecure for any
// unrecognized value.
const (
	directWorkerTLSTrustModeSmokeInsecure = "smoke_insecure"
	directWorkerTLSTrustModePublicCA      = "public_ca"
	directWorkerTLSTrustModePlatformCA    = "platform_ca"
)
|
||||
|
||||
type DataPlaneTokenClaims struct {
|
||||
SessionID string `json:"session_id"`
|
||||
AttachmentID string `json:"attachment_id"`
|
||||
UserID string `json:"user_id"`
|
||||
OrganizationID string `json:"organization_id"`
|
||||
ClusterID string `json:"cluster_id,omitempty"`
|
||||
WorkerID string `json:"worker_id"`
|
||||
ResourceID string `json:"resource_id"`
|
||||
AllowedChannels []string `json:"allowed_channels"`
|
||||
ExpiresAtValue time.Time `json:"expires_at"`
|
||||
jwt.RegisteredClaims
|
||||
}
|
||||
|
||||
func (s *Service) buildDataPlaneOffer(session RemoteSession, attachment SessionAttachment) (*sessioncontracts.DataPlaneOffer, error) {
|
||||
if s.cfg.DataPlane.TokenTTL <= 0 || s.cfg.DataPlane.TokenPrivateKeyPEM == "" {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
now := s.now().UTC()
|
||||
expiresAt := now.Add(s.cfg.DataPlane.TokenTTL)
|
||||
allowedChannels := dataPlaneAllowedChannelsFromSession(session)
|
||||
jti := uuid.NewString()
|
||||
claims := DataPlaneTokenClaims{
|
||||
SessionID: session.ID,
|
||||
AttachmentID: attachment.ID,
|
||||
UserID: attachment.UserID,
|
||||
OrganizationID: session.OrganizationID,
|
||||
WorkerID: session.WorkerID,
|
||||
ResourceID: session.ResourceID,
|
||||
AllowedChannels: allowedChannels,
|
||||
ExpiresAtValue: expiresAt,
|
||||
RegisteredClaims: jwt.RegisteredClaims{
|
||||
ID: jti,
|
||||
Issuer: s.cfg.Auth.Issuer,
|
||||
Subject: attachment.UserID,
|
||||
Audience: jwt.ClaimStrings{"rap-data-plane", "worker:" + session.WorkerID},
|
||||
IssuedAt: jwt.NewNumericDate(now),
|
||||
NotBefore: jwt.NewNumericDate(now),
|
||||
ExpiresAt: jwt.NewNumericDate(expiresAt),
|
||||
},
|
||||
}
|
||||
token, err := signDataPlaneToken(claims, s.cfg.DataPlane.TokenPrivateKeyPEM)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
candidates := s.buildDataPlaneCandidates(session)
|
||||
preferred := sessioncontracts.DataPlaneCandidateBackendGateway
|
||||
if len(candidates) > 0 {
|
||||
preferred = candidates[0].Type
|
||||
}
|
||||
|
||||
return &sessioncontracts.DataPlaneOffer{
|
||||
Preferred: preferred,
|
||||
Token: token,
|
||||
ExpiresAt: expiresAt,
|
||||
Candidates: candidates,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (s *Service) buildDataPlaneCandidates(session RemoteSession) []sessioncontracts.DataPlaneCandidate {
|
||||
var candidates []sessioncontracts.DataPlaneCandidate
|
||||
if directURL := s.directWorkerWSSURL(session.WorkerID); directURL != "" && s.canAdvertiseDirectWorkerWSS() {
|
||||
metadata := map[string]any(nil)
|
||||
if s.cfg.DataPlane.DirectWorkerJSONRuntime {
|
||||
metadata = map[string]any{
|
||||
"runtime_transport": "json_v1",
|
||||
"traffic_ready": true,
|
||||
}
|
||||
s.addDirectWorkerTLSTrustMetadata(metadata)
|
||||
if s.cfg.DataPlane.DirectWorkerBinaryRender {
|
||||
metadata["render_transport"] = "binary_v1"
|
||||
metadata["binary_render"] = true
|
||||
metadata["supported_color_modes"] = []string{"full_color", "grayscale"}
|
||||
metadata["default_color_mode"] = "full_color"
|
||||
}
|
||||
}
|
||||
candidates = append(candidates, sessioncontracts.DataPlaneCandidate{
|
||||
Type: sessioncontracts.DataPlaneCandidateDirectWorkerWSS,
|
||||
URL: directURL,
|
||||
WorkerID: session.WorkerID,
|
||||
Priority: 10,
|
||||
Metadata: metadata,
|
||||
})
|
||||
}
|
||||
if s.cfg.DataPlane.BackendGatewayURL != "" {
|
||||
candidates = append(candidates, sessioncontracts.DataPlaneCandidate{
|
||||
Type: sessioncontracts.DataPlaneCandidateBackendGateway,
|
||||
URL: s.cfg.DataPlane.BackendGatewayURL,
|
||||
Priority: 100,
|
||||
})
|
||||
}
|
||||
return candidates
|
||||
}
|
||||
|
||||
func (s *Service) canAdvertiseDirectWorkerWSS() bool {
|
||||
trustMode := normalizeDirectWorkerTLSTrustMode(s.cfg.DataPlane.DirectWorkerTLSTrustMode)
|
||||
return !secrets.IsProductionEnv(s.cfg.App.Env) || directWorkerTLSTrustModeIsProductionTrusted(trustMode)
|
||||
}
|
||||
|
||||
func (s *Service) addDirectWorkerTLSTrustMetadata(metadata map[string]any) {
|
||||
trustMode := normalizeDirectWorkerTLSTrustMode(s.cfg.DataPlane.DirectWorkerTLSTrustMode)
|
||||
metadata["tls_trust_mode"] = trustMode
|
||||
metadata["production_trusted"] = directWorkerTLSTrustModeIsProductionTrusted(trustMode)
|
||||
metadata["smoke_only"] = trustMode == directWorkerTLSTrustModeSmokeInsecure
|
||||
if s.cfg.DataPlane.DirectWorkerTLSCARef != "" {
|
||||
metadata["tls_ca_ref"] = s.cfg.DataPlane.DirectWorkerTLSCARef
|
||||
}
|
||||
}
|
||||
|
||||
func normalizeDirectWorkerTLSTrustMode(mode string) string {
|
||||
switch strings.ToLower(strings.TrimSpace(mode)) {
|
||||
case directWorkerTLSTrustModePublicCA:
|
||||
return directWorkerTLSTrustModePublicCA
|
||||
case directWorkerTLSTrustModePlatformCA:
|
||||
return directWorkerTLSTrustModePlatformCA
|
||||
default:
|
||||
return directWorkerTLSTrustModeSmokeInsecure
|
||||
}
|
||||
}
|
||||
|
||||
func directWorkerTLSTrustModeIsProductionTrusted(mode string) bool {
|
||||
return mode == directWorkerTLSTrustModePublicCA || mode == directWorkerTLSTrustModePlatformCA
|
||||
}
|
||||
|
||||
func (s *Service) directWorkerWSSURL(workerID string) string {
|
||||
template := strings.TrimSpace(s.cfg.DataPlane.DirectWorkerWSSURLTemplate)
|
||||
if template == "" || workerID == "" {
|
||||
return ""
|
||||
}
|
||||
return strings.ReplaceAll(template, "{worker_id}", workerID)
|
||||
}
|
||||
|
||||
func signDataPlaneToken(claims DataPlaneTokenClaims, privateKeyPEM string) (string, error) {
|
||||
privateKey, err := jwt.ParseRSAPrivateKeyFromPEM([]byte(privateKeyPEM))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("parse data-plane private key: %w", err)
|
||||
}
|
||||
token := jwt.NewWithClaims(jwt.SigningMethodRS256, claims)
|
||||
signed, err := token.SignedString(privateKey)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("sign data-plane token: %w", err)
|
||||
}
|
||||
return signed, nil
|
||||
}
|
||||
|
||||
func parseDataPlaneToken(tokenValue string, publicKey *rsa.PublicKey) (*DataPlaneTokenClaims, error) {
|
||||
claims := &DataPlaneTokenClaims{}
|
||||
token, err := jwt.ParseWithClaims(tokenValue, claims, func(token *jwt.Token) (any, error) {
|
||||
if token.Method != jwt.SigningMethodRS256 {
|
||||
return nil, fmt.Errorf("unexpected data-plane signing method: %s", token.Header["alg"])
|
||||
}
|
||||
return publicKey, nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if !token.Valid {
|
||||
return nil, fmt.Errorf("data-plane token invalid")
|
||||
}
|
||||
return claims, nil
|
||||
}
|
||||
|
||||
func dataPlaneAllowedChannelsFromSession(session RemoteSession) []string {
|
||||
channels := []string{
|
||||
sessioncontracts.DataPlaneChannelControl,
|
||||
sessioncontracts.DataPlaneChannelInput,
|
||||
sessioncontracts.DataPlaneChannelRender,
|
||||
sessioncontracts.DataPlaneChannelTelemetry,
|
||||
}
|
||||
metadata := decodeJSONMap(session.Metadata)
|
||||
policy, _ := metadata["policy"].(map[string]any)
|
||||
if policy != nil {
|
||||
if mode, _ := policy["clipboard_mode"].(string); mode != "" && mode != string(ResourceClipboardModeDisabled) {
|
||||
channels = append(channels, sessioncontracts.DataPlaneChannelClipboard)
|
||||
}
|
||||
if mode, _ := policy["file_transfer_mode"].(string); fileTransferAllowsClientToServer(ResourceFileTransferMode(mode)) {
|
||||
channels = append(channels, sessioncontracts.DataPlaneChannelFileUpload)
|
||||
}
|
||||
if mode, _ := policy["file_transfer_mode"].(string); fileTransferAllowsServerToClient(ResourceFileTransferMode(mode)) {
|
||||
channels = append(channels, sessioncontracts.DataPlaneChannelFileDownload)
|
||||
}
|
||||
}
|
||||
return channels
|
||||
}
|
||||
|
||||
func (s *Service) attachDataPlaneOffer(result *SessionControlResult) error {
|
||||
if result == nil || result.Attachment == nil {
|
||||
return nil
|
||||
}
|
||||
result.GatewayURL = s.cfg.DataPlane.BackendGatewayURL
|
||||
offer, err := s.buildDataPlaneOffer(result.Session, *result.Attachment)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
result.DataPlane = offer
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,357 @@
|
||||
package sessionbroker
|
||||
|
||||
import (
|
||||
"crypto/rand"
|
||||
"crypto/rsa"
|
||||
"crypto/x509"
|
||||
"encoding/json"
|
||||
"encoding/pem"
|
||||
"slices"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/config"
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/module"
|
||||
sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
|
||||
)
|
||||
|
||||
func TestDataPlaneTokenScopeValidation(t *testing.T) {
|
||||
now := time.Now().UTC().Truncate(time.Second)
|
||||
privateKeyPEM, publicKey := testRS256Key(t)
|
||||
service := &Service{
|
||||
cfg: module.Config{
|
||||
Auth: config.AuthConfig{
|
||||
Issuer: "rap-api-test",
|
||||
},
|
||||
DataPlane: config.DataPlaneConfig{
|
||||
TokenTTL: time.Minute,
|
||||
TokenPrivateKeyPEM: privateKeyPEM,
|
||||
BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws",
|
||||
},
|
||||
},
|
||||
now: func() time.Time { return now },
|
||||
}
|
||||
session := RemoteSession{
|
||||
ID: "session-1",
|
||||
OrganizationID: "org-1",
|
||||
ResourceID: "resource-1",
|
||||
WorkerID: "worker-1",
|
||||
Metadata: mustJSON(t, map[string]any{"policy": map[string]any{"clipboard_mode": "bidirectional", "file_transfer_mode": "client_to_server"}}),
|
||||
}
|
||||
attachment := SessionAttachment{
|
||||
ID: "attachment-1",
|
||||
UserID: "user-1",
|
||||
}
|
||||
|
||||
offer, err := service.buildDataPlaneOffer(session, attachment)
|
||||
if err != nil {
|
||||
t.Fatalf("buildDataPlaneOffer returned error: %v", err)
|
||||
}
|
||||
if offer == nil {
|
||||
t.Fatal("expected data-plane offer")
|
||||
}
|
||||
|
||||
claims, err := parseDataPlaneToken(offer.Token, publicKey)
|
||||
if err != nil {
|
||||
t.Fatalf("parseDataPlaneToken returned error: %v", err)
|
||||
}
|
||||
assertEqual(t, claims.SessionID, session.ID, "session_id")
|
||||
assertEqual(t, claims.AttachmentID, attachment.ID, "attachment_id")
|
||||
assertEqual(t, claims.UserID, attachment.UserID, "user_id")
|
||||
assertEqual(t, claims.OrganizationID, session.OrganizationID, "organization_id")
|
||||
assertEqual(t, claims.WorkerID, session.WorkerID, "worker_id")
|
||||
assertEqual(t, claims.ResourceID, session.ResourceID, "resource_id")
|
||||
if claims.ID == "" {
|
||||
t.Fatal("expected jti")
|
||||
}
|
||||
if claims.ExpiresAt == nil || !claims.ExpiresAt.Time.Equal(now.Add(time.Minute)) {
|
||||
t.Fatalf("unexpected expires_at: %v", claims.ExpiresAt)
|
||||
}
|
||||
if !claims.ExpiresAtValue.Equal(now.Add(time.Minute)) {
|
||||
t.Fatalf("unexpected expires_at claim value: %v", claims.ExpiresAtValue)
|
||||
}
|
||||
for _, channel := range []string{
|
||||
sessioncontracts.DataPlaneChannelControl,
|
||||
sessioncontracts.DataPlaneChannelInput,
|
||||
sessioncontracts.DataPlaneChannelRender,
|
||||
sessioncontracts.DataPlaneChannelTelemetry,
|
||||
sessioncontracts.DataPlaneChannelClipboard,
|
||||
sessioncontracts.DataPlaneChannelFileUpload,
|
||||
} {
|
||||
if !slices.Contains(claims.AllowedChannels, channel) {
|
||||
t.Fatalf("expected allowed channel %q in %v", channel, claims.AllowedChannels)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestDataPlaneOfferResponseShapeCompatibility(t *testing.T) {
|
||||
now := time.Now().UTC().Truncate(time.Second)
|
||||
privateKeyPEM, _ := testRS256Key(t)
|
||||
service := &Service{
|
||||
cfg: module.Config{
|
||||
Auth: config.AuthConfig{Issuer: "rap-api-test"},
|
||||
DataPlane: config.DataPlaneConfig{
|
||||
TokenTTL: time.Minute,
|
||||
TokenPrivateKeyPEM: privateKeyPEM,
|
||||
BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws",
|
||||
DirectWorkerWSSURLTemplate: "wss://{worker_id}.worker.example.test/rap/v1/data-plane",
|
||||
DirectWorkerJSONRuntime: true,
|
||||
DirectWorkerTLSTrustMode: "smoke_insecure",
|
||||
},
|
||||
},
|
||||
now: func() time.Time { return now },
|
||||
}
|
||||
result := &SessionControlResult{
|
||||
Session: RemoteSession{
|
||||
ID: "session-1",
|
||||
OrganizationID: "org-1",
|
||||
ResourceID: "resource-1",
|
||||
WorkerID: "worker-1",
|
||||
Metadata: mustJSON(t, map[string]any{"policy": map[string]any{"clipboard_mode": "disabled", "file_transfer_mode": "disabled"}}),
|
||||
},
|
||||
Attachment: &SessionAttachment{ID: "attachment-1", UserID: "user-1"},
|
||||
AttachToken: &sessioncontracts.AttachTokenClaims{
|
||||
Token: "existing-attach-token",
|
||||
SessionID: "session-1",
|
||||
AttachmentID: "attachment-1",
|
||||
UserID: "user-1",
|
||||
WorkerID: "worker-1",
|
||||
ExpiresAt: now.Add(2 * time.Minute),
|
||||
},
|
||||
}
|
||||
|
||||
if err := service.attachDataPlaneOffer(result); err != nil {
|
||||
t.Fatalf("attachDataPlaneOffer returned error: %v", err)
|
||||
}
|
||||
payload, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal response: %v", err)
|
||||
}
|
||||
var decoded map[string]any
|
||||
if err := json.Unmarshal(payload, &decoded); err != nil {
|
||||
t.Fatalf("decode response: %v", err)
|
||||
}
|
||||
if decoded["session"] == nil || decoded["attachment"] == nil || decoded["attach_token"] == nil {
|
||||
t.Fatalf("response lost existing fields: %s", payload)
|
||||
}
|
||||
if decoded["data_plane"] == nil || decoded["gateway_url"] == nil {
|
||||
t.Fatalf("response missing data-plane fields: %s", payload)
|
||||
}
|
||||
if result.DataPlane == nil {
|
||||
t.Fatal("expected data-plane offer")
|
||||
}
|
||||
if result.DataPlane.Preferred != sessioncontracts.DataPlaneCandidateDirectWorkerWSS {
|
||||
t.Fatalf("unexpected preferred candidate: %s", result.DataPlane.Preferred)
|
||||
}
|
||||
if len(result.DataPlane.Candidates) != 2 {
|
||||
t.Fatalf("expected direct and fallback candidates, got %d", len(result.DataPlane.Candidates))
|
||||
}
|
||||
if result.DataPlane.Candidates[0].URL != "wss://worker-1.worker.example.test/rap/v1/data-plane" {
|
||||
t.Fatalf("unexpected direct candidate URL: %s", result.DataPlane.Candidates[0].URL)
|
||||
}
|
||||
if result.DataPlane.Candidates[0].Metadata["runtime_transport"] != "json_v1" {
|
||||
t.Fatalf("direct candidate is missing json_v1 runtime metadata: %#v", result.DataPlane.Candidates[0].Metadata)
|
||||
}
|
||||
if result.DataPlane.Candidates[0].Metadata["traffic_ready"] != true {
|
||||
t.Fatalf("direct candidate is missing traffic_ready metadata: %#v", result.DataPlane.Candidates[0].Metadata)
|
||||
}
|
||||
if result.DataPlane.Candidates[0].Metadata["smoke_only"] != true {
|
||||
t.Fatalf("direct candidate should be marked smoke-only by default: %#v", result.DataPlane.Candidates[0].Metadata)
|
||||
}
|
||||
if result.DataPlane.Candidates[0].Metadata["production_trusted"] != false {
|
||||
t.Fatalf("smoke direct candidate must not be production-trusted: %#v", result.DataPlane.Candidates[0].Metadata)
|
||||
}
|
||||
if !strings.Contains(result.DataPlane.Candidates[1].URL, "/api/v1/gateway/ws") {
|
||||
t.Fatalf("unexpected backend candidate URL: %s", result.DataPlane.Candidates[1].URL)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDataPlaneDirectCandidateMetadataRequiresRuntimeFlag(t *testing.T) {
|
||||
service := &Service{
|
||||
cfg: module.Config{
|
||||
DataPlane: config.DataPlaneConfig{
|
||||
BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws",
|
||||
DirectWorkerWSSURLTemplate: "wss://{worker_id}.worker.example.test/rap/v1/data-plane",
|
||||
DirectWorkerTLSTrustMode: "smoke_insecure",
|
||||
},
|
||||
},
|
||||
}
|
||||
candidates := service.buildDataPlaneCandidates(RemoteSession{WorkerID: "worker-1"})
|
||||
if len(candidates) != 2 {
|
||||
t.Fatalf("expected direct and fallback candidates, got %d", len(candidates))
|
||||
}
|
||||
if candidates[0].Metadata != nil {
|
||||
t.Fatalf("direct candidate must not advertise json_v1 before runtime flag is enabled: %#v", candidates[0].Metadata)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDataPlaneDirectCandidateAdvertisesBinaryRenderOnlyWhenEnabled(t *testing.T) {
|
||||
service := &Service{
|
||||
cfg: module.Config{
|
||||
DataPlane: config.DataPlaneConfig{
|
||||
BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws",
|
||||
DirectWorkerWSSURLTemplate: "wss://{worker_id}.worker.example.test/rap/v1/data-plane",
|
||||
DirectWorkerJSONRuntime: true,
|
||||
DirectWorkerBinaryRender: true,
|
||||
DirectWorkerTLSTrustMode: "platform_ca",
|
||||
DirectWorkerTLSCARef: "rap-platform-ca:v1",
|
||||
},
|
||||
},
|
||||
}
|
||||
candidates := service.buildDataPlaneCandidates(RemoteSession{WorkerID: "worker-1"})
|
||||
if len(candidates) != 2 {
|
||||
t.Fatalf("expected direct and fallback candidates, got %d", len(candidates))
|
||||
}
|
||||
if candidates[0].Metadata["render_transport"] != "binary_v1" {
|
||||
t.Fatalf("direct candidate is missing binary render metadata: %#v", candidates[0].Metadata)
|
||||
}
|
||||
if candidates[0].Metadata["binary_render"] != true {
|
||||
t.Fatalf("direct candidate is missing binary_render metadata: %#v", candidates[0].Metadata)
|
||||
}
|
||||
if candidates[0].Metadata["default_color_mode"] != "full_color" {
|
||||
t.Fatalf("direct candidate is missing default_color_mode metadata: %#v", candidates[0].Metadata)
|
||||
}
|
||||
if candidates[0].Metadata["production_trusted"] != true || candidates[0].Metadata["tls_trust_mode"] != "platform_ca" {
|
||||
t.Fatalf("direct candidate is missing production trust metadata: %#v", candidates[0].Metadata)
|
||||
}
|
||||
if candidates[0].Metadata["tls_ca_ref"] != "rap-platform-ca:v1" {
|
||||
t.Fatalf("direct candidate is missing tls_ca_ref metadata: %#v", candidates[0].Metadata)
|
||||
}
|
||||
modes, ok := candidates[0].Metadata["supported_color_modes"].([]string)
|
||||
if !ok || !slices.Contains(modes, "full_color") || !slices.Contains(modes, "grayscale") {
|
||||
t.Fatalf("direct candidate is missing supported_color_modes metadata: %#v", candidates[0].Metadata)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDataPlaneDirectCandidateOmittedInProductionWhenSmokeOnly(t *testing.T) {
|
||||
service := &Service{
|
||||
cfg: module.Config{
|
||||
App: config.AppConfig{Env: "production"},
|
||||
DataPlane: config.DataPlaneConfig{
|
||||
BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws",
|
||||
DirectWorkerWSSURLTemplate: "wss://{worker_id}.worker.example.test/rap/v1/data-plane",
|
||||
DirectWorkerJSONRuntime: true,
|
||||
DirectWorkerTLSTrustMode: "smoke_insecure",
|
||||
},
|
||||
},
|
||||
}
|
||||
candidates := service.buildDataPlaneCandidates(RemoteSession{WorkerID: "worker-1"})
|
||||
if len(candidates) != 1 {
|
||||
t.Fatalf("expected fallback-only candidates in production with smoke TLS, got %d", len(candidates))
|
||||
}
|
||||
if candidates[0].Type != sessioncontracts.DataPlaneCandidateBackendGateway {
|
||||
t.Fatalf("production must not advertise smoke-only direct candidate: %#v", candidates)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDataPlaneDirectCandidateAdvertisedInProductionWhenTrusted(t *testing.T) {
|
||||
service := &Service{
|
||||
cfg: module.Config{
|
||||
App: config.AppConfig{Env: "production"},
|
||||
DataPlane: config.DataPlaneConfig{
|
||||
BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws",
|
||||
DirectWorkerWSSURLTemplate: "wss://{worker_id}.worker.example.test/rap/v1/data-plane",
|
||||
DirectWorkerJSONRuntime: true,
|
||||
DirectWorkerTLSTrustMode: "public_ca",
|
||||
},
|
||||
},
|
||||
}
|
||||
candidates := service.buildDataPlaneCandidates(RemoteSession{WorkerID: "worker-1"})
|
||||
if len(candidates) != 2 {
|
||||
t.Fatalf("expected trusted direct and fallback candidates, got %d", len(candidates))
|
||||
}
|
||||
if candidates[0].Metadata["production_trusted"] != true || candidates[0].Metadata["tls_trust_mode"] != "public_ca" {
|
||||
t.Fatalf("trusted production direct candidate metadata mismatch: %#v", candidates[0].Metadata)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDataPlaneCandidatesFallbackOnlyWhenDirectTemplateMissing(t *testing.T) {
|
||||
service := &Service{
|
||||
cfg: module.Config{
|
||||
DataPlane: config.DataPlaneConfig{
|
||||
BackendGatewayURL: "wss://backend.example.test/api/v1/gateway/ws",
|
||||
},
|
||||
},
|
||||
}
|
||||
candidates := service.buildDataPlaneCandidates(RemoteSession{WorkerID: "worker-1"})
|
||||
if len(candidates) != 1 {
|
||||
t.Fatalf("expected fallback-only candidate list, got %d", len(candidates))
|
||||
}
|
||||
if candidates[0].Type != sessioncontracts.DataPlaneCandidateBackendGateway {
|
||||
t.Fatalf("unexpected candidate type: %s", candidates[0].Type)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDataPlaneAllowedChannelsRespectRuntimePolicy(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
policy map[string]any
|
||||
expected []string
|
||||
blocked []string
|
||||
}{
|
||||
{
|
||||
name: "disabled policies expose only control input render telemetry",
|
||||
policy: map[string]any{"clipboard_mode": "disabled", "file_transfer_mode": "disabled"},
|
||||
expected: []string{sessioncontracts.DataPlaneChannelControl, sessioncontracts.DataPlaneChannelInput, sessioncontracts.DataPlaneChannelRender, sessioncontracts.DataPlaneChannelTelemetry},
|
||||
blocked: []string{sessioncontracts.DataPlaneChannelClipboard, sessioncontracts.DataPlaneChannelFileUpload},
|
||||
},
|
||||
{
|
||||
name: "clipboard policy adds clipboard channel",
|
||||
policy: map[string]any{"clipboard_mode": "server_to_client", "file_transfer_mode": "disabled"},
|
||||
expected: []string{sessioncontracts.DataPlaneChannelClipboard},
|
||||
blocked: []string{sessioncontracts.DataPlaneChannelFileUpload},
|
||||
},
|
||||
{
|
||||
name: "client upload policy adds file upload channel",
|
||||
policy: map[string]any{"clipboard_mode": "disabled", "file_transfer_mode": "client_to_server"},
|
||||
expected: []string{sessioncontracts.DataPlaneChannelFileUpload},
|
||||
blocked: []string{sessioncontracts.DataPlaneChannelClipboard},
|
||||
},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
session := RemoteSession{Metadata: mustJSON(t, map[string]any{"policy": tc.policy})}
|
||||
channels := dataPlaneAllowedChannelsFromSession(session)
|
||||
for _, channel := range tc.expected {
|
||||
if !slices.Contains(channels, channel) {
|
||||
t.Fatalf("expected channel %q in %v", channel, channels)
|
||||
}
|
||||
}
|
||||
for _, channel := range tc.blocked {
|
||||
if slices.Contains(channels, channel) {
|
||||
t.Fatalf("did not expect channel %q in %v", channel, channels)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// mustJSON returns value encoded as JSON and fails the calling test
// immediately if encoding is not possible.
func mustJSON(t *testing.T, value any) []byte {
	t.Helper()

	encoded, marshalErr := json.Marshal(value)
	if marshalErr != nil {
		t.Fatalf("marshal test metadata: %v", marshalErr)
	}
	return encoded
}
|
||||
|
||||
// testRS256Key generates a fresh 2048-bit RSA key pair for a test. It returns
// the private key as a PKCS#1 "RSA PRIVATE KEY" PEM string together with the
// matching public key, and fails the test if key generation fails.
func testRS256Key(t *testing.T) (string, *rsa.PublicKey) {
	t.Helper()

	key, genErr := rsa.GenerateKey(rand.Reader, 2048)
	if genErr != nil {
		t.Fatalf("generate RSA key: %v", genErr)
	}

	pkcs1 := pem.Block{
		Type:  "RSA PRIVATE KEY",
		Bytes: x509.MarshalPKCS1PrivateKey(key),
	}
	return string(pem.EncodeToMemory(&pkcs1)), &key.PublicKey
}
|
||||
|
||||
// assertEqual fails the calling test immediately when got differs from want.
// name labels the compared value in the failure message.
func assertEqual(t *testing.T, got, want, name string) {
	t.Helper()

	if got == want {
		return
	}
	t.Fatalf("unexpected %s: got %q want %q", name, got, want)
}
|
||||
@@ -0,0 +1,15 @@
|
||||
package sessionbroker
|
||||
|
||||
import "errors"
|
||||
|
||||
// Sentinel errors of the session broker package. Each is a distinct
// errors.New value, so callers should match them with errors.Is.
var (
	ErrSessionNotFound         = errors.New("remote session not found")
	ErrAttachmentNotFound      = errors.New("session attachment not found")
	ErrActiveControllerPresent = errors.New("active controller already present")
	ErrTakeoverNotAllowed      = errors.New("takeover not allowed")
	ErrTrustedDeviceRequired   = errors.New("trusted device required")
	ErrAccessDenied            = errors.New("access denied")
	ErrSessionNotAttachable    = errors.New("session is not attachable")
	ErrSessionNotTerminable    = errors.New("session is not terminable")
	ErrAttachTokenInvalid      = errors.New("attach token invalid or expired")
)
|
||||
@@ -0,0 +1,65 @@
|
||||
package sessionbroker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
|
||||
)
|
||||
|
||||
type LiveStateStore interface {
|
||||
UpsertSession(ctx context.Context, state LiveSessionState) error
|
||||
GetSession(ctx context.Context, sessionID string) (*LiveSessionState, error)
|
||||
DeleteSession(ctx context.Context, sessionID string) error
|
||||
BindController(ctx context.Context, binding sessioncontracts.ControllerBinding, ttl time.Duration) error
|
||||
GetControllerBinding(ctx context.Context, sessionID string) (*sessioncontracts.ControllerBinding, error)
|
||||
ClearControllerBinding(ctx context.Context, sessionID string) error
|
||||
StoreAttachToken(ctx context.Context, claims sessioncontracts.AttachTokenClaims, ttl time.Duration) error
|
||||
ConsumeAttachToken(ctx context.Context, token string) (*sessioncontracts.AttachTokenClaims, error)
|
||||
TouchAttachmentHeartbeat(ctx context.Context, sessionID, attachmentID string, ttl time.Duration) error
|
||||
UpdateWorkerRoute(ctx context.Context, route WorkerRoute, ttl time.Duration) error
|
||||
GetWorkerRoute(ctx context.Context, sessionID string) (*WorkerRoute, error)
|
||||
DeleteWorkerRoute(ctx context.Context, sessionID string) error
|
||||
}
|
||||
|
||||
type LiveSessionState struct {
|
||||
SessionID string `json:"session_id"`
|
||||
ResourceID string `json:"resource_id"`
|
||||
WorkerID string `json:"worker_id"`
|
||||
State sessioncontracts.State `json:"state"`
|
||||
ControllerID string `json:"controller_id"`
|
||||
AttachmentID string `json:"attachment_id"`
|
||||
TakeoverVersion int `json:"takeover_version"`
|
||||
RenderQualityProfile string `json:"render_quality_profile,omitempty"`
|
||||
RenderState string `json:"render_state,omitempty"`
|
||||
RenderWidth int `json:"render_width,omitempty"`
|
||||
RenderHeight int `json:"render_height,omitempty"`
|
||||
RenderFrameSequence int64 `json:"render_frame_sequence,omitempty"`
|
||||
RenderFrameFormat string `json:"render_frame_format,omitempty"`
|
||||
RenderFrameData string `json:"render_frame_data,omitempty"`
|
||||
LastInputCorrelationID string `json:"last_input_correlation_id,omitempty"`
|
||||
WorkerFrameCapturedAt string `json:"worker_frame_captured_at,omitempty"`
|
||||
CursorX int `json:"cursor_x,omitempty"`
|
||||
CursorY int `json:"cursor_y,omitempty"`
|
||||
CursorVisible bool `json:"cursor_visible,omitempty"`
|
||||
DirtyRectangles int `json:"dirty_rectangles,omitempty"`
|
||||
LastRenderAt *time.Time `json:"last_render_at,omitempty"`
|
||||
ClipboardSequence int64 `json:"clipboard_sequence,omitempty"`
|
||||
ClipboardText string `json:"clipboard_text,omitempty"`
|
||||
ClipboardOrigin string `json:"clipboard_origin,omitempty"`
|
||||
ClipboardContentHash string `json:"clipboard_content_hash,omitempty"`
|
||||
ClipboardUpdatedAt *time.Time `json:"clipboard_updated_at,omitempty"`
|
||||
FileDownloadSequence int64 `json:"file_download_sequence,omitempty"`
|
||||
FileDownloadType string `json:"file_download_type,omitempty"`
|
||||
FileDownloadPayload map[string]any `json:"file_download_payload,omitempty"`
|
||||
FileDownloadUpdatedAt *time.Time `json:"file_download_updated_at,omitempty"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
}
|
||||
|
||||
// WorkerRoute is the JSON-serializable record that associates a session with
// a worker, a lease and a control stream name. Every field is always encoded.
type WorkerRoute struct {
	SessionID     string    `json:"session_id"`
	WorkerID      string    `json:"worker_id"`
	LeaseID       string    `json:"lease_id"`
	ControlStream string    `json:"control_stream"`
	UpdatedAt     time.Time `json:"updated_at"`
}
|
||||
@@ -0,0 +1,132 @@
|
||||
package sessionbroker
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
|
||||
)
|
||||
|
||||
// AttachmentRole names the role an attachment plays within a session.
type AttachmentRole string

// Known attachment roles. Only the controller role is defined here.
const (
	AttachmentRoleController AttachmentRole = "controller"
)
|
||||
|
||||
// AttachmentState names the lifecycle state of a session attachment.
type AttachmentState string

// Known attachment states.
const (
	AttachmentStateAttaching  AttachmentState = "attaching"
	AttachmentStateActive     AttachmentState = "active"
	AttachmentStateDetached   AttachmentState = "detached"
	AttachmentStateSuperseded AttachmentState = "superseded"
	AttachmentStateRevoked    AttachmentState = "revoked"
	AttachmentStateClosed     AttachmentState = "closed"
)
|
||||
|
||||
// ResourceTakeoverPolicy names the rule that governs session takeover for a
// resource.
type ResourceTakeoverPolicy string

// Known takeover policies.
const (
	ResourceTakeoverPolicyTrustedDevice ResourceTakeoverPolicy = "trusted_device"
	ResourceTakeoverPolicySameUser      ResourceTakeoverPolicy = "same_user"
	ResourceTakeoverPolicyAdminOnly     ResourceTakeoverPolicy = "admin_only"
)
|
||||
|
||||
// ResourceClipboardMode names the clipboard sharing direction configured for
// a resource.
type ResourceClipboardMode string

// Known clipboard modes.
const (
	ResourceClipboardModeDisabled       ResourceClipboardMode = "disabled"
	ResourceClipboardModeClientToServer ResourceClipboardMode = "client_to_server"
	ResourceClipboardModeServerToClient ResourceClipboardMode = "server_to_client"
	ResourceClipboardModeBidirectional  ResourceClipboardMode = "bidirectional"
)
|
||||
|
||||
// ResourceFileTransferMode names the file transfer direction configured for a
// resource.
type ResourceFileTransferMode string

// Known file transfer modes.
const (
	ResourceFileTransferModeDisabled       ResourceFileTransferMode = "disabled"
	ResourceFileTransferModeClientToServer ResourceFileTransferMode = "client_to_server"
	ResourceFileTransferModeServerToClient ResourceFileTransferMode = "server_to_client"
	ResourceFileTransferModeBidirectional  ResourceFileTransferMode = "bidirectional"
)
|
||||
|
||||
type RemoteSession struct {
|
||||
ID string
|
||||
OrganizationID string
|
||||
ResourceID string
|
||||
Protocol string
|
||||
State sessioncontracts.State
|
||||
WorkerID string
|
||||
ControllerUserID string
|
||||
DetachDeadlineAt *time.Time
|
||||
LastHeartbeatAt *time.Time
|
||||
TakeoverVersion int
|
||||
RenderQualityProfile string
|
||||
Metadata []byte
|
||||
CreatedAt time.Time
|
||||
UpdatedAt time.Time
|
||||
}
|
||||
|
||||
type SessionAttachment struct {
|
||||
ID string
|
||||
RemoteSessionID string
|
||||
UserID string
|
||||
DeviceID string
|
||||
Role AttachmentRole
|
||||
State AttachmentState
|
||||
SupersededBy *string
|
||||
TakeoverOf *string
|
||||
AttachedAt *time.Time
|
||||
DetachedAt *time.Time
|
||||
LastInputAt *time.Time
|
||||
Metadata []byte
|
||||
CreatedAt time.Time
|
||||
UpdatedAt time.Time
|
||||
}
|
||||
|
||||
type ResourcePolicy struct {
|
||||
ResourceID string
|
||||
MaxConcurrentSessions int
|
||||
TakeoverPolicy ResourceTakeoverPolicy
|
||||
RequireTrustedDevice bool
|
||||
DetachGracePeriod time.Duration
|
||||
ClipboardEnabled bool
|
||||
ClipboardMode ResourceClipboardMode
|
||||
FileTransferEnabled bool
|
||||
FileTransferMode ResourceFileTransferMode
|
||||
CreatedAt time.Time
|
||||
UpdatedAt time.Time
|
||||
}
|
||||
|
||||
// ResourceRuntimeSpec describes a resource for runtime use: identity, name,
// address, protocol, an optional secret reference, the certificate
// verification mode and raw metadata bytes. A nil SecretRef means no secret
// reference is set.
type ResourceRuntimeSpec struct {
	ID                          string
	OrganizationID              string
	Name                        string
	Address                     string
	Protocol                    string
	SecretRef                   *string
	CertificateVerificationMode string
	Metadata                    []byte
}
|
||||
|
||||
// AuditEvent is a single audit record: who acted (optional user and device),
// what happened (EventType), what it targeted (TargetType and TargetID), the
// optional related remote session, a raw payload and the creation time. Nil
// pointer fields mean the value is not set.
type AuditEvent struct {
	ID              string
	ActorUserID     *string
	ActorDeviceID   *string
	EventType       string
	TargetType      string
	TargetID        string
	RemoteSessionID *string
	Payload         []byte
	CreatedAt       time.Time
}
|
||||
|
||||
// Audit event type identifiers for session lifecycle changes and for access
// to resource secrets.
const (
	AuditEventSessionStarted     = "session_started"
	AuditEventSessionAttached    = "session_attached"
	AuditEventSessionDetached    = "session_detached"
	AuditEventSessionTakenOver   = "session_taken_over"
	AuditEventSessionTerminated  = "session_terminated"
	AuditEventSessionFailed      = "session_failed"
	AuditEventSecretAccessed     = "resource_secret_accessed"
	AuditEventSecretAccessDenied = "resource_secret_access_denied"
)
|
||||
@@ -0,0 +1,164 @@
|
||||
package sessionbroker
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
|
||||
"github.com/go-chi/chi/v5"
|
||||
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/httpx"
|
||||
)
|
||||
|
||||
type Module struct {
|
||||
service *Service
|
||||
}
|
||||
|
||||
func NewModule(service *Service) *Module {
|
||||
return &Module{service: service}
|
||||
}
|
||||
|
||||
func (m *Module) Name() string {
|
||||
return "session-broker"
|
||||
}
|
||||
|
||||
func (m *Module) Service() *Service {
|
||||
return m.service
|
||||
}
|
||||
|
||||
func (m *Module) RegisterRoutes(router chi.Router) {
|
||||
router.Route("/sessions", func(r chi.Router) {
|
||||
r.Get("/", m.listSessions)
|
||||
r.Post("/", m.startSession)
|
||||
r.Post("/{sessionID}/attach", m.attachSession)
|
||||
r.Post("/{sessionID}/detach", m.detachSession)
|
||||
r.Post("/{sessionID}/takeover", m.takeoverSession)
|
||||
r.Post("/{sessionID}/terminate", m.terminateSession)
|
||||
r.Post("/{sessionID}/fail", m.markFailed)
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) listSessions(w http.ResponseWriter, r *http.Request) {
|
||||
userID := r.URL.Query().Get("user_id")
|
||||
if userID == "" {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "user_id is required")
|
||||
return
|
||||
}
|
||||
sessions, err := m.service.ListSessions(r.Context(), userID)
|
||||
if err != nil {
|
||||
status, message := m.service.MapError(err)
|
||||
httpx.WriteError(w, status, message)
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, map[string]any{"sessions": sessions})
|
||||
}
|
||||
|
||||
func (m *Module) startSession(w http.ResponseWriter, r *http.Request) {
|
||||
var cmd StartRemoteSessionCommand
|
||||
if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid start session payload")
|
||||
return
|
||||
}
|
||||
result, err := m.service.StartRemoteSession(r.Context(), cmd)
|
||||
if err != nil {
|
||||
status, message := m.service.MapError(err)
|
||||
httpx.WriteError(w, status, message)
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusCreated, result)
|
||||
}
|
||||
|
||||
func (m *Module) attachSession(w http.ResponseWriter, r *http.Request) {
|
||||
var cmd AttachToSessionCommand
|
||||
if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid attach session payload")
|
||||
return
|
||||
}
|
||||
cmd.SessionID = chi.URLParam(r, "sessionID")
|
||||
result, err := m.service.AttachToSession(r.Context(), cmd)
|
||||
if err != nil {
|
||||
status, message := m.service.MapError(err)
|
||||
httpx.WriteError(w, status, message)
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, result)
|
||||
}
|
||||
|
||||
func (m *Module) detachSession(w http.ResponseWriter, r *http.Request) {
|
||||
var cmd DetachFromSessionCommand
|
||||
if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid detach session payload")
|
||||
return
|
||||
}
|
||||
cmd.SessionID = chi.URLParam(r, "sessionID")
|
||||
result, err := m.service.DetachFromSession(r.Context(), cmd)
|
||||
if err != nil {
|
||||
status, message := m.service.MapError(err)
|
||||
httpx.WriteError(w, status, message)
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusAccepted, result)
|
||||
}
|
||||
|
||||
func (m *Module) takeoverSession(w http.ResponseWriter, r *http.Request) {
|
||||
var cmd TakeoverSessionCommand
|
||||
if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid takeover session payload")
|
||||
return
|
||||
}
|
||||
cmd.SessionID = chi.URLParam(r, "sessionID")
|
||||
result, err := m.service.TakeoverSession(r.Context(), cmd)
|
||||
if err != nil {
|
||||
status, message := m.service.MapError(err)
|
||||
httpx.WriteError(w, status, message)
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusOK, result)
|
||||
}
|
||||
|
||||
func (m *Module) terminateSession(w http.ResponseWriter, r *http.Request) {
|
||||
var cmd TerminateSessionCommand
|
||||
if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid terminate session payload")
|
||||
return
|
||||
}
|
||||
cmd.SessionID = chi.URLParam(r, "sessionID")
|
||||
if err := m.service.TerminateSession(r.Context(), cmd); err != nil {
|
||||
status, message := m.service.MapError(err)
|
||||
httpx.WriteError(w, status, message)
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusAccepted, map[string]any{
|
||||
"status": "terminated",
|
||||
"message": httpx.NewMessage(
|
||||
"session.terminated",
|
||||
"status.session.terminated",
|
||||
"Session terminated.",
|
||||
nil,
|
||||
"",
|
||||
),
|
||||
})
|
||||
}
|
||||
|
||||
func (m *Module) markFailed(w http.ResponseWriter, r *http.Request) {
|
||||
var cmd MarkSessionFailedCommand
|
||||
if err := json.NewDecoder(r.Body).Decode(&cmd); err != nil {
|
||||
httpx.WriteError(w, http.StatusBadRequest, "invalid fail session payload")
|
||||
return
|
||||
}
|
||||
cmd.SessionID = chi.URLParam(r, "sessionID")
|
||||
if err := m.service.MarkSessionFailed(r.Context(), cmd); err != nil {
|
||||
status, message := m.service.MapError(err)
|
||||
httpx.WriteError(w, status, message)
|
||||
return
|
||||
}
|
||||
httpx.WriteJSON(w, http.StatusAccepted, map[string]any{
|
||||
"status": "failed",
|
||||
"message": httpx.NewMessage(
|
||||
"session.failed",
|
||||
"status.session.failed",
|
||||
"Session marked as failed.",
|
||||
nil,
|
||||
"",
|
||||
),
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
package sessionbroker
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
workercontracts "github.com/example/remote-access-platform/backend/pkg/contracts/worker"
|
||||
)
|
||||
|
||||
type WorkerOrchestrator interface {
|
||||
Reserve(ctx context.Context, request workercontracts.AttachRequest) (*workercontracts.WorkerLease, error)
|
||||
GetSessionLease(ctx context.Context, sessionID string) (*workercontracts.WorkerLease, error)
|
||||
ReleaseSessionLease(ctx context.Context, sessionID string) error
|
||||
PrepareAttachment(ctx context.Context, session RemoteSession, attachment SessionAttachment, runtimeMetadata map[string]any) error
|
||||
NotifyDetachment(ctx context.Context, session RemoteSession, attachment SessionAttachment) error
|
||||
TerminateRemoteSession(ctx context.Context, sessionID, attachmentID string) error
|
||||
ValidateSessionRuntime(ctx context.Context, sessionID, workerID string) (bool, string, error)
|
||||
}
|
||||
@@ -0,0 +1,607 @@
|
||||
package sessionbroker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/jackc/pgx/v5"
|
||||
"github.com/jackc/pgx/v5/pgxpool"
|
||||
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/authority"
|
||||
postgresplatform "github.com/example/remote-access-platform/backend/internal/platform/postgres"
|
||||
)
|
||||
|
||||
type postgresStore struct {
|
||||
db postgresplatform.DBTX
|
||||
authority *authority.Verifier
|
||||
}
|
||||
|
||||
type PostgresTransactor struct {
|
||||
pool *pgxpool.Pool
|
||||
authority *authority.Verifier
|
||||
}
|
||||
|
||||
func NewPostgresStore(pool *pgxpool.Pool, verifiers ...*authority.Verifier) Store {
|
||||
var authorityVerifier *authority.Verifier
|
||||
if len(verifiers) > 0 {
|
||||
authorityVerifier = verifiers[0]
|
||||
}
|
||||
return &postgresStore{db: pool, authority: authorityVerifier}
|
||||
}
|
||||
|
||||
func NewPostgresTransactor(pool *pgxpool.Pool, verifiers ...*authority.Verifier) *PostgresTransactor {
|
||||
var authorityVerifier *authority.Verifier
|
||||
if len(verifiers) > 0 {
|
||||
authorityVerifier = verifiers[0]
|
||||
}
|
||||
return &PostgresTransactor{pool: pool, authority: authorityVerifier}
|
||||
}
|
||||
|
||||
func (t *PostgresTransactor) WithinTransaction(ctx context.Context, fn func(store Store) error) error {
|
||||
return postgresplatform.WithTransaction(ctx, t.pool, func(tx pgx.Tx) error {
|
||||
return fn(&postgresStore{db: tx, authority: t.authority})
|
||||
})
|
||||
}
|
||||
|
||||
func (s *postgresStore) RemoteSessions() RemoteSessionRepository {
|
||||
return &postgresRemoteSessionRepository{db: s.db}
|
||||
}
|
||||
|
||||
func (s *postgresStore) SessionAttachments() SessionAttachmentRepository {
|
||||
return &postgresSessionAttachmentRepository{db: s.db}
|
||||
}
|
||||
|
||||
func (s *postgresStore) ResourcePolicies() ResourcePolicyRepository {
|
||||
return &postgresResourcePolicyRepository{db: s.db}
|
||||
}
|
||||
|
||||
func (s *postgresStore) ResourceRuntime() ResourceRuntimeRepository {
|
||||
return &postgresResourceRuntimeRepository{db: s.db}
|
||||
}
|
||||
|
||||
func (s *postgresStore) AuditEvents() AuditEventRepository {
|
||||
return &postgresAuditEventRepository{db: s.db}
|
||||
}
|
||||
|
||||
func (s *postgresStore) Access() AccessRepository {
|
||||
return &postgresAccessRepository{db: s.db, authority: s.authority}
|
||||
}
|
||||
|
||||
type postgresRemoteSessionRepository struct {
|
||||
db postgresplatform.DBTX
|
||||
}
|
||||
|
||||
type postgresSessionAttachmentRepository struct {
|
||||
db postgresplatform.DBTX
|
||||
}
|
||||
|
||||
type postgresResourcePolicyRepository struct {
|
||||
db postgresplatform.DBTX
|
||||
}
|
||||
|
||||
type postgresResourceRuntimeRepository struct {
|
||||
db postgresplatform.DBTX
|
||||
}
|
||||
|
||||
type postgresAuditEventRepository struct {
|
||||
db postgresplatform.DBTX
|
||||
}
|
||||
|
||||
type postgresAccessRepository struct {
|
||||
db postgresplatform.DBTX
|
||||
authority *authority.Verifier
|
||||
}
|
||||
|
||||
func (r *postgresRemoteSessionRepository) Create(ctx context.Context, session RemoteSession) error {
|
||||
const query = `
|
||||
INSERT INTO remote_sessions (
|
||||
id, organization_id, resource_id, protocol, state, worker_id, controller_user_id, detach_deadline_at,
|
||||
last_heartbeat_at, takeover_version, metadata, created_at, updated_at
|
||||
) VALUES (
|
||||
$1::uuid, $2::uuid, $3::uuid, $4, $5, NULLIF($6, ''), $7::uuid, $8, $9, $10, $11::jsonb, $12, $13
|
||||
)
|
||||
`
|
||||
if _, err := r.db.Exec(ctx, query,
|
||||
session.ID,
|
||||
session.OrganizationID,
|
||||
session.ResourceID,
|
||||
session.Protocol,
|
||||
session.State,
|
||||
session.WorkerID,
|
||||
session.ControllerUserID,
|
||||
session.DetachDeadlineAt,
|
||||
session.LastHeartbeatAt,
|
||||
session.TakeoverVersion,
|
||||
jsonPayload(session.Metadata),
|
||||
session.CreatedAt,
|
||||
session.UpdatedAt,
|
||||
); err != nil {
|
||||
return fmt.Errorf("create remote session: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *postgresRemoteSessionRepository) GetByID(ctx context.Context, sessionID string) (*RemoteSession, error) {
|
||||
return r.getByID(ctx, sessionID, "")
|
||||
}
|
||||
|
||||
func (r *postgresRemoteSessionRepository) GetByIDForUpdate(ctx context.Context, sessionID string) (*RemoteSession, error) {
|
||||
return r.getByID(ctx, sessionID, " FOR UPDATE")
|
||||
}
|
||||
|
||||
func (r *postgresRemoteSessionRepository) getByID(ctx context.Context, sessionID string, suffix string) (*RemoteSession, error) {
|
||||
query := `
|
||||
SELECT id::text, organization_id::text, resource_id::text, protocol, state, COALESCE(worker_id, ''), controller_user_id::text,
|
||||
detach_deadline_at, last_heartbeat_at, takeover_version, metadata, created_at, updated_at
|
||||
FROM remote_sessions
|
||||
WHERE id = $1::uuid` + suffix
|
||||
remoteSession, err := scanRemoteSession(r.db.QueryRow(ctx, query, sessionID))
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
return nil, nil
|
||||
}
|
||||
return remoteSession, err
|
||||
}
|
||||
|
||||
func (r *postgresRemoteSessionRepository) ListByController(ctx context.Context, userID string) ([]RemoteSession, error) {
|
||||
const query = `
|
||||
SELECT id::text, organization_id::text, resource_id::text, protocol, state, COALESCE(worker_id, ''), controller_user_id::text,
|
||||
detach_deadline_at, last_heartbeat_at, takeover_version, metadata, created_at, updated_at
|
||||
FROM remote_sessions
|
||||
WHERE controller_user_id = $1::uuid
|
||||
ORDER BY updated_at DESC
|
||||
`
|
||||
rows, err := r.db.Query(ctx, query, userID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list remote sessions: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var sessions []RemoteSession
|
||||
for rows.Next() {
|
||||
item, err := scanRemoteSession(rows)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
sessions = append(sessions, *item)
|
||||
}
|
||||
return sessions, rows.Err()
|
||||
}
|
||||
|
||||
func (r *postgresRemoteSessionRepository) CountLiveByResource(ctx context.Context, resourceID string) (int, error) {
|
||||
const query = `
|
||||
SELECT COUNT(*)
|
||||
FROM remote_sessions
|
||||
WHERE resource_id = $1::uuid AND state IN ('starting', 'active', 'detached', 'reconnecting')
|
||||
`
|
||||
var count int
|
||||
if err := r.db.QueryRow(ctx, query, resourceID).Scan(&count); err != nil {
|
||||
return 0, fmt.Errorf("count live remote sessions: %w", err)
|
||||
}
|
||||
return count, nil
|
||||
}
|
||||
|
||||
func (r *postgresRemoteSessionRepository) ListDetachedExpired(ctx context.Context, before time.Time, limit int) ([]RemoteSession, error) {
|
||||
const query = `
|
||||
SELECT id::text, organization_id::text, resource_id::text, protocol, state, COALESCE(worker_id, ''), controller_user_id::text,
|
||||
detach_deadline_at, last_heartbeat_at, takeover_version, metadata, created_at, updated_at
|
||||
FROM remote_sessions
|
||||
WHERE state = 'detached' AND detach_deadline_at IS NOT NULL AND detach_deadline_at <= $1
|
||||
ORDER BY detach_deadline_at ASC
|
||||
LIMIT $2
|
||||
`
|
||||
rows, err := r.db.Query(ctx, query, before, limit)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list detached expired sessions: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var sessions []RemoteSession
|
||||
for rows.Next() {
|
||||
item, err := scanRemoteSession(rows)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
sessions = append(sessions, *item)
|
||||
}
|
||||
return sessions, rows.Err()
|
||||
}
|
||||
|
||||
func (r *postgresRemoteSessionRepository) UpdateState(ctx context.Context, params UpdateRemoteSessionStateParams) error {
|
||||
const query = `
|
||||
UPDATE remote_sessions
|
||||
SET state = $2,
|
||||
worker_id = NULLIF($3, ''),
|
||||
detach_deadline_at = $4,
|
||||
last_heartbeat_at = $5,
|
||||
takeover_version = $6,
|
||||
updated_at = $7
|
||||
WHERE id = $1::uuid
|
||||
`
|
||||
if _, err := r.db.Exec(ctx, query,
|
||||
params.RemoteSessionID,
|
||||
params.State,
|
||||
params.WorkerID,
|
||||
params.DetachDeadlineAt,
|
||||
params.LastHeartbeatAt,
|
||||
params.TakeoverVersion,
|
||||
params.UpdatedAt,
|
||||
); err != nil {
|
||||
return fmt.Errorf("update remote session state: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *postgresSessionAttachmentRepository) Create(ctx context.Context, attachment SessionAttachment) error {
|
||||
const query = `
|
||||
INSERT INTO session_attachments (
|
||||
id, remote_session_id, user_id, device_id, role, state, superseded_by,
|
||||
takeover_of, attached_at, detached_at, last_input_at, metadata, created_at, updated_at
|
||||
) VALUES (
|
||||
$1::uuid, $2::uuid, $3::uuid, $4::uuid, $5, $6, NULLIF($7, '')::uuid,
|
||||
NULLIF($8, '')::uuid, $9, $10, $11, $12::jsonb, $13, $14
|
||||
)
|
||||
`
|
||||
if _, err := r.db.Exec(ctx, query,
|
||||
attachment.ID,
|
||||
attachment.RemoteSessionID,
|
||||
attachment.UserID,
|
||||
attachment.DeviceID,
|
||||
attachment.Role,
|
||||
attachment.State,
|
||||
stringValue(attachment.SupersededBy),
|
||||
stringValue(attachment.TakeoverOf),
|
||||
attachment.AttachedAt,
|
||||
attachment.DetachedAt,
|
||||
attachment.LastInputAt,
|
||||
jsonPayload(attachment.Metadata),
|
||||
attachment.CreatedAt,
|
||||
attachment.UpdatedAt,
|
||||
); err != nil {
|
||||
return fmt.Errorf("create session attachment: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *postgresSessionAttachmentRepository) GetByID(ctx context.Context, attachmentID string) (*SessionAttachment, error) {
|
||||
return r.getByID(ctx, attachmentID, "")
|
||||
}
|
||||
|
||||
func (r *postgresSessionAttachmentRepository) GetByIDForUpdate(ctx context.Context, attachmentID string) (*SessionAttachment, error) {
|
||||
return r.getByID(ctx, attachmentID, " FOR UPDATE")
|
||||
}
|
||||
|
||||
func (r *postgresSessionAttachmentRepository) getByID(ctx context.Context, attachmentID string, suffix string) (*SessionAttachment, error) {
|
||||
query := `
|
||||
SELECT id::text, remote_session_id::text, user_id::text, device_id::text, role, state,
|
||||
superseded_by::text, takeover_of::text, attached_at, detached_at, last_input_at, metadata, created_at, updated_at
|
||||
FROM session_attachments
|
||||
WHERE id = $1::uuid` + suffix
|
||||
attachment, err := scanSessionAttachment(r.db.QueryRow(ctx, query, attachmentID))
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
return nil, nil
|
||||
}
|
||||
return attachment, err
|
||||
}
|
||||
|
||||
func (r *postgresSessionAttachmentRepository) ListByRemoteSession(ctx context.Context, remoteSessionID string) ([]SessionAttachment, error) {
|
||||
return r.listByRemoteSession(ctx, remoteSessionID, "")
|
||||
}
|
||||
|
||||
func (r *postgresSessionAttachmentRepository) ListActiveByRemoteSessionForUpdate(ctx context.Context, remoteSessionID string) ([]SessionAttachment, error) {
|
||||
return r.listByRemoteSession(ctx, remoteSessionID, " AND state IN ('attaching', 'active', 'reconnecting') FOR UPDATE")
|
||||
}
|
||||
|
||||
func (r *postgresSessionAttachmentRepository) listByRemoteSession(ctx context.Context, remoteSessionID string, suffix string) ([]SessionAttachment, error) {
|
||||
query := `
|
||||
SELECT id::text, remote_session_id::text, user_id::text, device_id::text, role, state,
|
||||
superseded_by::text, takeover_of::text, attached_at, detached_at, last_input_at, metadata, created_at, updated_at
|
||||
FROM session_attachments
|
||||
WHERE remote_session_id = $1::uuid` + suffix
|
||||
rows, err := r.db.Query(ctx, query, remoteSessionID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("list session attachments: %w", err)
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var attachments []SessionAttachment
|
||||
for rows.Next() {
|
||||
item, err := scanSessionAttachment(rows)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
attachments = append(attachments, *item)
|
||||
}
|
||||
return attachments, rows.Err()
|
||||
}
|
||||
|
||||
func (r *postgresSessionAttachmentRepository) UpdateState(ctx context.Context, params UpdateSessionAttachmentStateParams) error {
|
||||
const query = `
|
||||
UPDATE session_attachments
|
||||
SET state = $2,
|
||||
detached_at = $3,
|
||||
last_input_at = $4,
|
||||
updated_at = $5
|
||||
WHERE id = $1::uuid
|
||||
`
|
||||
if _, err := r.db.Exec(ctx, query,
|
||||
params.AttachmentID,
|
||||
params.State,
|
||||
params.DetachedAt,
|
||||
params.LastInputAt,
|
||||
params.UpdatedAt,
|
||||
); err != nil {
|
||||
return fmt.Errorf("update session attachment state: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *postgresSessionAttachmentRepository) Supersede(ctx context.Context, params SupersedeAttachmentParams) error {
|
||||
const query = `
|
||||
UPDATE session_attachments
|
||||
SET state = 'superseded',
|
||||
superseded_by = $2::uuid,
|
||||
detached_at = $3,
|
||||
updated_at = $4
|
||||
WHERE id = $1::uuid
|
||||
`
|
||||
if _, err := r.db.Exec(ctx, query,
|
||||
params.PreviousAttachmentID,
|
||||
params.NextAttachmentID,
|
||||
params.DetachedAt,
|
||||
params.UpdatedAt,
|
||||
); err != nil {
|
||||
return fmt.Errorf("supersede attachment: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *postgresResourcePolicyRepository) GetByResourceID(ctx context.Context, resourceID string) (*ResourcePolicy, error) {
|
||||
const query = `
|
||||
SELECT resource_id::text, max_concurrent_sessions, takeover_policy, require_trusted_device,
|
||||
detach_grace_period_seconds, clipboard_enabled, clipboard_mode, file_transfer_enabled,
|
||||
COALESCE(file_transfer_mode, 'disabled'), created_at, updated_at
|
||||
FROM resource_policies
|
||||
WHERE resource_id = $1::uuid
|
||||
`
|
||||
policy, err := scanResourcePolicy(r.db.QueryRow(ctx, query, resourceID))
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
return nil, nil
|
||||
}
|
||||
return policy, err
|
||||
}
|
||||
|
||||
func (r *postgresResourcePolicyRepository) Upsert(ctx context.Context, policy ResourcePolicy) error {
|
||||
const query = `
|
||||
INSERT INTO resource_policies (
|
||||
resource_id, max_concurrent_sessions, takeover_policy, require_trusted_device,
|
||||
detach_grace_period_seconds, clipboard_enabled, clipboard_mode, file_transfer_enabled, file_transfer_mode, created_at, updated_at
|
||||
) VALUES ($1::uuid, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)
|
||||
ON CONFLICT (resource_id) DO UPDATE SET
|
||||
max_concurrent_sessions = EXCLUDED.max_concurrent_sessions,
|
||||
takeover_policy = EXCLUDED.takeover_policy,
|
||||
require_trusted_device = EXCLUDED.require_trusted_device,
|
||||
detach_grace_period_seconds = EXCLUDED.detach_grace_period_seconds,
|
||||
clipboard_enabled = EXCLUDED.clipboard_enabled,
|
||||
clipboard_mode = EXCLUDED.clipboard_mode,
|
||||
file_transfer_enabled = EXCLUDED.file_transfer_enabled,
|
||||
file_transfer_mode = EXCLUDED.file_transfer_mode,
|
||||
updated_at = EXCLUDED.updated_at
|
||||
`
|
||||
clipboardMode := normalizeClipboardMode(policy.ClipboardMode)
|
||||
fileTransferMode := normalizeFileTransferMode(policy.FileTransferMode)
|
||||
if _, err := r.db.Exec(ctx, query,
|
||||
policy.ResourceID,
|
||||
policy.MaxConcurrentSessions,
|
||||
policy.TakeoverPolicy,
|
||||
policy.RequireTrustedDevice,
|
||||
int(policy.DetachGracePeriod.Seconds()),
|
||||
clipboardMode != ResourceClipboardModeDisabled,
|
||||
clipboardMode,
|
||||
fileTransferAllowsClientToServer(fileTransferMode),
|
||||
fileTransferMode,
|
||||
policy.CreatedAt,
|
||||
policy.UpdatedAt,
|
||||
); err != nil {
|
||||
return fmt.Errorf("upsert resource policy: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *postgresResourceRuntimeRepository) GetByID(ctx context.Context, resourceID string) (*ResourceRuntimeSpec, error) {
|
||||
const query = `
|
||||
SELECT id::text, organization_id::text, name, address, protocol, secret_ref, certificate_verification_mode, metadata
|
||||
FROM resources
|
||||
WHERE id = $1::uuid
|
||||
`
|
||||
item := &ResourceRuntimeSpec{}
|
||||
var secretRef *string
|
||||
var metadata []byte
|
||||
if err := r.db.QueryRow(ctx, query, resourceID).Scan(
|
||||
&item.ID,
|
||||
&item.OrganizationID,
|
||||
&item.Name,
|
||||
&item.Address,
|
||||
&item.Protocol,
|
||||
&secretRef,
|
||||
&item.CertificateVerificationMode,
|
||||
&metadata,
|
||||
); err != nil {
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, fmt.Errorf("get resource runtime spec: %w", err)
|
||||
}
|
||||
item.SecretRef = secretRef
|
||||
item.Metadata = metadata
|
||||
return item, nil
|
||||
}
|
||||
|
||||
func (r *postgresAuditEventRepository) Create(ctx context.Context, event AuditEvent) error {
|
||||
const query = `
|
||||
INSERT INTO audit_events (
|
||||
id, actor_user_id, actor_device_id, event_type, target_type, target_id,
|
||||
remote_session_id, payload, created_at
|
||||
) VALUES (
|
||||
$1::uuid, NULLIF($2, '')::uuid, NULLIF($3, '')::uuid, $4, $5, $6,
|
||||
NULLIF($7, '')::uuid, $8::jsonb, $9
|
||||
)
|
||||
`
|
||||
if _, err := r.db.Exec(ctx, query,
|
||||
event.ID,
|
||||
stringValue(event.ActorUserID),
|
||||
stringValue(event.ActorDeviceID),
|
||||
event.EventType,
|
||||
event.TargetType,
|
||||
event.TargetID,
|
||||
stringValue(event.RemoteSessionID),
|
||||
jsonPayload(event.Payload),
|
||||
event.CreatedAt,
|
||||
); err != nil {
|
||||
return fmt.Errorf("create audit event: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *postgresAccessRepository) IsTrustedDevice(ctx context.Context, userID, deviceID string) (bool, error) {
|
||||
const query = `
|
||||
SELECT EXISTS(
|
||||
SELECT 1 FROM devices
|
||||
WHERE id = $1::uuid AND user_id = $2::uuid AND trust_status = 'trusted' AND revoked_at IS NULL
|
||||
)
|
||||
`
|
||||
var trusted bool
|
||||
if err := r.db.QueryRow(ctx, query, deviceID, userID).Scan(&trusted); err != nil {
|
||||
return false, fmt.Errorf("check trusted device: %w", err)
|
||||
}
|
||||
return trusted, nil
|
||||
}
|
||||
|
||||
func (r *postgresAccessRepository) GetPlatformRole(ctx context.Context, userID string) (string, error) {
|
||||
return authority.EffectivePlatformRole(ctx, r.db, r.authority, userID)
|
||||
}
|
||||
|
||||
func (r *postgresAccessRepository) GetOrganizationRole(ctx context.Context, organizationID, userID string) (string, bool, error) {
|
||||
const query = `
|
||||
SELECT role_id
|
||||
FROM organization_memberships
|
||||
WHERE organization_id = $1::uuid AND user_id = $2::uuid AND status = 'active'
|
||||
`
|
||||
var role string
|
||||
if err := r.db.QueryRow(ctx, query, organizationID, userID).Scan(&role); err != nil {
|
||||
if errors.Is(err, pgx.ErrNoRows) {
|
||||
return "", false, nil
|
||||
}
|
||||
return "", false, fmt.Errorf("get organization role: %w", err)
|
||||
}
|
||||
return role, true, nil
|
||||
}
|
||||
|
||||
// scanner is the minimal row-reading capability the scan helpers need. In
// this file it is satisfied by both the result of QueryRow and the rows
// iterator returned by Query.
type scanner interface {
	// Scan copies the current row's columns into dest, in column order.
	Scan(dest ...any) error
}
|
||||
|
||||
func scanRemoteSession(row scanner) (*RemoteSession, error) {
|
||||
item := &RemoteSession{}
|
||||
var detachDeadlineAt, lastHeartbeatAt *time.Time
|
||||
var metadata []byte
|
||||
if err := row.Scan(
|
||||
&item.ID,
|
||||
&item.OrganizationID,
|
||||
&item.ResourceID,
|
||||
&item.Protocol,
|
||||
&item.State,
|
||||
&item.WorkerID,
|
||||
&item.ControllerUserID,
|
||||
&detachDeadlineAt,
|
||||
&lastHeartbeatAt,
|
||||
&item.TakeoverVersion,
|
||||
&metadata,
|
||||
&item.CreatedAt,
|
||||
&item.UpdatedAt,
|
||||
); err != nil {
|
||||
return nil, fmt.Errorf("scan remote session: %w", err)
|
||||
}
|
||||
item.DetachDeadlineAt = detachDeadlineAt
|
||||
item.LastHeartbeatAt = lastHeartbeatAt
|
||||
item.Metadata = metadata
|
||||
item.RenderQualityProfile = renderQualityProfileFromSessionMetadata(metadata)
|
||||
return item, nil
|
||||
}
|
||||
|
||||
func scanSessionAttachment(row scanner) (*SessionAttachment, error) {
|
||||
item := &SessionAttachment{}
|
||||
var supersededBy, takeoverOf *string
|
||||
var attachedAt, detachedAt, lastInputAt *time.Time
|
||||
var metadata []byte
|
||||
if err := row.Scan(
|
||||
&item.ID,
|
||||
&item.RemoteSessionID,
|
||||
&item.UserID,
|
||||
&item.DeviceID,
|
||||
&item.Role,
|
||||
&item.State,
|
||||
&supersededBy,
|
||||
&takeoverOf,
|
||||
&attachedAt,
|
||||
&detachedAt,
|
||||
&lastInputAt,
|
||||
&metadata,
|
||||
&item.CreatedAt,
|
||||
&item.UpdatedAt,
|
||||
); err != nil {
|
||||
return nil, fmt.Errorf("scan session attachment: %w", err)
|
||||
}
|
||||
item.SupersededBy = supersededBy
|
||||
item.TakeoverOf = takeoverOf
|
||||
item.AttachedAt = attachedAt
|
||||
item.DetachedAt = detachedAt
|
||||
item.LastInputAt = lastInputAt
|
||||
item.Metadata = metadata
|
||||
return item, nil
|
||||
}
|
||||
|
||||
func scanResourcePolicy(row scanner) (*ResourcePolicy, error) {
|
||||
item := &ResourcePolicy{}
|
||||
var detachGraceSeconds int
|
||||
if err := row.Scan(
|
||||
&item.ResourceID,
|
||||
&item.MaxConcurrentSessions,
|
||||
&item.TakeoverPolicy,
|
||||
&item.RequireTrustedDevice,
|
||||
&detachGraceSeconds,
|
||||
&item.ClipboardEnabled,
|
||||
&item.ClipboardMode,
|
||||
&item.FileTransferEnabled,
|
||||
&item.FileTransferMode,
|
||||
&item.CreatedAt,
|
||||
&item.UpdatedAt,
|
||||
); err != nil {
|
||||
return nil, fmt.Errorf("scan resource policy: %w", err)
|
||||
}
|
||||
item.DetachGracePeriod = time.Duration(detachGraceSeconds) * time.Second
|
||||
item.ClipboardMode = normalizeClipboardMode(item.ClipboardMode)
|
||||
item.ClipboardEnabled = item.ClipboardMode != ResourceClipboardModeDisabled
|
||||
item.FileTransferMode = normalizeFileTransferMode(item.FileTransferMode)
|
||||
item.FileTransferEnabled = fileTransferAllowsClientToServer(item.FileTransferMode)
|
||||
return item, nil
|
||||
}
|
||||
|
||||
// jsonPayload returns payload unchanged when it is valid JSON and the empty
// object `{}` otherwise.
//
// Nil and zero-length input count as invalid (json.Valid rejects them), so
// they also yield `{}`.
func jsonPayload(payload []byte) []byte {
	if json.Valid(payload) {
		return payload
	}
	return []byte(`{}`)
}
|
||||
|
||||
// stringValue dereferences value, mapping a nil pointer to the empty string.
func stringValue(value *string) string {
	var out string
	if value != nil {
		out = *value
	}
	return out
}
|
||||
@@ -0,0 +1,140 @@
|
||||
package sessionbroker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/redis/go-redis/v9"
|
||||
|
||||
sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
|
||||
)
|
||||
|
||||
type RedisLiveStateStore struct {
|
||||
client *redis.Client
|
||||
}
|
||||
|
||||
func NewRedisLiveStateStore(client *redis.Client) *RedisLiveStateStore {
|
||||
return &RedisLiveStateStore{client: client}
|
||||
}
|
||||
|
||||
func (s *RedisLiveStateStore) UpsertSession(ctx context.Context, state LiveSessionState) error {
|
||||
return s.setJSON(ctx, liveSessionKey(state.SessionID), state, 0)
|
||||
}
|
||||
|
||||
func (s *RedisLiveStateStore) GetSession(ctx context.Context, sessionID string) (*LiveSessionState, error) {
|
||||
var state LiveSessionState
|
||||
ok, err := s.getJSON(ctx, liveSessionKey(sessionID), &state)
|
||||
if err != nil || !ok {
|
||||
return nil, err
|
||||
}
|
||||
return &state, nil
|
||||
}
|
||||
|
||||
func (s *RedisLiveStateStore) DeleteSession(ctx context.Context, sessionID string) error {
|
||||
return s.client.Del(ctx, liveSessionKey(sessionID)).Err()
|
||||
}
|
||||
|
||||
func (s *RedisLiveStateStore) BindController(ctx context.Context, binding sessioncontracts.ControllerBinding, ttl time.Duration) error {
|
||||
return s.setJSON(ctx, controllerBindingKey(binding.SessionID), binding, ttl)
|
||||
}
|
||||
|
||||
func (s *RedisLiveStateStore) GetControllerBinding(ctx context.Context, sessionID string) (*sessioncontracts.ControllerBinding, error) {
|
||||
var binding sessioncontracts.ControllerBinding
|
||||
ok, err := s.getJSON(ctx, controllerBindingKey(sessionID), &binding)
|
||||
if err != nil || !ok {
|
||||
return nil, err
|
||||
}
|
||||
return &binding, nil
|
||||
}
|
||||
|
||||
func (s *RedisLiveStateStore) ClearControllerBinding(ctx context.Context, sessionID string) error {
|
||||
return s.client.Del(ctx, controllerBindingKey(sessionID)).Err()
|
||||
}
|
||||
|
||||
func (s *RedisLiveStateStore) StoreAttachToken(ctx context.Context, claims sessioncontracts.AttachTokenClaims, ttl time.Duration) error {
|
||||
return s.setJSON(ctx, attachTokenKey(claims.Token), claims, ttl)
|
||||
}
|
||||
|
||||
func (s *RedisLiveStateStore) ConsumeAttachToken(ctx context.Context, token string) (*sessioncontracts.AttachTokenClaims, error) {
|
||||
key := attachTokenKey(token)
|
||||
payload, err := s.client.GetDel(ctx, key).Result()
|
||||
if err == redis.Nil {
|
||||
return nil, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("consume attach token: %w", err)
|
||||
}
|
||||
var claims sessioncontracts.AttachTokenClaims
|
||||
if err := json.Unmarshal([]byte(payload), &claims); err != nil {
|
||||
return nil, fmt.Errorf("decode attach token: %w", err)
|
||||
}
|
||||
return &claims, nil
|
||||
}
|
||||
|
||||
func (s *RedisLiveStateStore) TouchAttachmentHeartbeat(ctx context.Context, sessionID, attachmentID string, ttl time.Duration) error {
|
||||
return s.client.Set(ctx, attachmentHeartbeatKey(sessionID, attachmentID), time.Now().UTC().Format(time.RFC3339Nano), ttl).Err()
|
||||
}
|
||||
|
||||
func (s *RedisLiveStateStore) UpdateWorkerRoute(ctx context.Context, route WorkerRoute, ttl time.Duration) error {
|
||||
return s.setJSON(ctx, workerRouteKey(route.SessionID), route, ttl)
|
||||
}
|
||||
|
||||
func (s *RedisLiveStateStore) GetWorkerRoute(ctx context.Context, sessionID string) (*WorkerRoute, error) {
|
||||
var route WorkerRoute
|
||||
ok, err := s.getJSON(ctx, workerRouteKey(sessionID), &route)
|
||||
if err != nil || !ok {
|
||||
return nil, err
|
||||
}
|
||||
return &route, nil
|
||||
}
|
||||
|
||||
func (s *RedisLiveStateStore) DeleteWorkerRoute(ctx context.Context, sessionID string) error {
|
||||
return s.client.Del(ctx, workerRouteKey(sessionID)).Err()
|
||||
}
|
||||
|
||||
func (s *RedisLiveStateStore) setJSON(ctx context.Context, key string, value any, ttl time.Duration) error {
|
||||
payload, err := json.Marshal(value)
|
||||
if err != nil {
|
||||
return fmt.Errorf("encode redis payload: %w", err)
|
||||
}
|
||||
if err := s.client.Set(ctx, key, payload, ttl).Err(); err != nil {
|
||||
return fmt.Errorf("set redis key %s: %w", key, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *RedisLiveStateStore) getJSON(ctx context.Context, key string, dest any) (bool, error) {
|
||||
payload, err := s.client.Get(ctx, key).Result()
|
||||
if err == redis.Nil {
|
||||
return false, nil
|
||||
}
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("get redis key %s: %w", key, err)
|
||||
}
|
||||
if err := json.Unmarshal([]byte(payload), dest); err != nil {
|
||||
return false, fmt.Errorf("decode redis key %s: %w", key, err)
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
||||
// liveSessionKey builds the Redis key holding a session's live state:
// "live:session:<sessionID>".
func liveSessionKey(sessionID string) string {
	const prefix = "live:session:"
	return prefix + sessionID
}
|
||||
|
||||
// controllerBindingKey builds the Redis key holding a session's controller
// binding: "live:session:<sessionID>:controller".
func controllerBindingKey(sessionID string) string {
	const (
		prefix = "live:session:"
		suffix = ":controller"
	)
	return prefix + sessionID + suffix
}
|
||||
|
||||
// attachTokenKey builds the Redis key holding an attach token's claims:
// "live:attach:<token>".
func attachTokenKey(token string) string {
	const prefix = "live:attach:"
	return prefix + token
}
|
||||
|
||||
// attachmentHeartbeatKey builds the Redis key holding an attachment's last
// heartbeat: "live:session:<sessionID>:attachment:<attachmentID>:heartbeat".
func attachmentHeartbeatKey(sessionID, attachmentID string) string {
	const (
		prefix = "live:session:"
		middle = ":attachment:"
		suffix = ":heartbeat"
	)
	return prefix + sessionID + middle + attachmentID + suffix
}
|
||||
|
||||
// workerRouteKey builds the Redis key holding a session's worker route:
// "live:session:<sessionID>:worker-route".
func workerRouteKey(sessionID string) string {
	const (
		prefix = "live:session:"
		suffix = ":worker-route"
	)
	return prefix + sessionID + suffix
}
|
||||
@@ -0,0 +1,85 @@
|
||||
package sessionbroker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
|
||||
)
|
||||
|
||||
type RemoteSessionRepository interface {
|
||||
Create(ctx context.Context, session RemoteSession) error
|
||||
GetByID(ctx context.Context, sessionID string) (*RemoteSession, error)
|
||||
GetByIDForUpdate(ctx context.Context, sessionID string) (*RemoteSession, error)
|
||||
ListByController(ctx context.Context, userID string) ([]RemoteSession, error)
|
||||
CountLiveByResource(ctx context.Context, resourceID string) (int, error)
|
||||
ListDetachedExpired(ctx context.Context, before time.Time, limit int) ([]RemoteSession, error)
|
||||
UpdateState(ctx context.Context, params UpdateRemoteSessionStateParams) error
|
||||
}
|
||||
|
||||
type SessionAttachmentRepository interface {
|
||||
Create(ctx context.Context, attachment SessionAttachment) error
|
||||
GetByID(ctx context.Context, attachmentID string) (*SessionAttachment, error)
|
||||
GetByIDForUpdate(ctx context.Context, attachmentID string) (*SessionAttachment, error)
|
||||
ListByRemoteSession(ctx context.Context, remoteSessionID string) ([]SessionAttachment, error)
|
||||
ListActiveByRemoteSessionForUpdate(ctx context.Context, remoteSessionID string) ([]SessionAttachment, error)
|
||||
UpdateState(ctx context.Context, params UpdateSessionAttachmentStateParams) error
|
||||
Supersede(ctx context.Context, params SupersedeAttachmentParams) error
|
||||
}
|
||||
|
||||
type ResourcePolicyRepository interface {
|
||||
GetByResourceID(ctx context.Context, resourceID string) (*ResourcePolicy, error)
|
||||
Upsert(ctx context.Context, policy ResourcePolicy) error
|
||||
}
|
||||
|
||||
type AuditEventRepository interface {
|
||||
Create(ctx context.Context, event AuditEvent) error
|
||||
}
|
||||
|
||||
type Store interface {
|
||||
RemoteSessions() RemoteSessionRepository
|
||||
SessionAttachments() SessionAttachmentRepository
|
||||
ResourcePolicies() ResourcePolicyRepository
|
||||
ResourceRuntime() ResourceRuntimeRepository
|
||||
AuditEvents() AuditEventRepository
|
||||
Access() AccessRepository
|
||||
}
|
||||
|
||||
type Transactor interface {
|
||||
WithinTransaction(ctx context.Context, fn func(store Store) error) error
|
||||
}
|
||||
|
||||
type UpdateRemoteSessionStateParams struct {
|
||||
RemoteSessionID string
|
||||
State sessioncontracts.State
|
||||
WorkerID string
|
||||
DetachDeadlineAt *time.Time
|
||||
LastHeartbeatAt *time.Time
|
||||
TakeoverVersion int
|
||||
UpdatedAt time.Time
|
||||
}
|
||||
|
||||
type UpdateSessionAttachmentStateParams struct {
|
||||
AttachmentID string
|
||||
State AttachmentState
|
||||
DetachedAt *time.Time
|
||||
LastInputAt *time.Time
|
||||
UpdatedAt time.Time
|
||||
}
|
||||
|
||||
// SupersedeAttachmentParams carries the values written by
// SessionAttachmentRepository.Supersede.
type SupersedeAttachmentParams struct {
	// PreviousAttachmentID selects the attachment being superseded.
	PreviousAttachmentID string
	// NextAttachmentID is recorded in the superseded_by column.
	NextAttachmentID string
	// DetachedAt is the detached_at timestamp written to the previous
	// attachment.
	DetachedAt time.Time
	// UpdatedAt is the new updated_at timestamp.
	UpdatedAt time.Time
}
|
||||
|
||||
// AccessRepository answers device-trust and role questions about a user.
type AccessRepository interface {
	// IsTrustedDevice reports whether deviceID is trusted for userID.
	IsTrustedDevice(ctx context.Context, userID, deviceID string) (bool, error)

	// GetPlatformRole returns the platform-level role of userID.
	GetPlatformRole(ctx context.Context, userID string) (string, error)

	// GetOrganizationRole returns the role of userID within organizationID.
	// NOTE(review): the bool presumably reports whether a membership exists;
	// confirm against the implementation.
	GetOrganizationRole(ctx context.Context, organizationID, userID string) (string, bool, error)
}
|
||||
|
||||
type ResourceRuntimeRepository interface {
|
||||
GetByID(ctx context.Context, resourceID string) (*ResourceRuntimeSpec, error)
|
||||
}
|
||||
@@ -0,0 +1,138 @@
|
||||
package sessionbroker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"testing"
|
||||
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/config"
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/module"
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/secrets"
|
||||
workercontracts "github.com/example/remote-access-platform/backend/pkg/contracts/worker"
|
||||
)
|
||||
|
||||
type fakeSecretResolver struct {
|
||||
response *secrets.ResolvedResourceSecret
|
||||
err error
|
||||
request secrets.ResolveResourceSecretRequest
|
||||
}
|
||||
|
||||
func testAppConfig(env string) config.AppConfig {
|
||||
return config.AppConfig{Name: "rap-api-test", Env: env}
|
||||
}
|
||||
|
||||
func (r *fakeSecretResolver) ResolveForSession(_ context.Context, req secrets.ResolveResourceSecretRequest) (*secrets.ResolvedResourceSecret, error) {
|
||||
r.request = req
|
||||
if r.err != nil {
|
||||
return nil, r.err
|
||||
}
|
||||
return r.response, nil
|
||||
}
|
||||
|
||||
func TestRuntimeAssignmentMetadataMergesResolvedSecretWithoutMutatingSessionMetadata(t *testing.T) {
|
||||
resolver := &fakeSecretResolver{
|
||||
response: &secrets.ResolvedResourceSecret{
|
||||
Descriptor: secrets.ResourceSecretDescriptor{Version: 3},
|
||||
Payload: json.RawMessage(`{"username":"user","password":"secret","domain":"corp"}`),
|
||||
},
|
||||
}
|
||||
service := NewService(module.Dependencies{
|
||||
Config: module.Config{App: testAppConfig("production")},
|
||||
}, nil, nil, nil, nil, resolver)
|
||||
sessionMetadata := mustJSON(t, map[string]any{
|
||||
"resource": map[string]any{
|
||||
"id": "resource-1",
|
||||
"organization_id": "org-1",
|
||||
"secret_ref": "rap-secret://org/org-1/resources/resource-1/primary",
|
||||
"metadata": map[string]any{
|
||||
"rdp_host": "host",
|
||||
},
|
||||
},
|
||||
})
|
||||
session := RemoteSession{
|
||||
ID: "session-1",
|
||||
OrganizationID: "org-1",
|
||||
ResourceID: "resource-1",
|
||||
WorkerID: "worker-1",
|
||||
Metadata: sessionMetadata,
|
||||
}
|
||||
metadata, secretRef, version, err := service.runtimeAssignmentMetadata(context.Background(), session, &workercontracts.WorkerLease{LeaseID: "lease-1"})
|
||||
if err != nil {
|
||||
t.Fatalf("runtimeAssignmentMetadata returned error: %v", err)
|
||||
}
|
||||
if secretRef == "" || version != 3 {
|
||||
t.Fatalf("expected secret ref and version, got ref=%q version=%d", secretRef, version)
|
||||
}
|
||||
resource := metadata["resource"].(map[string]any)
|
||||
resourceMetadata := resource["metadata"].(map[string]any)
|
||||
if resourceMetadata["username"] != "user" || resourceMetadata["password"] != "secret" || resourceMetadata["domain"] != "corp" {
|
||||
t.Fatalf("resolved secret was not merged: %#v", resourceMetadata)
|
||||
}
|
||||
var persisted map[string]any
|
||||
if err := json.Unmarshal(session.Metadata, &persisted); err != nil {
|
||||
t.Fatalf("decode persisted metadata: %v", err)
|
||||
}
|
||||
persistedResource := persisted["resource"].(map[string]any)
|
||||
persistedMetadata := persistedResource["metadata"].(map[string]any)
|
||||
if _, ok := persistedMetadata["password"]; ok {
|
||||
t.Fatalf("session metadata was mutated with plaintext secret")
|
||||
}
|
||||
if resolver.request.LeaseID != "lease-1" || resolver.request.WorkerID != "worker-1" {
|
||||
t.Fatalf("resolver request missed lease/worker proof: %#v", resolver.request)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeAssignmentMetadataRequiresResolverInProduction(t *testing.T) {
|
||||
service := NewService(module.Dependencies{
|
||||
Config: module.Config{App: testAppConfig("production")},
|
||||
}, nil, nil, nil, nil)
|
||||
session := RemoteSession{
|
||||
ID: "session-1",
|
||||
OrganizationID: "org-1",
|
||||
ResourceID: "resource-1",
|
||||
WorkerID: "worker-1",
|
||||
Metadata: mustJSON(t, map[string]any{
|
||||
"resource": map[string]any{
|
||||
"secret_ref": "rap-secret://org/org-1/resources/resource-1/primary",
|
||||
},
|
||||
}),
|
||||
}
|
||||
_, _, _, err := service.runtimeAssignmentMetadata(context.Background(), session, &workercontracts.WorkerLease{LeaseID: "lease-1"})
|
||||
if !errors.Is(err, secrets.ErrSecretEncryptionKeyMissing) {
|
||||
t.Fatalf("expected missing resolver error, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRuntimeAssignmentMetadataAllowsDevelopmentMetadataWithoutResolver(t *testing.T) {
|
||||
service := NewService(module.Dependencies{
|
||||
Config: module.Config{App: testAppConfig("development")},
|
||||
}, nil, nil, nil, nil)
|
||||
session := RemoteSession{
|
||||
ID: "session-1",
|
||||
OrganizationID: "org-1",
|
||||
ResourceID: "resource-1",
|
||||
WorkerID: "worker-1",
|
||||
Metadata: mustJSON(t, map[string]any{
|
||||
"resource": map[string]any{
|
||||
"secret_ref": "rap-secret://org/org-1/resources/resource-1/primary",
|
||||
"metadata": map[string]any{
|
||||
"username": "dev-user",
|
||||
"password": "dev-password",
|
||||
},
|
||||
},
|
||||
}),
|
||||
}
|
||||
metadata, secretRef, _, err := service.runtimeAssignmentMetadata(context.Background(), session, nil)
|
||||
if err != nil {
|
||||
t.Fatalf("development metadata should not require resolver: %v", err)
|
||||
}
|
||||
if secretRef != "" {
|
||||
t.Fatalf("development fallback should not audit resolver use, got %q", secretRef)
|
||||
}
|
||||
resource := metadata["resource"].(map[string]any)
|
||||
resourceMetadata := resource["metadata"].(map[string]any)
|
||||
if resourceMetadata["password"] != "dev-password" {
|
||||
t.Fatalf("development metadata was not preserved")
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,391 @@
|
||||
package sessionbroker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"io"
|
||||
"log/slog"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/backend/internal/platform/module"
|
||||
sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
|
||||
workercontracts "github.com/example/remote-access-platform/backend/pkg/contracts/worker"
|
||||
)
|
||||
|
||||
func TestHandleWorkerConnectedIgnoresTerminalSession(t *testing.T) {
|
||||
service, store, live, _ := newStaleWorkerEventTestService()
|
||||
store.remote.sessions["session-1"] = RemoteSession{
|
||||
ID: "session-1",
|
||||
State: sessioncontracts.StateTerminated,
|
||||
WorkerID: "worker-1",
|
||||
}
|
||||
|
||||
if err := service.HandleWorkerConnected(context.Background(), "session-1"); err != nil {
|
||||
t.Fatalf("HandleWorkerConnected returned error for stale terminal event: %v", err)
|
||||
}
|
||||
if got := store.remote.sessions["session-1"].State; got != sessioncontracts.StateTerminated {
|
||||
t.Fatalf("stale connected event changed terminal state to %q", got)
|
||||
}
|
||||
if store.remote.updateCount != 0 {
|
||||
t.Fatalf("stale connected event updated authoritative session %d times", store.remote.updateCount)
|
||||
}
|
||||
if live.upsertCount != 0 {
|
||||
t.Fatalf("stale connected event recreated live state %d times", live.upsertCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateWorkerRenderTelemetryIgnoresTerminalSession(t *testing.T) {
|
||||
service, store, live, _ := newStaleWorkerEventTestService()
|
||||
store.remote.sessions["session-1"] = RemoteSession{
|
||||
ID: "session-1",
|
||||
State: sessioncontracts.StateTerminated,
|
||||
WorkerID: "worker-1",
|
||||
}
|
||||
|
||||
err := service.UpdateWorkerRenderTelemetry(context.Background(), "session-1", map[string]any{
|
||||
"render_state": "ready",
|
||||
"width": 1280,
|
||||
"height": 720,
|
||||
"frame_sequence": int64(99),
|
||||
"frame_data": "stale-frame",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("UpdateWorkerRenderTelemetry returned error for stale terminal event: %v", err)
|
||||
}
|
||||
if live.upsertCount != 0 {
|
||||
t.Fatalf("stale render event recreated live state %d times", live.upsertCount)
|
||||
}
|
||||
if live.sessions["session-1"] != nil {
|
||||
t.Fatalf("stale render event left live state behind: %#v", live.sessions["session-1"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestMarkSessionFailedTransitionsActiveSession(t *testing.T) {
|
||||
service, store, live, orchestrator := newStaleWorkerEventTestService()
|
||||
store.remote.sessions["session-1"] = RemoteSession{
|
||||
ID: "session-1",
|
||||
State: sessioncontracts.StateActive,
|
||||
WorkerID: "worker-1",
|
||||
TakeoverVersion: 3,
|
||||
}
|
||||
store.attachments.items["attachment-1"] = SessionAttachment{
|
||||
ID: "attachment-1",
|
||||
RemoteSessionID: "session-1",
|
||||
State: AttachmentStateActive,
|
||||
}
|
||||
live.sessions["session-1"] = &LiveSessionState{SessionID: "session-1", State: sessioncontracts.StateActive}
|
||||
|
||||
if err := service.MarkSessionFailed(context.Background(), MarkSessionFailedCommand{SessionID: "session-1", Reason: "worker_lost"}); err != nil {
|
||||
t.Fatalf("MarkSessionFailed returned error: %v", err)
|
||||
}
|
||||
if got := store.remote.sessions["session-1"].State; got != sessioncontracts.StateFailed {
|
||||
t.Fatalf("expected failed state, got %q", got)
|
||||
}
|
||||
if got := store.attachments.items["attachment-1"].State; got != AttachmentStateClosed {
|
||||
t.Fatalf("expected attachment closed, got %q", got)
|
||||
}
|
||||
if store.audit.createCount != 1 {
|
||||
t.Fatalf("expected one audit event, got %d", store.audit.createCount)
|
||||
}
|
||||
if live.sessions["session-1"] != nil {
|
||||
t.Fatal("expected failed session live state to be deleted")
|
||||
}
|
||||
if orchestrator.releaseCount != 1 {
|
||||
t.Fatalf("expected session lease release, got %d", orchestrator.releaseCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMarkSessionFailedAlreadyFailedIsIdempotent(t *testing.T) {
|
||||
service, store, _, _ := newStaleWorkerEventTestService()
|
||||
store.remote.sessions["session-1"] = RemoteSession{
|
||||
ID: "session-1",
|
||||
State: sessioncontracts.StateFailed,
|
||||
WorkerID: "worker-1",
|
||||
TakeoverVersion: 1,
|
||||
}
|
||||
|
||||
if err := service.MarkSessionFailed(context.Background(), MarkSessionFailedCommand{SessionID: "session-1", Reason: "duplicate_worker_failure"}); err != nil {
|
||||
t.Fatalf("duplicate MarkSessionFailed returned error: %v", err)
|
||||
}
|
||||
if got := store.remote.sessions["session-1"].State; got != sessioncontracts.StateFailed {
|
||||
t.Fatalf("duplicate terminal event changed state to %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func newStaleWorkerEventTestService() (*Service, *staleWorkerEventTestStore, *staleWorkerEventLiveState, *staleWorkerEventOrchestrator) {
|
||||
store := &staleWorkerEventTestStore{
|
||||
remote: &staleWorkerEventRemoteSessions{sessions: map[string]RemoteSession{}},
|
||||
attachments: &staleWorkerEventAttachments{items: map[string]SessionAttachment{}},
|
||||
policies: &staleWorkerEventPolicies{},
|
||||
audit: &staleWorkerEventAudit{},
|
||||
}
|
||||
live := &staleWorkerEventLiveState{sessions: map[string]*LiveSessionState{}}
|
||||
orchestrator := &staleWorkerEventOrchestrator{}
|
||||
service := NewService(module.Dependencies{
|
||||
Infra: module.Infra{Logger: slog.New(slog.NewTextHandler(io.Discard, nil))},
|
||||
}, store, staleWorkerEventTransactor{store: store}, live, orchestrator)
|
||||
service.now = func() time.Time { return time.Unix(100, 0).UTC() }
|
||||
return service, store, live, orchestrator
|
||||
}
|
||||
|
||||
type staleWorkerEventTransactor struct {
|
||||
store Store
|
||||
}
|
||||
|
||||
func (t staleWorkerEventTransactor) WithinTransaction(ctx context.Context, fn func(store Store) error) error {
|
||||
return fn(t.store)
|
||||
}
|
||||
|
||||
type staleWorkerEventTestStore struct {
|
||||
remote *staleWorkerEventRemoteSessions
|
||||
attachments *staleWorkerEventAttachments
|
||||
policies *staleWorkerEventPolicies
|
||||
audit *staleWorkerEventAudit
|
||||
}
|
||||
|
||||
// RemoteSessions returns the in-memory remote session fake.
func (s *staleWorkerEventTestStore) RemoteSessions() RemoteSessionRepository {
	return s.remote
}
|
||||
func (s *staleWorkerEventTestStore) SessionAttachments() SessionAttachmentRepository {
|
||||
return s.attachments
|
||||
}
|
||||
// ResourcePolicies returns the resource policy fake.
func (s *staleWorkerEventTestStore) ResourcePolicies() ResourcePolicyRepository {
	return s.policies
}
|
||||
func (s *staleWorkerEventTestStore) ResourceRuntime() ResourceRuntimeRepository {
|
||||
return staleWorkerEventResourceRuntime{}
|
||||
}
|
||||
// AuditEvents returns the counting audit fake.
func (s *staleWorkerEventTestStore) AuditEvents() AuditEventRepository {
	return s.audit
}
|
||||
// Access returns a fresh stateless access fake on each call.
func (s *staleWorkerEventTestStore) Access() AccessRepository {
	var access staleWorkerEventAccess
	return access
}
|
||||
|
||||
type staleWorkerEventRemoteSessions struct {
|
||||
sessions map[string]RemoteSession
|
||||
updateCount int
|
||||
}
|
||||
|
||||
func (r *staleWorkerEventRemoteSessions) Create(_ context.Context, session RemoteSession) error {
|
||||
r.sessions[session.ID] = session
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *staleWorkerEventRemoteSessions) GetByID(_ context.Context, sessionID string) (*RemoteSession, error) {
|
||||
session, ok := r.sessions[sessionID]
|
||||
if !ok {
|
||||
return nil, nil
|
||||
}
|
||||
return &session, nil
|
||||
}
|
||||
|
||||
func (r *staleWorkerEventRemoteSessions) GetByIDForUpdate(ctx context.Context, sessionID string) (*RemoteSession, error) {
|
||||
return r.GetByID(ctx, sessionID)
|
||||
}
|
||||
|
||||
func (r *staleWorkerEventRemoteSessions) ListByController(_ context.Context, _ string) ([]RemoteSession, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (r *staleWorkerEventRemoteSessions) CountLiveByResource(_ context.Context, _ string) (int, error) {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
func (r *staleWorkerEventRemoteSessions) ListDetachedExpired(_ context.Context, _ time.Time, _ int) ([]RemoteSession, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (r *staleWorkerEventRemoteSessions) UpdateState(_ context.Context, params UpdateRemoteSessionStateParams) error {
|
||||
session := r.sessions[params.RemoteSessionID]
|
||||
session.State = params.State
|
||||
session.WorkerID = params.WorkerID
|
||||
session.DetachDeadlineAt = params.DetachDeadlineAt
|
||||
session.LastHeartbeatAt = params.LastHeartbeatAt
|
||||
session.TakeoverVersion = params.TakeoverVersion
|
||||
session.UpdatedAt = params.UpdatedAt
|
||||
r.sessions[params.RemoteSessionID] = session
|
||||
r.updateCount++
|
||||
return nil
|
||||
}
|
||||
|
||||
type staleWorkerEventAttachments struct {
|
||||
items map[string]SessionAttachment
|
||||
}
|
||||
|
||||
func (r *staleWorkerEventAttachments) Create(_ context.Context, attachment SessionAttachment) error {
|
||||
r.items[attachment.ID] = attachment
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *staleWorkerEventAttachments) GetByID(_ context.Context, attachmentID string) (*SessionAttachment, error) {
|
||||
attachment, ok := r.items[attachmentID]
|
||||
if !ok {
|
||||
return nil, nil
|
||||
}
|
||||
return &attachment, nil
|
||||
}
|
||||
|
||||
func (r *staleWorkerEventAttachments) GetByIDForUpdate(ctx context.Context, attachmentID string) (*SessionAttachment, error) {
|
||||
return r.GetByID(ctx, attachmentID)
|
||||
}
|
||||
|
||||
func (r *staleWorkerEventAttachments) ListByRemoteSession(_ context.Context, remoteSessionID string) ([]SessionAttachment, error) {
|
||||
attachments := make([]SessionAttachment, 0)
|
||||
for _, attachment := range r.items {
|
||||
if attachment.RemoteSessionID == remoteSessionID {
|
||||
attachments = append(attachments, attachment)
|
||||
}
|
||||
}
|
||||
return attachments, nil
|
||||
}
|
||||
|
||||
func (r *staleWorkerEventAttachments) ListActiveByRemoteSessionForUpdate(ctx context.Context, remoteSessionID string) ([]SessionAttachment, error) {
|
||||
return r.ListByRemoteSession(ctx, remoteSessionID)
|
||||
}
|
||||
|
||||
func (r *staleWorkerEventAttachments) UpdateState(_ context.Context, params UpdateSessionAttachmentStateParams) error {
|
||||
attachment := r.items[params.AttachmentID]
|
||||
attachment.State = params.State
|
||||
attachment.DetachedAt = params.DetachedAt
|
||||
attachment.LastInputAt = params.LastInputAt
|
||||
attachment.UpdatedAt = params.UpdatedAt
|
||||
r.items[params.AttachmentID] = attachment
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *staleWorkerEventAttachments) Supersede(_ context.Context, params SupersedeAttachmentParams) error {
|
||||
attachment := r.items[params.PreviousAttachmentID]
|
||||
attachment.State = AttachmentStateSuperseded
|
||||
attachment.SupersededBy = ¶ms.NextAttachmentID
|
||||
attachment.DetachedAt = ¶ms.DetachedAt
|
||||
attachment.UpdatedAt = params.UpdatedAt
|
||||
r.items[params.PreviousAttachmentID] = attachment
|
||||
return nil
|
||||
}
|
||||
|
||||
// staleWorkerEventPolicies is a stateless ResourcePolicyRepository stub.
type staleWorkerEventPolicies struct{}
|
||||
|
||||
func (r *staleWorkerEventPolicies) GetByResourceID(_ context.Context, _ string) (*ResourcePolicy, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (r *staleWorkerEventPolicies) Upsert(_ context.Context, _ ResourcePolicy) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// staleWorkerEventAudit is an AuditEventRepository fake that only counts calls.
type staleWorkerEventAudit struct {
	// createCount is the number of Create calls received so far.
	createCount int
}
|
||||
|
||||
func (r *staleWorkerEventAudit) Create(_ context.Context, _ AuditEvent) error {
|
||||
r.createCount++
|
||||
return nil
|
||||
}
|
||||
|
||||
// staleWorkerEventResourceRuntime is a stateless ResourceRuntimeRepository stub.
type staleWorkerEventResourceRuntime struct{}
|
||||
|
||||
func (staleWorkerEventResourceRuntime) GetByID(_ context.Context, _ string) (*ResourceRuntimeSpec, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// staleWorkerEventAccess is a stateless AccessRepository stub.
type staleWorkerEventAccess struct{}
|
||||
|
||||
func (staleWorkerEventAccess) IsTrustedDevice(_ context.Context, _, _ string) (bool, error) {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
func (staleWorkerEventAccess) GetPlatformRole(_ context.Context, _ string) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (staleWorkerEventAccess) GetOrganizationRole(_ context.Context, _, _ string) (string, bool, error) {
|
||||
return "", false, nil
|
||||
}
|
||||
|
||||
type staleWorkerEventLiveState struct {
|
||||
sessions map[string]*LiveSessionState
|
||||
upsertCount int
|
||||
}
|
||||
|
||||
func (s *staleWorkerEventLiveState) UpsertSession(_ context.Context, state LiveSessionState) error {
|
||||
copied := state
|
||||
s.sessions[state.SessionID] = &copied
|
||||
s.upsertCount++
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *staleWorkerEventLiveState) GetSession(_ context.Context, sessionID string) (*LiveSessionState, error) {
|
||||
state := s.sessions[sessionID]
|
||||
if state == nil {
|
||||
return nil, nil
|
||||
}
|
||||
copied := *state
|
||||
return &copied, nil
|
||||
}
|
||||
|
||||
func (s *staleWorkerEventLiveState) DeleteSession(_ context.Context, sessionID string) error {
|
||||
delete(s.sessions, sessionID)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *staleWorkerEventLiveState) BindController(_ context.Context, _ sessioncontracts.ControllerBinding, _ time.Duration) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *staleWorkerEventLiveState) GetControllerBinding(_ context.Context, _ string) (*sessioncontracts.ControllerBinding, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (s *staleWorkerEventLiveState) ClearControllerBinding(_ context.Context, _ string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *staleWorkerEventLiveState) StoreAttachToken(_ context.Context, _ sessioncontracts.AttachTokenClaims, _ time.Duration) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *staleWorkerEventLiveState) ConsumeAttachToken(_ context.Context, _ string) (*sessioncontracts.AttachTokenClaims, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (s *staleWorkerEventLiveState) TouchAttachmentHeartbeat(_ context.Context, _, _ string, _ time.Duration) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *staleWorkerEventLiveState) UpdateWorkerRoute(_ context.Context, _ WorkerRoute, _ time.Duration) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *staleWorkerEventLiveState) GetWorkerRoute(_ context.Context, _ string) (*WorkerRoute, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (s *staleWorkerEventLiveState) DeleteWorkerRoute(_ context.Context, _ string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// staleWorkerEventOrchestrator is a fake of the worker orchestrator passed to
// NewService. It only tracks how often a session lease was released.
type staleWorkerEventOrchestrator struct {
	// releaseCount is the number of ReleaseSessionLease calls received so far.
	releaseCount int
}
|
||||
|
||||
func (o *staleWorkerEventOrchestrator) Reserve(_ context.Context, _ workercontracts.AttachRequest) (*workercontracts.WorkerLease, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (o *staleWorkerEventOrchestrator) GetSessionLease(_ context.Context, _ string) (*workercontracts.WorkerLease, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (o *staleWorkerEventOrchestrator) ReleaseSessionLease(_ context.Context, _ string) error {
|
||||
o.releaseCount++
|
||||
return nil
|
||||
}
|
||||
|
||||
func (o *staleWorkerEventOrchestrator) PrepareAttachment(_ context.Context, _ RemoteSession, _ SessionAttachment, _ map[string]any) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (o *staleWorkerEventOrchestrator) NotifyDetachment(_ context.Context, _ RemoteSession, _ SessionAttachment) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (o *staleWorkerEventOrchestrator) TerminateRemoteSession(_ context.Context, _, _ string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (o *staleWorkerEventOrchestrator) ValidateSessionRuntime(_ context.Context, _, _ string) (bool, string, error) {
|
||||
return true, "", nil
|
||||
}
|
||||
@@ -0,0 +1,44 @@
|
||||
package sessionbroker
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
sessioncontracts "github.com/example/remote-access-platform/backend/pkg/contracts/session"
|
||||
)
|
||||
|
||||
var allowedTransitions = map[sessioncontracts.State]map[sessioncontracts.State]struct{}{
|
||||
sessioncontracts.StateStarting: {
|
||||
sessioncontracts.StateActive: {},
|
||||
sessioncontracts.StateFailed: {},
|
||||
sessioncontracts.StateTerminated: {},
|
||||
},
|
||||
sessioncontracts.StateActive: {
|
||||
sessioncontracts.StateDetached: {},
|
||||
sessioncontracts.StateReconnecting: {},
|
||||
sessioncontracts.StateFailed: {},
|
||||
sessioncontracts.StateTerminated: {},
|
||||
},
|
||||
sessioncontracts.StateDetached: {
|
||||
sessioncontracts.StateReconnecting: {},
|
||||
sessioncontracts.StateTerminated: {},
|
||||
sessioncontracts.StateFailed: {},
|
||||
},
|
||||
sessioncontracts.StateReconnecting: {
|
||||
sessioncontracts.StateActive: {},
|
||||
sessioncontracts.StateDetached: {},
|
||||
sessioncontracts.StateFailed: {},
|
||||
sessioncontracts.StateTerminated: {},
|
||||
},
|
||||
}
|
||||
|
||||
func validateTransition(from, to sessioncontracts.State) error {
|
||||
if from == to {
|
||||
return nil
|
||||
}
|
||||
if allowed, ok := allowedTransitions[from]; ok {
|
||||
if _, ok := allowed[to]; ok {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("invalid session state transition: %s -> %s", from, to)
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,29 @@
|
||||
package worker
|
||||
|
||||
// SessionEvent is the JSON envelope for a session-scoped event tied to a
// worker.
type SessionEvent struct {
	// Type names the kind of event. NOTE(review): presumably one of the
	// SessionEvent* constants in this package; confirm against producers.
	Type string `json:"type"`
	// SessionID identifies the session the event refers to.
	SessionID string `json:"session_id"`
	// WorkerID identifies the worker associated with the event.
	WorkerID string `json:"worker_id"`
	// Payload carries optional event-specific data; omitted from JSON when empty.
	Payload map[string]any `json:"payload,omitempty"`
}
|
||||
|
||||
// Session event type identifiers. NOTE(review): presumably the values
// carried in SessionEvent.Type; confirm against producers and consumers.
const (
	// Session lifecycle.
	SessionEventConnected  = "session_connected"
	SessionEventHeartbeat  = "session_heartbeat"
	SessionEventFailed     = "session_failed"
	SessionEventTerminated = "session_terminated"

	// Display and rendering.
	SessionEventDisplayReady  = "session_display_ready"
	SessionEventRenderReady   = "session_render_ready"
	SessionEventRenderDirty   = "session_render_dirty"
	SessionEventRenderResized = "session_render_resized"
	SessionEventCursorUpdated = "session_cursor_updated"
	SessionEventFrame         = "session_frame"

	// Clipboard.
	SessionEventClipboardText = "session_clipboard_text"

	// File transfer.
	SessionEventFileUploaded          = "session_file_upload_completed"
	SessionEventFileDownloadAvailable = "session_file_download_available"
	SessionEventFileDownloadChunk     = "session_file_download_chunk"
	SessionEventFileDownloadProgress  = "session_file_download_progress"
	SessionEventFileDownloadCompleted = "session_file_download_completed"
	SessionEventFileDownloadFailed    = "session_file_download_failed"
	SessionEventFileDownloadBlocked   = "session_file_download_blocked"
)
|
||||
@@ -0,0 +1,52 @@
|
||||
package worker
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"time"
|
||||
|
||||
"github.com/example/remote-access-platform/backend/internal/modules/sessionbroker"
|
||||
)
|
||||
|
||||
type LeaseMonitor struct {
|
||||
service *Service
|
||||
broker *sessionbroker.Service
|
||||
interval time.Duration
|
||||
}
|
||||
|
||||
func NewLeaseMonitor(service *Service, broker *sessionbroker.Service, interval time.Duration) *LeaseMonitor {
|
||||
if interval <= 0 {
|
||||
interval = 15 * time.Second
|
||||
}
|
||||
return &LeaseMonitor{
|
||||
service: service,
|
||||
broker: broker,
|
||||
interval: interval,
|
||||
}
|
||||
}
|
||||
|
||||
func (m *LeaseMonitor) Run(ctx context.Context) error {
|
||||
ticker := time.NewTicker(m.interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil
|
||||
case <-ticker.C:
|
||||
stale, err := m.service.RecoverStaleLeases(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, lease := range stale {
|
||||
err := m.broker.MarkSessionFailed(ctx, sessionbroker.MarkSessionFailedCommand{
|
||||
SessionID: lease.SessionID,
|
||||
Reason: "worker_lease_stale_or_worker_missing",
|
||||
})
|
||||
if err != nil && !errors.Is(err, sessionbroker.ErrSessionNotFound) && !errors.Is(err, sessionbroker.ErrSessionNotTerminable) {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user