From 4ebc6629e62474ec5ca0f4ba54f14c29da4391db Mon Sep 17 00:00:00 2001 From: Mikhail Date: Sat, 16 May 2026 10:51:06 +0300 Subject: [PATCH] Pin QUIC fabric endpoint certificates --- .../rap-node-agent/cmd/rap-node-agent/main.go | 66 ++++++++++++++---- .../cmd/rap-node-agent/main_test.go | 3 +- .../internal/mesh/fabric_quic_transport.go | 33 ++++++++- .../mesh/fabric_quic_transport_test.go | 69 ++++++++++++++++++- .../internal/mesh/fabric_transport.go | 1 + .../DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md | 3 + 6 files changed, 157 insertions(+), 18 deletions(-) diff --git a/agents/rap-node-agent/cmd/rap-node-agent/main.go b/agents/rap-node-agent/cmd/rap-node-agent/main.go index 4c2db34..9af9e27 100644 --- a/agents/rap-node-agent/cmd/rap-node-agent/main.go +++ b/agents/rap-node-agent/cmd/rap-node-agent/main.go @@ -4,9 +4,11 @@ import ( "context" "crypto/rand" "crypto/rsa" + "crypto/sha256" "crypto/tls" "crypto/x509" "crypto/x509/pkix" + "encoding/hex" "encoding/json" "fmt" "log" @@ -373,6 +375,7 @@ type syntheticMeshState struct { StopListener func() QUICFabricServer *mesh.QUICFabricServer QUICFabricListenAddr string + QUICFabricCertSHA256 string QUICFabricError string ConfigLoadError string } @@ -788,7 +791,7 @@ func startSyntheticMeshEndpoint(ctx context.Context, _ context.CancelFunc, cfg c listenerCfg := meshListenerRuntimeConfig(cfg, loadedConfig.MeshListener) listenerReport, stopListener := startSyntheticMeshHTTPServer(ctx, listenerCfg, identity, dynamicListenerHandler, len(peerEndpoints), len(routes), gateEnabled, runtimeEnabled) vpnFabricSessionPeers := mesh.NewFabricSessionPeerManager() - quicFabricServer, quicFabricAddr, quicFabricErr := startQUICFabricEndpoint(ctx, cfg, identity) + quicFabricServer, quicFabricAddr, quicFabricCertSHA256, quicFabricErr := startQUICFabricEndpoint(ctx, cfg, identity) return &syntheticMeshState{ Runtime: runtime, Routes: routes, @@ -827,6 +830,7 @@ func startSyntheticMeshEndpoint(ctx context.Context, _ context.CancelFunc, cfg c StopListener: stopListener, QUICFabricServer: quicFabricServer, QUICFabricListenAddr: quicFabricAddr, + QUICFabricCertSHA256: quicFabricCertSHA256, QUICFabricError: errorString(quicFabricErr), ConfigLoadError: errorString(err), }, stopListener, nil @@ -1166,16 +1170,16 @@ func bindSyntheticMeshListener(cfg config.Config) (net.Listener, string, bool, e return nil, "", false, err } -func startQUICFabricEndpoint(ctx context.Context, cfg config.Config, identity state.Identity) (*mesh.QUICFabricServer, string, error) { +func startQUICFabricEndpoint(ctx context.Context, cfg config.Config, identity state.Identity) (*mesh.QUICFabricServer, string, string, error) { if !cfg.MeshQUICFabricEnabled { - return nil, "", nil + return nil, "", "", nil } if strings.TrimSpace(cfg.MeshQUICFabricListenAddr) == "" { - return nil, "", fmt.Errorf("quic fabric enabled but listen addr is empty") + return nil, "", "", fmt.Errorf("quic fabric enabled but listen addr is empty") } - tlsConfig, err := quicFabricTLSConfig(identity) + tlsConfig, certSHA256, err := quicFabricTLSConfig(identity) if err != nil { - return nil, "", err + return nil, "", "", err } server, err := mesh.StartQUICFabricServer(ctx, mesh.QUICFabricServerConfig{ ListenAddr: cfg.MeshQUICFabricListenAddr, @@ -1190,20 +1194,20 @@ func startQUICFabricEndpoint(ctx context.Context, cfg config.Config, identity st }, }) if err != nil { - return nil, "", err + return nil, "", "", err } addr := "" if server.Addr() != nil { addr = server.Addr().String() } log.Printf("quic fabric endpoint enabled: listen_addr=%s effective_addr=%s node_id=%s cluster_id=%s", cfg.MeshQUICFabricListenAddr, addr, identity.NodeID, identity.ClusterID) - return server, addr, nil + return server, addr, certSHA256, nil } -func quicFabricTLSConfig(identity state.Identity) (*tls.Config, error) { +func quicFabricTLSConfig(identity state.Identity) (*tls.Config, string, error) { key, err := rsa.GenerateKey(rand.Reader, 2048) if err != nil { - return nil, err + return nil, "", err } commonName := firstNonEmpty(identity.NodeID, "rap-fabric-node") template := x509.Certificate{ @@ -1217,15 +1221,16 @@ func quicFabricTLSConfig(identity state.Identity) (*tls.Config, error) { } certDER, err := x509.CreateCertificate(rand.Reader, &template, &template, &key.PublicKey, key) if err != nil { - return nil, err + return nil, "", err } + sum := sha256.Sum256(certDER) return &tls.Config{ Certificates: []tls.Certificate{{ Certificate: [][]byte{certDER}, PrivateKey: key, }}, NextProtos: []string{"rap-fabric-data-session-v1"}, - }, nil + }, hex.EncodeToString(sum[:]), nil } func isAddressInUse(err error) bool { @@ -1825,17 +1830,20 @@ func applyQUICFabricConfigIfChanged(ctx context.Context, cfg config.Config, iden _ = meshState.QUICFabricServer.Close() meshState.QUICFabricServer = nil meshState.QUICFabricListenAddr = "" + meshState.QUICFabricCertSHA256 = "" } if !cfg.MeshQUICFabricEnabled { meshState.QUICFabricError = "" + meshState.QUICFabricCertSHA256 = "" return } if meshState.QUICFabricServer != nil { return } - server, addr, err := startQUICFabricEndpoint(ctx, cfg, identity) + server, addr, certSHA256, err := startQUICFabricEndpoint(ctx, cfg, identity) meshState.QUICFabricServer = server meshState.QUICFabricListenAddr = addr + meshState.QUICFabricCertSHA256 = certSHA256 meshState.QUICFabricError = errorString(err) if err != nil { log.Printf("quic fabric endpoint unavailable: listen_addr=%s node_id=%s cluster_id=%s err=%v", cfg.MeshQUICFabricListenAddr, identity.NodeID, identity.ClusterID, err) @@ -2610,6 +2618,7 @@ func heartbeatPayload(cfg config.Config, identity state.Identity, meshState *syn "enabled": meshState.QUICFabricServer != nil, "listen_addr": cfg.MeshQUICFabricListenAddr, "effective_listen_addr": meshState.QUICFabricListenAddr, + "tls_cert_sha256": meshState.QUICFabricCertSHA256, "error": meshState.QUICFabricError, } } @@ -3860,6 +3869,7 @@ func advertisedEndpointCandidates(cfg config.Config, identity state.Identity, me Region: cfg.MeshRegion, Priority: 5, PolicyTags: []string{"fast-path"}, + Metadata: quicFabricEndpointMetadata(meshState.QUICFabricCertSHA256), }) } candidates = append(candidates, interfaceEndpointCandidates(cfg, identity, meshState, observedAt)...) @@ -3916,6 +3926,18 @@ func advertisedEndpointCandidates(cfg config.Config, identity state.Identity, me return candidates, nil } +func quicFabricEndpointMetadata(certSHA256 string) json.RawMessage { + certSHA256 = strings.TrimSpace(certSHA256) + if certSHA256 == "" { + return nil + } + payload, err := json.Marshal(map[string]string{"tls_cert_sha256": certSHA256}) + if err != nil { + return nil + } + return payload +} + func interfaceEndpointCandidates(cfg config.Config, identity state.Identity, meshState *syntheticMeshState, observedAt time.Time) []mesh.PeerEndpointCandidate { if meshState == nil { return nil @@ -4675,8 +4697,9 @@ func vpnFabricSessionTarget(meshState *syntheticMeshState, nextHop string) (mesh continue } return mesh.FabricTransportTarget{ - Endpoint: endpoint, - Transport: item.Candidate.Transport, + Endpoint: endpoint, + Transport: item.Candidate.Transport, + PeerCertSHA256: endpointCandidateTLSCertSHA256(item.Candidate), }, true } } @@ -4687,6 +4710,19 @@ func vpnFabricSessionTarget(meshState *syntheticMeshState, nextHop string) (mesh return mesh.FabricTransportTarget{Endpoint: endpoint}, true } +func endpointCandidateTLSCertSHA256(candidate mesh.PeerEndpointCandidate) string { + if len(candidate.Metadata) == 0 { + return "" + } + var metadata struct { + TLSCertSHA256 string `json:"tls_cert_sha256"` + } + if err := json.Unmarshal(candidate.Metadata, &metadata); err != nil { + return "" + } + return strings.TrimSpace(metadata.TLSCertSHA256) +} + func fabricSessionGatewayToken(identity state.Identity, assignment client.NodeVPNAssignment, nextHop string) string { tokenParts := []string{ "rap_fsn_vpn", diff --git a/agents/rap-node-agent/cmd/rap-node-agent/main_test.go b/agents/rap-node-agent/cmd/rap-node-agent/main_test.go index d3c6293..d4bfa00 100644 --- a/agents/rap-node-agent/cmd/rap-node-agent/main_test.go +++ b/agents/rap-node-agent/cmd/rap-node-agent/main_test.go @@ -782,6 +782,7 @@ func TestVPNFabricSessionTargetPrefersRankedQUICCandidate(t *testing.T) { ConnectivityMode: "direct", Priority: 10, LastVerifiedAt: &now, + Metadata: json.RawMessage(`{"tls_cert_sha256":"abcdef"}`), }, }, }, @@ -789,7 +790,7 @@ func TestVPNFabricSessionTargetPrefersRankedQUICCandidate(t *testing.T) { if !ok { t.Fatal("target missing") } - if target.Endpoint != "quic://node-b.example.test:19443" || target.Transport != "direct_quic" { + if target.Endpoint != "quic://node-b.example.test:19443" || target.Transport != "direct_quic" || target.PeerCertSHA256 != "abcdef" { t.Fatalf("target = %+v, want direct quic candidate", target) } } diff --git a/agents/rap-node-agent/internal/mesh/fabric_quic_transport.go b/agents/rap-node-agent/internal/mesh/fabric_quic_transport.go index 96a30f5..40cf860 100644 --- a/agents/rap-node-agent/internal/mesh/fabric_quic_transport.go +++ b/agents/rap-node-agent/internal/mesh/fabric_quic_transport.go @@ -2,7 +2,10 @@ package mesh import ( "context" + "crypto/sha256" "crypto/tls" + "crypto/x509" + "encoding/hex" "fmt" "strings" "sync" @@ -34,6 +37,34 @@ func NewQUICFabricTransport(config *quic.Config) *QUICFabricTransport { return &QUICFabricTransport{Config: config} } +func quicTLSConfigForTarget(target FabricTransportTarget) *tls.Config { + expectedFingerprint := normalizeCertSHA256(target.PeerCertSHA256) + config := &tls.Config{NextProtos: []string{fabricQUICNextProto}} + if expectedFingerprint == "" { + return config + } + config.InsecureSkipVerify = true + config.VerifyPeerCertificate = func(rawCerts [][]byte, _ [][]*x509.Certificate) error { + if len(rawCerts) == 0 { + return fmt.Errorf("quic peer certificate missing") + } + sum := sha256.Sum256(rawCerts[0]) + actual := hex.EncodeToString(sum[:]) + if actual != expectedFingerprint { + return fmt.Errorf("quic peer certificate fingerprint mismatch") + } + return nil + } + return config +} + +func normalizeCertSHA256(value string) string { + value = strings.ToLower(strings.TrimSpace(value)) + value = strings.ReplaceAll(value, "sha256:", "") + value = strings.ReplaceAll(value, ":", "") + return value +} + func (t *QUICFabricTransport) Connect(ctx context.Context, target FabricTransportTarget) (FabricTransportSession, error) { if target.Endpoint == "" { return nil, fmt.Errorf("quic fabric endpoint is required") @@ -41,7 +72,7 @@ func (t *QUICFabricTransport) Connect(ctx context.Context, target FabricTranspor target.Endpoint = strings.TrimPrefix(strings.TrimSpace(target.Endpoint), "quic://") tlsConfig := target.TLSConfig if tlsConfig == nil { - tlsConfig = &tls.Config{NextProtos: []string{fabricQUICNextProto}} + tlsConfig = quicTLSConfigForTarget(target) } else { tlsConfig = tlsConfig.Clone() if len(tlsConfig.NextProtos) == 0 { diff --git a/agents/rap-node-agent/internal/mesh/fabric_quic_transport_test.go b/agents/rap-node-agent/internal/mesh/fabric_quic_transport_test.go index 7d40513..d765487 100644 --- a/agents/rap-node-agent/internal/mesh/fabric_quic_transport_test.go +++ b/agents/rap-node-agent/internal/mesh/fabric_quic_transport_test.go @@ -4,11 +4,14 @@ import ( "context" "crypto/rand" "crypto/rsa" + "crypto/sha256" "crypto/tls" "crypto/x509" "crypto/x509/pkix" + "encoding/hex" "encoding/pem" "math/big" + "strings" "testing" "time" @@ -102,6 +105,56 @@ func TestQUICFabricTransportDataAck(t *testing.T) { } } +func TestQUICFabricTransportVerifiesPinnedCertificate(t *testing.T) { + tlsConfig := testQUICTLSConfig(t) + listener := startQUICFabricEchoServerWithTLS(t, tlsConfig) + defer listener.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + session, err := NewQUICFabricTransport(nil).Connect(ctx, FabricTransportTarget{ + Endpoint: listener.Addr().String(), + PeerCertSHA256: testQUICCertSHA256(t, tlsConfig), + Timeout: time.Second, + InboundBuffer: 4, + ErrorBuffer: 4, + }) + if err != nil { + t.Fatalf("connect quic fabric with pinned certificate: %v", err) + } + defer session.Close() + + if err := session.Send(ctx, fabricproto.Frame{Type: fabricproto.FramePing, Sequence: 43, Payload: []byte("pin")}); err != nil { + t.Fatalf("send ping: %v", err) + } + select { + case frame := <-session.Frames(): + if frame.Type != fabricproto.FramePong || frame.Sequence != 43 || string(frame.Payload) != "pin" { + t.Fatalf("frame = %+v", frame) + } + case err := <-session.Errors(): + t.Fatalf("session error: %v", err) + case <-ctx.Done(): + t.Fatal(ctx.Err()) + } +} + +func TestQUICFabricTransportRejectsPinnedCertificateMismatch(t *testing.T) { + listener := startQUICFabricEchoServer(t) + defer listener.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + _, err := NewQUICFabricTransport(nil).Connect(ctx, FabricTransportTarget{ + Endpoint: listener.Addr().String(), + PeerCertSHA256: strings.Repeat("0", 64), + Timeout: time.Second, + }) + if err == nil { + t.Fatal("connect succeeded with mismatched certificate pin") + } +} + func TestQUICFabricServerHandlesFabricFrames(t *testing.T) { var events []FabricSessionEventLogEntry server, err := StartQUICFabricServer(context.Background(), QUICFabricServerConfig{ @@ -152,7 +205,12 @@ func TestQUICFabricServerHandlesFabricFrames(t *testing.T) { func startQUICFabricEchoServer(t *testing.T) *quic.Listener { t.Helper() - listener, err := quic.ListenAddr("127.0.0.1:0", testQUICTLSConfig(t), &quic.Config{EnableDatagrams: true}) + return startQUICFabricEchoServerWithTLS(t, testQUICTLSConfig(t)) +} + +func startQUICFabricEchoServerWithTLS(t *testing.T, tlsConfig *tls.Config) *quic.Listener { + t.Helper() + listener, err := quic.ListenAddr("127.0.0.1:0", tlsConfig, &quic.Config{EnableDatagrams: true}) if err != nil { t.Fatalf("listen quic: %v", err) } @@ -189,6 +247,15 @@ func startQUICFabricEchoServer(t *testing.T) *quic.Listener { return listener } +func testQUICCertSHA256(t *testing.T, tlsConfig *tls.Config) string { + t.Helper() + if len(tlsConfig.Certificates) == 0 || len(tlsConfig.Certificates[0].Certificate) == 0 { + t.Fatal("test tls config has no certificate") + } + sum := sha256.Sum256(tlsConfig.Certificates[0].Certificate[0]) + return hex.EncodeToString(sum[:]) +} + func testQUICTLSConfig(t *testing.T) *tls.Config { t.Helper() key, err := rsa.GenerateKey(rand.Reader, 2048) diff --git a/agents/rap-node-agent/internal/mesh/fabric_transport.go b/agents/rap-node-agent/internal/mesh/fabric_transport.go index 7791a31..d2fa56b 100644 --- a/agents/rap-node-agent/internal/mesh/fabric_transport.go +++ b/agents/rap-node-agent/internal/mesh/fabric_transport.go @@ -31,6 +31,7 @@ type FabricTransportTarget struct { Token string Header http.Header TLSConfig *tls.Config + PeerCertSHA256 string Timeout time.Duration MaxPayload int OutboundBuffer int diff --git a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md index 5b812c1..6b1725a 100644 --- a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md +++ b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md @@ -313,6 +313,9 @@ compatibility candidates for fabric sessions. VPN fabric-session gateway transport now consumes ranked endpoint candidates, so dataplane sessions can select QUIC fast-path candidates and fall back to legacy peer endpoints when the control plane has not published candidates yet. +The temporary self-signed QUIC listener advertises its SHA-256 certificate +fingerprint in endpoint metadata, and the QUIC client can pin that fingerprint +instead of disabling verification while the cluster CA path is being finished. Deliverables: