diff --git a/api/v1alpha1/seinode_types.go b/api/v1alpha1/seinode_types.go index 01f365a1..ded15077 100644 --- a/api/v1alpha1/seinode_types.go +++ b/api/v1alpha1/seinode_types.go @@ -290,24 +290,28 @@ const ( // ConditionSeiNodePaused mirrors spec.paused: True when paused. ConditionSeiNodePaused = "Paused" - // ConditionStateSyncReady gates the state-sync-bearing plan. Always-present - // once reconciled. True means canonical syncers are configured and the plan - // may proceed; False fails closed (no state-sync plan built, and peers are - // never used as witnesses). It is a configured-count gate: witness - // reliability comes from curating the canonical-syncer set, and the sidecar - // establishes the trust point from them as it does today. + // ConditionStateSyncReady gates the ConfigureStateSync-bearing plan, which is + // built for any snapshot bootstrap (stateSync or s3 — both apply via CometBFT + // state-sync and need rpc-server witnesses). Always-present once reconciled. + // True means canonical syncers are configured and the plan may proceed; False + // fails closed (no such plan built, and peers are never used as witnesses). It + // is a configured-count gate: witness reliability comes from curating the + // canonical-syncer set, and the sidecar establishes the trust point from them + // as it does today. ConditionStateSyncReady = "StateSyncReady" ) // Reasons for the StateSyncReady condition. const ( - // ReasonStateSyncReady: state-sync enabled and >=2 canonical syncers are - // configured for the chain; the state-sync-bearing plan may proceed. + // ReasonStateSyncReady: the node bootstraps from a snapshot (stateSync or s3) + // and >=2 canonical syncers are configured for the chain; the + // ConfigureStateSync-bearing plan may proceed. ReasonStateSyncReady = "Ready" - // ReasonStateSyncNoSyncersConfigured: state-sync enabled but the canonical- - // syncer source yields <2 entries for the chain (fail closed). + // ReasonStateSyncNoSyncersConfigured: the node bootstraps from a snapshot but + // the canonical-syncer source yields <2 entries for the chain (fail closed). ReasonStateSyncNoSyncersConfigured = "NoSyncersConfigured" - // ReasonStateSyncNotApplicable: the node does not enable state-sync. + // ReasonStateSyncNotApplicable: the node does not bootstrap from a snapshot + // (e.g. a genesis node), so it carries no ConfigureStateSync task to gate. ReasonStateSyncNotApplicable = "NotApplicable" // ReasonStateSyncSyncerSourceError: reading or parsing the canonical-syncer // source file failed for a reason other than absence (transient). Fails diff --git a/internal/controller/node/plan_execution_test.go b/internal/controller/node/plan_execution_test.go index 0d049562..196c02da 100644 --- a/internal/controller/node/plan_execution_test.go +++ b/internal/controller/node/plan_execution_test.go @@ -147,6 +147,9 @@ func newProgressionReconciler(t *testing.T, mock *mockSidecarClient, objs ...cli }, }, } + // Snapshot-bootstrap nodes pass through the fail-closed StateSyncReady gate; + // wire syncers for every fixture chain so progression tests proceed. + withSyncers(t, r, fixtureSyncers()) return r, c } diff --git a/internal/controller/node/reconciler_test.go b/internal/controller/node/reconciler_test.go index 2653a0b3..92d958e3 100644 --- a/internal/controller/node/reconciler_test.go +++ b/internal/controller/node/reconciler_test.go @@ -70,9 +70,26 @@ func newNodeReconcilerWithSidecar(t *testing.T, mock *mockSidecarClient, objs .. }, }, } + // Wire canonical syncers for every fixture chain so snapshot-bootstrap nodes + // (s3 and stateSync both carry ConfigureStateSync and pass through the + // fail-closed StateSyncReady gate). Tests asserting fail-closed override this + // via withSyncers/writeSyncerFile. + withSyncers(t, r, fixtureSyncers()) return r, c } +// fixtureSyncers returns a >=2-syncer set for every chain used by node test +// fixtures, so a snapshot-bootstrap node clears the StateSyncReady gate +// regardless of which fixture chain it carries. +func fixtureSyncers() map[string][]string { + chains := []string{defaultTestChainID, testChainID, atlantic2ChainID, pacific1ChainID, "test"} + out := make(map[string][]string, len(chains)) + for _, ch := range chains { + out[ch] = []string{syncerA, syncerB} + } + return out +} + func nodeReqFor(name, namespace string) ctrl.Request { //nolint:unparam // test helper designed for reuse return ctrl.Request{NamespacedName: types.NamespacedName{Name: name, Namespace: namespace}} } @@ -89,6 +106,11 @@ func getSeiNode(t *testing.T, ctx context.Context, c client.Client, name, namesp const ( testImageV2 = "ghcr.io/sei-protocol/seid:v2.0.0" testRevision = "rev-2" + // defaultTestChainID must match the chainID the snapshot/genesis fixtures + // hardcode in testhelpers_test.go (not enforced by the compiler). + defaultTestChainID = "sei-test" + atlantic2ChainID = "atlantic-2" + pacific1ChainID = "pacific-1" ) func TestNodeReconcile_NotFound(t *testing.T) { diff --git a/internal/controller/node/statesync.go b/internal/controller/node/statesync.go index a51cdfe6..4e4b86c1 100644 --- a/internal/controller/node/statesync.go +++ b/internal/controller/node/statesync.go @@ -25,6 +25,9 @@ const minCanonicalSyncers = 2 // the caller's single status patch flushes it. Run before the Failed/Paused // early-returns so StateSyncReady is seeded on every path. // +// It resolves for any snapshot-bootstrap node (see planner.needsStateSyncWitnesses +// for why witnesses are required); a genesis node (snap == nil) is NotApplicable. +// // Fail-closed is enforced downstream: the planner declines to build a state-sync // plan whenever StateSyncReady is not True (see ResolvePlan), so this method only // resolves the condition. It returns blocked=true whenever state-sync is enabled @@ -34,11 +37,12 @@ const minCanonicalSyncers = 2 // and unblocks once GitOps provisions or fixes the syncers. func (r *SeiNodeReconciler) reconcileStateSyncGate(node *seiv1alpha1.SeiNode) (blocked bool) { snap := node.Spec.SnapshotSource() - if snap == nil || snap.StateSync == nil { - // State-sync disabled: no state-sync task in the plan to gate. + if snap == nil { + // No snapshot source (e.g. genesis validator): no ConfigureStateSync task + // in the plan to gate. node.Status.ResolvedStateSyncers = nil setStateSyncReady(node, metav1.ConditionFalse, seiv1alpha1.ReasonStateSyncNotApplicable, - "node does not enable state sync") + "node does not bootstrap from a snapshot") return false } diff --git a/internal/controller/node/statesync_test.go b/internal/controller/node/statesync_test.go index 72c1de9c..8e02162a 100644 --- a/internal/controller/node/statesync_test.go +++ b/internal/controller/node/statesync_test.go @@ -196,32 +196,56 @@ func TestStateSyncGate_UnconfiguredSource_FailsClosed(t *testing.T) { g := NewWithT(t) node := stateSyncNode("n", testChainID) - r, _ := newNodeReconciler(t, node) // Platform leaves ControllerConfigFile empty. + r, _ := newNodeReconciler(t, node) + r.Platform.ControllerConfigFile = "" // unconfigured source blocked := r.reconcileStateSyncGate(node) g.Expect(blocked).To(BeTrue()) g.Expect(stateSyncCondition(node).Reason).To(Equal(seiv1alpha1.ReasonStateSyncNoSyncersConfigured)) } -func TestStateSyncGate_Disabled_NotApplicable(t *testing.T) { +// An s3-restore node applies its snapshot via CometBFT state-sync +// (use-local-snapshot), so it ALSO needs canonical rpc-server witnesses to +// verify the trust point. The gate must resolve syncers for it (Ready when >=2 +// are configured) rather than declaring NotApplicable — the latter left +// ResolvedStateSyncers nil and the sidecar fell back to unreachable peers. +func TestStateSyncGate_S3Restore_ResolvesSyncers(t *testing.T) { g := NewWithT(t) - // S3 snapshot node: state sync not enabled. - node := newSnapshotNode("n", testNamespace) + node := newSnapshotNode("n", testNamespace) // s3 source, ChainID "sei-test" r, _ := newNodeReconciler(t, node) - withSyncers(t, r, map[string][]string{testChainID: {syncerA, syncerB}}) + withSyncers(t, r, map[string][]string{node.Spec.ChainID: {syncerA, syncerB}}) blocked := r.reconcileStateSyncGate(node) g.Expect(blocked).To(BeFalse()) + cond := stateSyncCondition(node) + g.Expect(cond.Status).To(Equal(metav1.ConditionTrue)) + g.Expect(cond.Reason).To(Equal(seiv1alpha1.ReasonStateSyncReady)) + g.Expect(node.Status.ResolvedStateSyncers).To(Equal([]string{syncerA, syncerB})) +} + +// An s3-restore node with <2 configured syncers must fail closed — same as a +// stateSync node. This is the bug being fixed: previously it was NotApplicable +// and proceeded witness-less. +func TestStateSyncGate_S3Restore_OneSyncer_FailsClosed(t *testing.T) { + g := NewWithT(t) + + node := newSnapshotNode("n", testNamespace) + r, _ := newNodeReconciler(t, node) + withSyncers(t, r, map[string][]string{node.Spec.ChainID: {syncerSingle}}) + + blocked := r.reconcileStateSyncGate(node) + g.Expect(blocked).To(BeTrue()) + cond := stateSyncCondition(node) g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) - g.Expect(cond.Reason).To(Equal(seiv1alpha1.ReasonStateSyncNotApplicable)) + g.Expect(cond.Reason).To(Equal(seiv1alpha1.ReasonStateSyncNoSyncersConfigured)) g.Expect(node.Status.ResolvedStateSyncers).To(BeNil()) } -// A node with no snapshot source at all (e.g. genesis validator) is treated as -// state-sync-disabled: NotApplicable, never blocks the plan. +// A node with no snapshot source at all (e.g. genesis validator) carries no +// ConfigureStateSync task: NotApplicable, never blocks the plan. func TestStateSyncGate_NoSnapshotSource_NotApplicable(t *testing.T) { g := NewWithT(t) @@ -248,19 +272,21 @@ func TestStateSyncGate_FailClosedClearsStaleSyncers(t *testing.T) { node := stateSyncNode("n", testChainID) node.Status.ResolvedStateSyncers = []string{"old-a:26657", "old-b:26657"} - r, _ := newNodeReconciler(t, node) // unconfigured source → fail closed + r, _ := newNodeReconciler(t, node) + r.Platform.ControllerConfigFile = "" // unconfigured source → fail closed blocked := r.reconcileStateSyncGate(node) g.Expect(blocked).To(BeTrue()) g.Expect(node.Status.ResolvedStateSyncers).To(BeNil()) } -// The gate must not block (or even set ResolvedStateSyncers on) a non-state-sync -// node — regression guard for the full reconcile path. -func TestStateSyncGate_NonStateSyncNodeUnaffected(t *testing.T) { +// The gate must not block (or even set ResolvedStateSyncers on) a node that +// carries no ConfigureStateSync task — a genesis node with no snapshot source. +// Regression guard for the full reconcile path. +func TestStateSyncGate_GenesisNodeUnaffected(t *testing.T) { g := NewWithT(t) - node := newSnapshotNode("n", testNamespace) + node := newGenesisNode("n", testNamespace) r, _ := newNodeReconciler(t, node) blocked := r.reconcileStateSyncGate(node) @@ -365,8 +391,8 @@ func TestReconcile_PausedNode_StateSyncReadyStillSeeded(t *testing.T) { wantReason: seiv1alpha1.ReasonStateSyncNoSyncersConfigured, }, { - name: "state-sync disabled", - node: newSnapshotNode("paused-s3", testNamespace), + name: "no snapshot source", + node: newGenesisNode("paused-gen", testNamespace), withSyncers: false, wantReason: seiv1alpha1.ReasonStateSyncNotApplicable, }, diff --git a/internal/planner/doc.go b/internal/planner/doc.go index 531fe3b5..72081a90 100644 --- a/internal/planner/doc.go +++ b/internal/planner/doc.go @@ -94,6 +94,13 @@ // - Conditions are always-present: condition state is expressed as // True/False/Unknown with a stable Reason, never by removal (see // CLAUDE.md "Conditions"). isConditionTrue asserts on Status, not presence. +// - State-sync witnesses gate every snapshot bootstrap: needsStateSyncWitnesses +// (snap != nil) drives both the ConfigureStateSync task insertion +// (buildSidecarProgression) and the fail-closed plan blocker +// (stateSyncBlocksPlan), so a node bootstrapping from a snapshot — s3 or +// stateSync — can never plan ConfigureStateSync without >=2 resolved +// canonical-syncer witnesses. Genesis (snap == nil) carries no such task. +// Guarded by TestStateSyncGate_S3Restore_OneSyncer_FailsClosed. // // # Zero-Value & Sentinel Semantics // diff --git a/internal/planner/planner.go b/internal/planner/planner.go index 13b6c392..d1f32ba8 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -186,17 +186,19 @@ func (p *NodeResolver) ResolvePlan(ctx context.Context, node *seiv1alpha1.SeiNod // stateSyncBlocksPlan reports whether the fail-closed state-sync gate must // suppress plan construction this reconcile. It fires only on the init path -// (pre-Running), which is the only path that builds a state-sync-bearing plan -// via buildSidecarProgression — a Running node's update plans carry no +// (pre-Running), which is the only path that builds a ConfigureStateSync-bearing +// plan via buildSidecarProgression — a Running node's update plans carry no // state-sync task, so an image roll must not be blocked by a syncer-source -// blip. The gate trips when state-sync is enabled and the controller-resolved -// StateSyncReady condition is not True (NoSyncersConfigured or the transient -// SyncerSourceError). A missing condition (not yet resolved) does not gate. +// blip. The gate trips when the node carries a ConfigureStateSync task (any +// snapshot source — both stateSync and s3-restore apply via CometBFT state-sync +// and need rpc-server witnesses) and the controller-resolved StateSyncReady +// condition is not True (NoSyncersConfigured or the transient SyncerSourceError). +// A missing condition (not yet resolved) does not gate. func stateSyncBlocksPlan(node *seiv1alpha1.SeiNode) bool { if node.Status.Phase == seiv1alpha1.PhaseRunning { return false } - if !hasStateSync(node.Spec.SnapshotSource()) { + if !needsStateSyncWitnesses(node.Spec.SnapshotSource()) { return false } cond := meta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionStateSyncReady) @@ -342,7 +344,7 @@ func buildSidecarProgression(snap *seiv1alpha1.SnapshotSource) ([]string, error) if prog, err = insertBefore(prog, TaskConfigApply, TaskConfigureGenesis); err != nil { return nil, err } - if snap != nil { + if needsStateSyncWitnesses(snap) { if prog, err = insertBefore(prog, TaskConfigValidate, TaskConfigureStateSync); err != nil { return nil, err } @@ -477,6 +479,21 @@ func hasStateSync(snap *seiv1alpha1.SnapshotSource) bool { return snap != nil && snap.StateSync != nil } +// needsStateSyncWitnesses reports whether the node's plan carries a +// ConfigureStateSync task and therefore requires canonical rpc-server witnesses. +// Both bootstrap-from-snapshot modes apply the snapshot through CometBFT +// state-sync — stateSync fetches chunks from peers; s3 stages a snapshot under +// data/snapshots and applies it with use-local-snapshot=true — and both verify +// the trust point against >=2 reachable rpc-servers. A genesis node +// (snap == nil) carries no such task and needs no witnesses. +// +// This is the single predicate behind both the ConfigureStateSync insertion +// (buildSidecarProgression) and the fail-closed gate (stateSyncBlocksPlan), +// keeping plan-insertion and witness-resolution in lockstep. +func needsStateSyncWitnesses(snap *seiv1alpha1.SnapshotSource) bool { + return snap != nil +} + func bootstrapMode(snap *seiv1alpha1.SnapshotSource) string { if hasS3Snapshot(snap) { return "snapshot"