From 2f4bf7c3e6a78c9233ed2c4ce64362189f3b46f4 Mon Sep 17 00:00:00 2001 From: Wen Date: Fri, 15 May 2026 14:26:40 -0700 Subject: [PATCH 1/7] data: Store interface for persistent block + FullCommitQC backing (CON-272) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Interface only — no implementation, no consumer rewiring. Captures the contract we want from a database that will replace the file-WAL-based DataWAL.Blocks + DataWAL.CommitQCs and add a by-hash block index. Surface: - WriteBlock(n, *types.Block) - WriteQC(*types.FullCommitQC) // qc carries its GlobalRange - PruneBefore(n) - Flush - ReadAll() → Loaded{Blocks, QCs} - ReadBlockByNumber(n) - ReadBlockByHash(hash) - ReadQCByBlockNumber(n) // QC covering block n - Close Godocs spell out: - Concurrency: all methods safe for concurrent use - Crash safety: each Write is atomic; cross-write atomicity is the caller's problem (DataWAL.reconcile-style) - Read-your-writes within a session - Contiguity guarantee for QC writes (caller-guaranteed, not implementation-enforced) - Why n is required on WriteBlock (Block does not carry GlobalBlockNumber; only per-lane BlockNumber) - Why no separate hash arg on WriteBlock (derivable via block.Header().Hash()) - Read methods are non-blocking; "not yet written" reports as (nil, false, nil) — wait semantics live above the interface Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/autobahn/data/store.go | 198 ++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 sei-tendermint/internal/autobahn/data/store.go diff --git a/sei-tendermint/internal/autobahn/data/store.go b/sei-tendermint/internal/autobahn/data/store.go new file mode 100644 index 0000000000..f88d9148dc --- /dev/null +++ b/sei-tendermint/internal/autobahn/data/store.go @@ -0,0 +1,198 @@ +package data + +import ( + "context" + + "github.com/sei-protocol/sei-chain/sei-tendermint/internal/autobahn/types" +) + +// Store is the durable backing store for data.State. It persists the two +// kinds of finalized records the consensus state machine produces — +// finalized blocks (indexed by GlobalBlockNumber and by header hash) and +// FullCommitQCs (each covering a contiguous range of GlobalBlockNumbers) — +// and provides the read API needed for crash recovery and runtime lookups. +// +// Replaces the WAL-based DataWAL used today; the contract here is a +// superset of what DataWAL.Blocks + DataWAL.CommitQCs provide today, plus +// a by-hash block index the WAL does not have. +// +// # Concurrency +// +// All methods are safe for concurrent use. Implementations should expect +// concurrent writes (WriteBlock + WriteQC interleaved from a single +// background persistence loop) and concurrent reads from RPC handlers +// and peer-sync streams. +// +// # Crash safety +// +// Each Write* call is individually atomic: after a crash a reader will +// either see the entire write or none of it; partial writes are not +// possible. Writes are not atomic with respect to one another — a crash +// between two Write calls leaves the earlier one durable and the later +// one absent. Reconciliation of cross-record inconsistencies (e.g. a +// block written without its QC, or vice versa) is the caller's +// responsibility on startup (see DataWAL.reconcile for the rules the +// current WAL uses). +// +// Read-your-writes is provided within a single process session: a Write +// followed by a Read in the same process always observes the Write, +// even if the Write has not yet been flushed to disk. +// +// # Ordering +// +// QCs must be written contiguously — each WriteQC's +// qc.QC().GlobalRange(committee).First must equal the previous WriteQC's +// GlobalRange().Next (the caller is data.State.runPersist, which +// guarantees this). Implementations may validate and reject out-of-order +// writes but need not. +// +// Blocks may be written in any GlobalBlockNumber order; the consumer +// (data.State) writes them in ascending order today but the contract +// does not require it. +type Store interface { + // WriteBlock persists a finalized block at GlobalBlockNumber n. + // + // n is required because *types.Block does NOT carry its + // GlobalBlockNumber — block.Header().BlockNumber() returns the + // per-lane BlockNumber, a different typedef. The lane→global + // mapping lives in the QC's GlobalRange. Implementations must + // record n alongside the block so ReadBlockByNumber can recover it + // and ReadAll can reconstruct (n, *Block) pairs. + // + // The block's hash (block.Header().Hash()) is indexed automatically + // so ReadBlockByHash works after this returns — the caller does not + // supply it separately. + // + // Idempotent on duplicate: a second WriteBlock with the same + // (n, block.Header().Hash()) pair is a no-op. Writing a different + // block at an already-occupied n, or the same block under a + // different n, is a contract violation — implementations are free + // to error or to corrupt state in that case. + // + // This method may return before the write is on disk; callers that + // need crash durability must call Flush. + WriteBlock(ctx context.Context, n types.GlobalBlockNumber, block *types.Block) error + + // WriteQC persists a FullCommitQC. + // + // The QC carries its GlobalRange internally + // (qc.QC().GlobalRange(committee)) — no range argument needed. The + // caller guarantees that successive WriteQC calls form a contiguous + // sequence: each call's First equals the previous call's Next (or + // committee.FirstBlock() for the very first call). Implementations + // may reject out-of-sequence writes but need not. + // + // Idempotent on duplicate: a second WriteQC for a QC with the same + // GlobalRange().First is a no-op. + // + // This method may return before the write is on disk; callers that + // need crash durability must call Flush. + WriteQC(ctx context.Context, qc *types.FullCommitQC) error + + // PruneBefore removes: + // - every block with GlobalBlockNumber < n + // - every QC whose GlobalRange().Next ≤ n (the QC's entire + // covered range is strictly below the retention watermark; a + // QC straddling n stays) + // + // Idempotent: calling with n ≤ the existing retention watermark is + // a no-op. Pruning is permitted to be asynchronous — entries may + // remain readable briefly after PruneBefore returns, but will + // eventually become unreadable. + // + // Callers must ensure no in-flight reader is holding a pointer + // returned from a Read* call for a record being pruned. Pruning a + // record still being processed is undefined. + PruneBefore(ctx context.Context, n types.GlobalBlockNumber) error + + // Flush blocks until all Writes that returned before Flush() was + // called are durable on disk. Calls to Write made concurrently with + // Flush may or may not be durable when Flush returns (but are + // otherwise eventually durable). + // + // Flush is not required for correctness within a single process + // (read-your-writes is always provided); it exists so data.State + // can synchronize with disk for the AppHash-vote durability + // guarantee — AppVotes must only be issued for data that survives + // a crash. + Flush(ctx context.Context) error + + // ReadAll returns a snapshot of all blocks and QCs not yet pruned, + // for startup replay. Intended to be called once at construction by + // data.State.NewState; afterwards the in-memory cursors track + // everything. + // + // Blocks are returned in ascending GlobalBlockNumber order, QCs in + // ascending GlobalRange().First order. The two slices are + // independent — there is no required alignment between them + // (DataWAL.reconcile handles cross-WAL drift; the same logic will + // run over Loaded). + // + // May allocate proportional to retention. For typical Sei retention + // windows this is fine; if a future implementation expects + // orders-of-magnitude larger retention, consider switching to an + // iterator API before adopting it. + ReadAll(ctx context.Context) (*Loaded, error) + + // ReadBlockByNumber returns the block at GlobalBlockNumber n. + // + // Returns (nil, false, nil) if no block has been written at n, or + // the block at n has been pruned. Implementations must not block + // waiting for a future write — "not yet written" is reported as + // (nil, false, nil) identical to "never written". Blocking + // semantics (wait for a write at n) live above this interface, in + // data.State. + ReadBlockByNumber(ctx context.Context, n types.GlobalBlockNumber) (*types.Block, bool, error) + + // ReadBlockByHash returns the block whose header hashes to the + // given value. The hash is the same value as block.Header().Hash() + // for the block that was passed to WriteBlock. + // + // Returns (nil, false, nil) if no such block has been written, or + // it has been pruned. Like ReadBlockByNumber, this is non-blocking. + ReadBlockByHash(ctx context.Context, hash types.BlockHeaderHash) (*types.Block, bool, error) + + // ReadQCByBlockNumber returns the FullCommitQC whose + // GlobalRange().First ≤ n < GlobalRange().Next — i.e. the QC that + // finalizes the block at n. Because a single QC covers multiple + // blocks, the same *FullCommitQC is returned for every n in its + // range. + // + // Returns (nil, false, nil) if no QC has been written that covers + // n yet, or n is below the retention watermark. Non-blocking. + ReadQCByBlockNumber(ctx context.Context, n types.GlobalBlockNumber) (*types.FullCommitQC, bool, error) + + // Close releases resources held by the store. Any in-flight writes + // are flushed to disk before Close returns. After Close returns, no + // other method may be called on the Store; doing so is undefined. + Close(ctx context.Context) error +} + +// Loaded is the result of Store.ReadAll — a point-in-time view of every +// block and QC not yet pruned, used by data.State.NewState to rebuild +// in-memory state at startup. +type Loaded struct { + // Blocks is the set of persisted blocks in ascending + // GlobalBlockNumber order. Each entry pairs the block with the + // GlobalBlockNumber it was written at; the Block type does not + // carry a global number on its own — its header carries a per-lane + // BlockNumber, which is distinct. + Blocks []BlockEntry + + // QCs is the set of persisted FullCommitQCs in ascending + // GlobalRange().First order. Each QC covers a contiguous range + // [First, Next); successive entries' ranges are contiguous (no + // gaps), but the first QC's First is not required to equal + // committee.FirstBlock() — entries with GlobalRange().Next at or + // below the retention watermark have been pruned. + QCs []*types.FullCommitQC +} + +// BlockEntry pairs a finalized block with its GlobalBlockNumber. The +// GlobalBlockNumber is the index data.State uses to address the block; +// it is not stored inside *types.Block itself, so the Store records it +// alongside on WriteBlock and returns it on ReadAll. +type BlockEntry struct { + Number types.GlobalBlockNumber + Block *types.Block +} From 13bbeb39bb578beaaced07065081204c124b8e0b Mon Sep 17 00:00:00 2001 From: Wen Date: Fri, 15 May 2026 14:29:37 -0700 Subject: [PATCH 2/7] =?UTF-8?q?data:=20drop=20Flush()=20=E2=80=94=20Writes?= =?UTF-8?q?=20are=20synchronous-on-durability=20instead=20(CON-272)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removing Flush from the Store interface. The existing FullCommitQCPersister and GlobalBlockPersister don't have Flush either; data.State.runPersist already relies on PersistQC/PersistBlock returning only after the write is on disk in order to advance nextBlockToPersist (which gates PushAppHash → AppVote durability). Codify that directly: WriteBlock and WriteQC return only after the record is durable. Implementations that want to batch fsyncs internally can do so, but the individual Write call still blocks until the batch covering it has been committed. Smaller interface, no semantic change vs. the WAL it replaces. If a future implementation needs an async fast-path with a separate Flush, we can add it then with a real use case to design against. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/autobahn/data/store.go | 55 +++++++++---------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/sei-tendermint/internal/autobahn/data/store.go b/sei-tendermint/internal/autobahn/data/store.go index f88d9148dc..654e2600b8 100644 --- a/sei-tendermint/internal/autobahn/data/store.go +++ b/sei-tendermint/internal/autobahn/data/store.go @@ -25,18 +25,25 @@ import ( // // # Crash safety // -// Each Write* call is individually atomic: after a crash a reader will -// either see the entire write or none of it; partial writes are not -// possible. Writes are not atomic with respect to one another — a crash -// between two Write calls leaves the earlier one durable and the later -// one absent. Reconciliation of cross-record inconsistencies (e.g. a -// block written without its QC, or vice versa) is the caller's -// responsibility on startup (see DataWAL.reconcile for the rules the -// current WAL uses). +// Write* methods are synchronous with respect to durability: a Write +// returns only after the record is durable on disk. A reader after a +// crash either sees the entire write (if it returned) or none of it +// (if it had not yet returned); partial writes are not possible. // -// Read-your-writes is provided within a single process session: a Write -// followed by a Read in the same process always observes the Write, -// even if the Write has not yet been flushed to disk. +// Writes are not atomic with respect to one another — a crash between +// two Write calls leaves the earlier one durable and the later one +// absent. Reconciliation of cross-record inconsistencies (e.g. a block +// written without its QC, or vice versa) is the caller's responsibility +// on startup (see DataWAL.reconcile for the rules the current WAL uses). +// +// The synchronous-durability guarantee is what +// data.State.runPersist relies on to advance nextBlockToPersist (and +// thereby unblock PushAppHash → AppVote): once WriteBlock/WriteQC +// return, the data underpinning the next AppVote is on disk. No +// separate Flush method is exposed; if an implementation wants to +// batch and amortize fsyncs internally, it must still block the +// individual Write call until the batch covering it has been +// committed. // // # Ordering // @@ -69,8 +76,8 @@ type Store interface { // different n, is a contract violation — implementations are free // to error or to corrupt state in that case. // - // This method may return before the write is on disk; callers that - // need crash durability must call Flush. + // Returns only after the block is durable on disk. See the Store + // type doc for the synchronous-durability contract. WriteBlock(ctx context.Context, n types.GlobalBlockNumber, block *types.Block) error // WriteQC persists a FullCommitQC. @@ -85,8 +92,8 @@ type Store interface { // Idempotent on duplicate: a second WriteQC for a QC with the same // GlobalRange().First is a no-op. // - // This method may return before the write is on disk; callers that - // need crash durability must call Flush. + // Returns only after the QC is durable on disk. See the Store type + // doc for the synchronous-durability contract. WriteQC(ctx context.Context, qc *types.FullCommitQC) error // PruneBefore removes: @@ -105,18 +112,6 @@ type Store interface { // record still being processed is undefined. PruneBefore(ctx context.Context, n types.GlobalBlockNumber) error - // Flush blocks until all Writes that returned before Flush() was - // called are durable on disk. Calls to Write made concurrently with - // Flush may or may not be durable when Flush returns (but are - // otherwise eventually durable). - // - // Flush is not required for correctness within a single process - // (read-your-writes is always provided); it exists so data.State - // can synchronize with disk for the AppHash-vote durability - // guarantee — AppVotes must only be issued for data that survives - // a crash. - Flush(ctx context.Context) error - // ReadAll returns a snapshot of all blocks and QCs not yet pruned, // for startup replay. Intended to be called once at construction by // data.State.NewState; afterwards the in-memory cursors track @@ -162,9 +157,9 @@ type Store interface { // n yet, or n is below the retention watermark. Non-blocking. ReadQCByBlockNumber(ctx context.Context, n types.GlobalBlockNumber) (*types.FullCommitQC, bool, error) - // Close releases resources held by the store. Any in-flight writes - // are flushed to disk before Close returns. After Close returns, no - // other method may be called on the Store; doing so is undefined. + // Close releases resources held by the store. After Close returns, + // no other method may be called on the Store; doing so is + // undefined. Close(ctx context.Context) error } From 480f6a67ddd5bf5bbe4b620fdbfc4a46568cd8b8 Mon Sep 17 00:00:00 2001 From: Wen Date: Fri, 15 May 2026 14:33:01 -0700 Subject: [PATCH 3/7] data: trim residual Flush mention from Store doc Drop the "No separate Flush method is exposed; if an implementation wants to..." sentence from the type comment. The synchronous-durability contract above already covers the substance. Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-tendermint/internal/autobahn/data/store.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sei-tendermint/internal/autobahn/data/store.go b/sei-tendermint/internal/autobahn/data/store.go index 654e2600b8..9d14f4b7bc 100644 --- a/sei-tendermint/internal/autobahn/data/store.go +++ b/sei-tendermint/internal/autobahn/data/store.go @@ -39,11 +39,9 @@ import ( // The synchronous-durability guarantee is what // data.State.runPersist relies on to advance nextBlockToPersist (and // thereby unblock PushAppHash → AppVote): once WriteBlock/WriteQC -// return, the data underpinning the next AppVote is on disk. No -// separate Flush method is exposed; if an implementation wants to -// batch and amortize fsyncs internally, it must still block the -// individual Write call until the batch covering it has been -// committed. +// return, the data underpinning the next AppVote is on disk. +// Implementations that batch fsyncs internally must still block the +// individual Write call until the batch covering it has been committed. // // # Ordering // From d4bfd4064a93333278a6769a5aade9646e9b2f09 Mon Sep 17 00:00:00 2001 From: Wen Date: Fri, 15 May 2026 14:34:16 -0700 Subject: [PATCH 4/7] data: spell out GlobalRange half-open interval convention in Store doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reframe the "Ordering" section as "Ordering and the GlobalRange convention". Calls out that GlobalRange is [First(), Next()) — First inclusive, Next exclusive — so the QC covers First, First+1, ..., Next-1, and Next is the First of the next contiguous QC. The convention was implicit before (used in ReadQCByBlockNumber's "GlobalRange().First ≤ n < GlobalRange().Next" and Loaded.QCs's "[First, Next)" but never stated outright). Implementers shouldn't have to reverse-engineer it from those scattered references. Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-tendermint/internal/autobahn/data/store.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sei-tendermint/internal/autobahn/data/store.go b/sei-tendermint/internal/autobahn/data/store.go index 9d14f4b7bc..6cbe3ad16d 100644 --- a/sei-tendermint/internal/autobahn/data/store.go +++ b/sei-tendermint/internal/autobahn/data/store.go @@ -43,7 +43,12 @@ import ( // Implementations that batch fsyncs internally must still block the // individual Write call until the batch covering it has been committed. // -// # Ordering +// # Ordering and the GlobalRange convention +// +// A FullCommitQC's GlobalRange is a half-open interval +// [GlobalRange.First(), GlobalRange.Next()) — First inclusive, Next +// exclusive. The QC therefore covers GlobalBlockNumbers First, First+1, +// ..., Next-1, and Next is also the First of the next contiguous QC. // // QCs must be written contiguously — each WriteQC's // qc.QC().GlobalRange(committee).First must equal the previous WriteQC's From 7dcd6bf497bd8684e805e7e1f5a36056734307f6 Mon Sep 17 00:00:00 2001 From: Wen Date: Fri, 15 May 2026 14:36:27 -0700 Subject: [PATCH 5/7] data: spell out what Store does NOT persist (AppHash, exec results) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a "What this does NOT store" section to the Store type doc. Heads off the natural question — "don't we need to persist execution results?" — by walking through why AppHash recovery works without storing it: - App.Info().LastBlockAppHash on restart gives us the AppHash for the last committed height (lives in the app's CMS, not in data.State or DataWAL) - Heights above that are re-executed from replayed blocks + re-derived AppHashes - GlobalBlock.FinalAppState comes from qc.Proposal().App(), extracted from the persisted QC — no separate record needed Per-tx execution results / logs / events live on the receipt store (canonical txHash → execution result, per the Giga Transaction Query proposal). Store stays scoped to blocks + QCs. Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-tendermint/internal/autobahn/data/store.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/sei-tendermint/internal/autobahn/data/store.go b/sei-tendermint/internal/autobahn/data/store.go index 6cbe3ad16d..bab433aa9e 100644 --- a/sei-tendermint/internal/autobahn/data/store.go +++ b/sei-tendermint/internal/autobahn/data/store.go @@ -16,6 +16,24 @@ import ( // superset of what DataWAL.Blocks + DataWAL.CommitQCs provide today, plus // a by-hash block index the WAL does not have. // +// # What this does NOT store +// +// Deliberately scoped to blocks + QCs. In particular: +// +// - AppHash (per-height execution result). On restart the app reports +// its last committed (height, AppHash) via app.Info().LastBlockAppHash; +// runExecute injects that back via PushAppHash, and AppHashes for +// heights above the app's last committed height are regenerated by +// re-executing the replayed blocks. data.State.inner.appProposals is +// in-memory only. +// - Per-tx execution results, logs, events. Those live in the receipt +// store (canonical txHash → execution result lookup, unified for +// EVM and Cosmos txs per the Giga Transaction Query proposal). +// +// GlobalBlock.FinalAppState is extracted from qc.Proposal().App() on +// read — derivable from the persisted QC, no separate AppHash record +// needed. +// // # Concurrency // // All methods are safe for concurrent use. Implementations should expect From ccc3121764f47e4ab4a860a0c917cc007fbc9ebc Mon Sep 17 00:00:00 2001 From: Wen Date: Fri, 15 May 2026 14:37:29 -0700 Subject: [PATCH 6/7] data: switch Store reads from (*T, bool, error) to (utils.Option[*T], error) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per reviewer preference for utils.Option over Go's (value, bool) tuple pattern. The autobahn data package can freely import sei-tendermint/libs/utils, so no layering concern (unlike sei-db, where Transaction.Result() stayed on (bytes, bool) because sei-db can't see the Option type). Changed: - ReadBlockByNumber(ctx, n) → (utils.Option[*types.Block], error) - ReadBlockByHash(ctx, hash) → (utils.Option[*types.Block], error) - ReadQCByBlockNumber(ctx, n) → (utils.Option[*types.FullCommitQC], error) Updated doc comments to refer to utils.None where they previously said (nil, false, nil). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/autobahn/data/store.go | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/sei-tendermint/internal/autobahn/data/store.go b/sei-tendermint/internal/autobahn/data/store.go index bab433aa9e..1aa8d2bf67 100644 --- a/sei-tendermint/internal/autobahn/data/store.go +++ b/sei-tendermint/internal/autobahn/data/store.go @@ -4,6 +4,7 @@ import ( "context" "github.com/sei-protocol/sei-chain/sei-tendermint/internal/autobahn/types" + "github.com/sei-protocol/sei-chain/sei-tendermint/libs/utils" ) // Store is the durable backing store for data.State. It persists the two @@ -152,21 +153,21 @@ type Store interface { // ReadBlockByNumber returns the block at GlobalBlockNumber n. // - // Returns (nil, false, nil) if no block has been written at n, or - // the block at n has been pruned. Implementations must not block + // Returns utils.None if no block has been written at n, or the + // block at n has been pruned. Implementations must not block // waiting for a future write — "not yet written" is reported as - // (nil, false, nil) identical to "never written". Blocking - // semantics (wait for a write at n) live above this interface, in + // utils.None identical to "never written". Blocking semantics + // (wait for a write at n) live above this interface, in // data.State. - ReadBlockByNumber(ctx context.Context, n types.GlobalBlockNumber) (*types.Block, bool, error) + ReadBlockByNumber(ctx context.Context, n types.GlobalBlockNumber) (utils.Option[*types.Block], error) // ReadBlockByHash returns the block whose header hashes to the // given value. The hash is the same value as block.Header().Hash() // for the block that was passed to WriteBlock. // - // Returns (nil, false, nil) if no such block has been written, or - // it has been pruned. Like ReadBlockByNumber, this is non-blocking. - ReadBlockByHash(ctx context.Context, hash types.BlockHeaderHash) (*types.Block, bool, error) + // Returns utils.None if no such block has been written, or it has + // been pruned. Like ReadBlockByNumber, this is non-blocking. + ReadBlockByHash(ctx context.Context, hash types.BlockHeaderHash) (utils.Option[*types.Block], error) // ReadQCByBlockNumber returns the FullCommitQC whose // GlobalRange().First ≤ n < GlobalRange().Next — i.e. the QC that @@ -174,9 +175,9 @@ type Store interface { // blocks, the same *FullCommitQC is returned for every n in its // range. // - // Returns (nil, false, nil) if no QC has been written that covers - // n yet, or n is below the retention watermark. Non-blocking. - ReadQCByBlockNumber(ctx context.Context, n types.GlobalBlockNumber) (*types.FullCommitQC, bool, error) + // Returns utils.None if no QC has been written that covers n yet, + // or n is below the retention watermark. Non-blocking. + ReadQCByBlockNumber(ctx context.Context, n types.GlobalBlockNumber) (utils.Option[*types.FullCommitQC], error) // Close releases resources held by the store. After Close returns, // no other method may be called on the Store; doing so is From f295435087f82b245631100408a8f3e2409f9d7d Mon Sep 17 00:00:00 2001 From: Wen Date: Tue, 19 May 2026 11:28:27 -0700 Subject: [PATCH 7/7] =?UTF-8?q?data:=20bring=20Flush()=20back=20=E2=80=94?= =?UTF-8?q?=20two-phase=20write/flush=20for=20batched=20fsync=20(CON-272)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer note (LittDB perspective): synchronous-fsync-per-Write is real disk bandwidth. At dozens of blocks/sec the per-record fsync starts crowding out useful I/O, regardless of which DB sits underneath. Pattern that works better is two-phase: - WriteBlock / WriteQC return without a durability guarantee - Caller batches what it wants made durable together - Flush once at the batch boundary Implementation is still free to start writing as records arrive — so this batches better than the alternative of buffering until a batch is "closed." Reworked the Store doc to spell out: - The two-phase write/flush contract - The expected runPersist pattern (drain queue, write, flush, then advance nextBlockToPersist → gates AppVote) - Cross-write atomicity is still the caller's problem (Flush gives "everything before this is durable," not "everything before this is atomic") - Read-your-writes within a session is independent of Flush - Implementations should still write eagerly without a Flush — Flush is "wait until durable," not "tell the impl to start writing" WriteBlock and WriteQC docs now point at the two-phase contract and explain the runPersist → nextBlockToPersist → AppVote durability chain. Flush itself has a doc that walks through the same pattern. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/autobahn/data/store.go | 85 ++++++++++++++----- 1 file changed, 62 insertions(+), 23 deletions(-) diff --git a/sei-tendermint/internal/autobahn/data/store.go b/sei-tendermint/internal/autobahn/data/store.go index 1aa8d2bf67..1e2174ad86 100644 --- a/sei-tendermint/internal/autobahn/data/store.go +++ b/sei-tendermint/internal/autobahn/data/store.go @@ -42,25 +42,43 @@ import ( // background persistence loop) and concurrent reads from RPC handlers // and peer-sync streams. // -// # Crash safety -// -// Write* methods are synchronous with respect to durability: a Write -// returns only after the record is durable on disk. A reader after a -// crash either sees the entire write (if it returned) or none of it -// (if it had not yet returned); partial writes are not possible. -// -// Writes are not atomic with respect to one another — a crash between -// two Write calls leaves the earlier one durable and the later one -// absent. Reconciliation of cross-record inconsistencies (e.g. a block -// written without its QC, or vice versa) is the caller's responsibility -// on startup (see DataWAL.reconcile for the rules the current WAL uses). -// -// The synchronous-durability guarantee is what -// data.State.runPersist relies on to advance nextBlockToPersist (and -// thereby unblock PushAppHash → AppVote): once WriteBlock/WriteQC -// return, the data underpinning the next AppVote is on disk. -// Implementations that batch fsyncs internally must still block the -// individual Write call until the batch covering it has been committed. +// # Durability and crash safety +// +// Writes are two-phase: WriteBlock and WriteQC return without +// guaranteeing the record is on disk. Flush blocks until all +// previously-returned Writes are durable. A Write+Flush pair is +// individually atomic on disk — after a crash a reader either sees +// the full record (if Flush returned before the crash) or none of it +// (otherwise); partial writes are not possible. +// +// The two-phase shape is intentional: a disk-backed implementation +// needs an fsync per durability boundary, and fsync-per-Write at +// dozens of blocks/sec is real disk bandwidth. The typical pattern +// is "write a batch of records, then Flush once" — e.g. +// runPersist drains every block + QC currently queued for +// persistence, writes them all, calls Flush, and only then advances +// nextBlockToPersist. The implementation is free to begin writing as +// soon as the first record arrives (so this batches better than +// blocking on a closed input), and the consumer never pays an fsync +// per record. +// +// Writes are not atomic with respect to one another even within a +// single Flush — a crash between two Writes (or mid-Flush) leaves +// some records on disk and others not. Reconciliation of +// cross-record inconsistencies (e.g. a block written without its QC, +// or vice versa) is the caller's responsibility on startup (see +// DataWAL.reconcile for the rules the current WAL uses). +// +// Read-your-writes is provided within a single session regardless of +// Flush — a Write followed by a Read in the same process always +// observes the Write. Flush is about disk durability, not in-process +// visibility. +// +// Implementations are required to make sure that durability happens +// reasonably eagerly even without an explicit Flush — a node that +// stops calling Flush should still see its writes eventually land on +// disk. Flush is "wait until durable now," not "tell the +// implementation to start writing." // // # Ordering and the GlobalRange convention // @@ -98,8 +116,11 @@ type Store interface { // different n, is a contract violation — implementations are free // to error or to corrupt state in that case. // - // Returns only after the block is durable on disk. See the Store - // type doc for the synchronous-durability contract. + // May return before the block is on disk. Callers that need crash + // durability before some external observable action (e.g. + // runPersist advancing nextBlockToPersist, which gates the + // AppVote runExecute issues) must call Flush. See the Store type + // doc for the two-phase write/flush contract. WriteBlock(ctx context.Context, n types.GlobalBlockNumber, block *types.Block) error // WriteQC persists a FullCommitQC. @@ -114,8 +135,9 @@ type Store interface { // Idempotent on duplicate: a second WriteQC for a QC with the same // GlobalRange().First is a no-op. // - // Returns only after the QC is durable on disk. See the Store type - // doc for the synchronous-durability contract. + // May return before the QC is on disk. See the Store type doc for + // the two-phase write/flush contract and WriteBlock for the + // rationale. WriteQC(ctx context.Context, qc *types.FullCommitQC) error // PruneBefore removes: @@ -134,6 +156,23 @@ type Store interface { // record still being processed is undefined. PruneBefore(ctx context.Context, n types.GlobalBlockNumber) error + // Flush blocks until every Write that has returned before Flush is + // called is durable on disk. Writes made concurrently with Flush + // may or may not be durable when Flush returns (but are otherwise + // eventually durable — implementations write to disk on their own + // schedule even without an explicit Flush). + // + // The expected pattern is "write a batch of records, then Flush + // once" rather than "Flush after every Write." The implementation + // is free to begin writing as records arrive, so this still + // batches well even when the caller doesn't pre-buffer. + // + // data.State.runPersist will use this: drain every block + QC + // queued for persistence, write them all, call Flush, then + // advance nextBlockToPersist (the watermark gating AppVote + // issuance). + Flush(ctx context.Context) error + // ReadAll returns a snapshot of all blocks and QCs not yet pruned, // for startup replay. Intended to be called once at construction by // data.State.NewState; afterwards the in-memory cursors track