From 2f6cc49578918731c566fa7a844e062853ea4144 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Tue, 23 Jun 2026 22:18:01 -0700 Subject: [PATCH 01/15] first cut at mini_allocator-based prefetcher Signed-off-by: Rob Johnson --- include/splinterdb/splinterdb.h | 6 + src/btree.c | 218 ++++++++++++++++++++++++++---- src/btree.h | 31 +++++ src/btree_private.h | 5 + src/core.c | 61 ++++++++- src/core.h | 10 ++ src/mini_allocator.c | 179 ++++++++++++++++++++++-- src/mini_allocator.h | 61 +++++++++ src/splinterdb.c | 4 + tests/config.c | 4 + tests/config.h | 3 + tests/functional/btree_test.c | 22 ++- tests/functional/scan_benchmark.c | 1 + tests/functional/test.h | 1 + tests/unit/btree_test.c | 4 +- 15 files changed, 562 insertions(+), 48 deletions(-) diff --git a/include/splinterdb/splinterdb.h b/include/splinterdb/splinterdb.h index fdb35d4e..cd0f5a18 100644 --- a/include/splinterdb/splinterdb.h +++ b/include/splinterdb/splinterdb.h @@ -146,6 +146,12 @@ typedef struct splinterdb_config { // work to be performed on foreground threads, increasing tail // latencies. uint64 queue_scale_percent; + + // Total bytes of extent read-ahead a range scan keeps in flight, divided + // across the branches it merges. Roughly the storage's bandwidth-delay + // product (bandwidth x latency); raise it for higher-latency devices such + // as networked/cloud volumes. Zero selects a default suited to local SSDs. + uint64 prefetch_budget; } splinterdb_config; /////////////////////////////////////// diff --git a/src/btree.c b/src/btree.c index 5209bcbc..235e1e91 100644 --- a/src/btree.c +++ b/src/btree.c @@ -2841,6 +2841,126 @@ btree_iterator_find_end_addr_async(btree_iterator_async_state *state, async_return(state); } +/* + * ---------------------------------------------------------------------------- + * btree_prefetch_cursor -- + * + * Forward extent-prefetcher for a btree_iterator. 
It reads extent addresses + * ahead of the iterator from the branch's mini_allocator (via a + * mini_meta_cursor) and issues cache_prefetch for them, keeping ~lookahead + * leaf extents of IO in flight. Within the iterator's level (batch), the + * extents are in key order, so the cursor advances in lockstep with + * consumption. Internal-node extents are skipped; blob extents are + * prefetched for height-0 scans. See btree_prefetch_cursor in btree.h. + * ---------------------------------------------------------------------------- + */ +static inline uint64 +btree_extent_base_addr(cache *cc, uint64 addr) +{ + allocator *al = cache_get_allocator(cc); + return allocator_config_extent_base_addr(allocator_get_config(al), addr); +} + +/* + * Issue prefetches until ~lookahead leaf extents are in flight, or we reach the + * iterator's end extent or the end of the extent stream. Leaf extents count + * toward the lookahead; blob extents in the window are prefetched but not + * counted; internal-node extents are skipped. + */ +static void +btree_prefetch_cursor_fill(btree_iterator *itor) +{ + btree_prefetch_cursor *pf = &itor->prefetch; + while (!pf->at_end && pf->prefetched_ahead < pf->lookahead) { + uint64 extent_addr; + uint64 batch; + if (!mini_meta_cursor_next(&pf->meta_cursor, &extent_addr, &batch)) { + pf->at_end = TRUE; + break; + } + if (batch == pf->leaf_batch) { + cache_prefetch(itor->cc, extent_addr, itor->page_type); + pf->prefetched_ahead++; + // Never prefetch past the extent that contains end_addr. + if (btree_addrs_share_extent(itor->cc, extent_addr, itor->end_addr)) { + pf->at_end = TRUE; + } + } else if (pf->prefetch_blobs && batch < NUM_BLOB_BATCHES) { + cache_prefetch(itor->cc, extent_addr, PAGE_TYPE_BLOB); + } + // else: internal-node extent (batch > leaf_batch) -- skip. + } +} + +/* + * (Re)initialize the prefetch cursor at the iterator's current leaf and prime + * it. 
Falls back to disabled (legacy single-extent prefetch) when deep prefetch
+ * does not apply, when the scan ends inside the extent it starts in (there is
+ * nothing legitimate to read ahead), or when the start extent cannot be
+ * located in the meta stream.
+ *
+ * On return, itor->prefetch.enabled tells the caller whether the deep cursor
+ * is active. When it is not, the caller is expected to use the
+ * next_extent_addr path instead.
+ */
+static void
+btree_prefetch_cursor_init(btree_iterator *itor)
+{
+   btree_prefetch_cursor *pf = &itor->prefetch;
+
+   // Reset any previously-active cursor (e.g. on a seek); this releases the
+   // meta page reference the old cursor may still hold.
+   if (pf->enabled) {
+      mini_meta_cursor_deinit(&pf->meta_cursor);
+   }
+   pf->enabled          = FALSE;
+   pf->at_end           = FALSE;
+   pf->prefetched_ahead = 0;
+   pf->leaf_batch       = NUM_BLOB_BATCHES + itor->height;
+   pf->prefetch_blobs   = (itor->height == 0);
+
+   // Deep prefetch applies only to forward scans of finalized branches with a
+   // lookahead of 2+; everything else uses the legacy next_extent_addr path.
+   if (!itor->do_prefetch || pf->lookahead <= 1
+       || itor->page_type != PAGE_TYPE_BRANCH || itor->curr.page == NULL)
+   {
+      return;
+   }
+
+   // If the scan ends inside the extent we start in, there is nothing to read
+   // ahead. Priming the cursor anyway would prefetch up to `lookahead` extents
+   // beyond end_addr: btree_prefetch_cursor_fill() only notices the end extent
+   // when it emits it, and the seek below consumes the start extent's entry.
+   // The legacy path applies the same check, so falling back here issues no
+   // prefetch at all and also avoids reading the meta page.
+   if (btree_addrs_share_extent(itor->cc, itor->curr.addr, itor->end_addr)) {
+      return;
+   }
+
+   uint64 meta_page_addr = itor->curr.hdr->meta_page_addr;
+   if (meta_page_addr == 0) {
+      return; // node predates the stamp, or a tiny tree: fall back to legacy.
+   }
+
+   // Open the extent stream at the meta page stamped into this node, then
+   // advance just past the entry for the extent we are currently reading.
+   mini_meta_cursor_init(
+      &pf->meta_cursor, itor->cc, itor->page_type, meta_page_addr);
+   uint64 start_extent = btree_extent_base_addr(itor->cc, itor->curr.addr);
+   if (!mini_meta_cursor_seek_extent(&pf->meta_cursor, start_extent)) {
+      mini_meta_cursor_deinit(&pf->meta_cursor);
+      return; // couldn't locate our extent; fall back to legacy.
+   }
+   pf->enabled = TRUE;
+   btree_prefetch_cursor_fill(itor);
+}
+
+/*
+ * Called when the iterator crosses forward into a new leaf extent: account for
+ * the consumed extent and refill the lookahead window.
+ *
+ * The counter is only decremented while positive, so a crossing that happens
+ * with nothing outstanding cannot underflow it.
+ */
+static void
+btree_prefetch_cursor_on_boundary(btree_iterator *itor)
+{
+   btree_prefetch_cursor *pf = &itor->prefetch;
+   if (pf->prefetched_ahead > 0) {
+      pf->prefetched_ahead--;
+   }
+   btree_prefetch_cursor_fill(itor);
+}
+
+/* Release the cursor's resources and turn it off.
*/ +static void +btree_prefetch_cursor_deinit(btree_iterator *itor) +{ + btree_prefetch_cursor *pf = &itor->prefetch; + if (pf->enabled) { + mini_meta_cursor_deinit(&pf->meta_cursor); + pf->enabled = FALSE; + } +} + /* * ---------------------------------------------------------------------------- * Move to the next leaf when we've reached the end of one leaf but @@ -2859,16 +2979,18 @@ btree_iterator_next_leaf(btree_iterator *itor) itor->idx = 0; itor->curr_min_idx = -1; - // To prefetch: - // 1. we just moved from one extent to the next - // 2. this can't be the last extent - if (itor->do_prefetch - && !btree_addrs_share_extent(cc, last_addr, itor->curr.addr) - && itor->curr.hdr->next_extent_addr != 0 - && !btree_addrs_share_extent(cc, itor->curr.addr, itor->end_addr)) - { - // IO prefetch the next extent - cache_prefetch(cc, itor->curr.hdr->next_extent_addr, itor->page_type); + // Only act when we just moved from one extent to the next. + if (!btree_addrs_share_extent(cc, last_addr, itor->curr.addr)) { + if (itor->prefetch.enabled) { + // Deep prefetch: refill the lookahead window of leaf extents. + btree_prefetch_cursor_on_boundary(itor); + } else if (itor->do_prefetch && itor->curr.hdr->next_extent_addr != 0 + && !btree_addrs_share_extent( + cc, itor->curr.addr, itor->end_addr)) + { + // Legacy single-extent-ahead prefetch (this can't be the last extent). + cache_prefetch(cc, itor->curr.hdr->next_extent_addr, itor->page_type); + } } } @@ -2898,20 +3020,24 @@ btree_iterator_next_leaf_async(btree_iterator_async_state *state, uint64 depth) state->itor->idx = 0; state->itor->curr_min_idx = -1; - // To prefetch: - // 1. we just moved from one extent to the next - // 2. 
this can't be the last extent - if (state->itor->do_prefetch - && !btree_addrs_share_extent( - state->itor->cc, state->last_addr, state->itor->curr.addr) - && state->itor->curr.hdr->next_extent_addr != 0 - && !btree_addrs_share_extent( - state->itor->cc, state->itor->curr.addr, state->itor->end_addr)) + // Only act when we just moved from one extent to the next. + if (!btree_addrs_share_extent( + state->itor->cc, state->last_addr, state->itor->curr.addr)) { - // IO prefetch the next extent - cache_prefetch(state->itor->cc, - state->itor->curr.hdr->next_extent_addr, - state->itor->page_type); + if (state->itor->prefetch.enabled) { + // Deep prefetch: refill the lookahead window of leaf extents. + btree_prefetch_cursor_on_boundary(state->itor); + } else if (state->itor->do_prefetch + && state->itor->curr.hdr->next_extent_addr != 0 + && !btree_addrs_share_extent(state->itor->cc, + state->itor->curr.addr, + state->itor->end_addr)) + { + // Legacy single-extent-ahead prefetch (this can't be the last extent). + cache_prefetch(state->itor->cc, + state->itor->curr.hdr->next_extent_addr, + state->itor->page_type); + } } async_return(state); @@ -2927,6 +3053,9 @@ btree_iterator_prev_leaf(btree_iterator *itor) { const btree_config *cfg = itor->cfg; + // We don't prefetch backward; turn off any forward prefetch cursor. + btree_prefetch_cursor_deinit(itor); + debug_only uint64 curr_addr = itor->curr.addr; /* * Copied nodes can have stale prev_addr values. Read the live current node @@ -2983,6 +3112,9 @@ btree_iterator_prev_leaf_async(btree_iterator_async_state *state, uint64 depth) { async_begin(state, depth); + // We don't prefetch backward; turn off any forward prefetch cursor. 
+ btree_prefetch_cursor_deinit(state->itor); + state->curr_addr = state->itor->curr.addr; if (btree_iterator_curr_is_copy(state->itor)) { state->live_curr.addr = state->curr_addr; @@ -3390,6 +3522,10 @@ btree_iterator_seek(iterator *base_itor, comparison seek_type, key seek_key) find_btree_node_and_get_idx_bounds(itor, seek_key, seek_type); } + // The iterator may have repositioned; re-anchor the prefetch cursor so a + // subsequent forward scan prefetches from the new location. + btree_prefetch_cursor_init(itor); + return STATUS_OK; } @@ -3446,6 +3582,7 @@ btree_iterator_init_common(cache *cc, bool32 do_prefetch, bool32 copy_nodes, uint32 height, + uint32 prefetch_lookahead, key *normalized_start_key) { platform_assert(root_addr != 0); @@ -3484,6 +3621,7 @@ btree_iterator_init_common(cache *cc, itor->max_key = max_key; itor->page_type = page_type; itor->super.ops = &btree_iterator_ops; + itor->prefetch.lookahead = prefetch_lookahead; if (copy_nodes) { itor->node_copy = TYPED_MANUAL_MALLOC( PROCESS_PRIVATE_HEAP_ID, itor->node_copy, btree_page_size(itor->cfg)); @@ -3513,6 +3651,9 @@ btree_iterator_init(cache *cc, bool32 copy_nodes, uint32 height) { + // The synchronous init defaults to the legacy single-extent-ahead prefetch. + // Callers wanting deep prefetch call btree_iterator_set_prefetch_lookahead() + // afterward (the async path threads a lookahead in directly). 
platform_status rc = btree_iterator_init_common(cc, cfg, itor, @@ -3526,6 +3667,7 @@ btree_iterator_init(cache *cc, do_prefetch, copy_nodes, height, + 1, &start_key); if (!SUCCESS(rc)) { return rc; @@ -3533,10 +3675,12 @@ btree_iterator_init(cache *cc, find_btree_node_and_get_idx_bounds(itor, start_key, start_type); - if (itor->do_prefetch && itor->curr.hdr->next_extent_addr != 0 + btree_prefetch_cursor_init(itor); + if (!itor->prefetch.enabled && itor->do_prefetch + && itor->curr.hdr->next_extent_addr != 0 && !btree_addrs_share_extent(cc, itor->curr.addr, itor->end_addr)) { - // IO prefetch the next extent + // Legacy single-extent-ahead prefetch (deep cursor not in use). cache_prefetch(cc, itor->curr.hdr->next_extent_addr, itor->page_type); } @@ -3546,6 +3690,17 @@ btree_iterator_init(cache *cc, return STATUS_OK; } +void +btree_iterator_set_prefetch_lookahead(btree_iterator *itor, + uint32 prefetch_lookahead) +{ + itor->prefetch.lookahead = prefetch_lookahead; + // Re-anchor the cursor at the current position with the new lookahead. This + // also issues an initial prefetch (replacing the legacy one-extent prefetch + // from init when the cursor engages). 
+ btree_prefetch_cursor_init(itor); +} + async_status btree_iterator_init_async(btree_iterator_async_state *state) { @@ -3564,6 +3719,7 @@ btree_iterator_init_async(btree_iterator_async_state *state) state->do_prefetch, state->copy_nodes, state->height, + state->prefetch_lookahead, &state->target); if (!SUCCESS(rc)) { async_return(state, rc); @@ -3573,11 +3729,13 @@ btree_iterator_init_async(btree_iterator_async_state *state) async_await_subroutine(state, find_btree_node_and_get_idx_bounds_async); btree_iterator_copy_curr_if_needed(state->itor); - if (state->itor->do_prefetch && state->itor->curr.hdr->next_extent_addr != 0 + btree_prefetch_cursor_init(state->itor); + if (!state->itor->prefetch.enabled && state->itor->do_prefetch + && state->itor->curr.hdr->next_extent_addr != 0 && !btree_addrs_share_extent( state->cc, state->itor->curr.addr, state->itor->end_addr)) { - // IO prefetch the next extent + // Legacy single-extent-ahead prefetch (deep cursor not in use). cache_prefetch(state->cc, state->itor->curr.hdr->next_extent_addr, state->itor->page_type); @@ -3599,6 +3757,7 @@ void btree_iterator_deinit(btree_iterator *itor) { debug_assert(itor != NULL); + btree_prefetch_cursor_deinit(itor); btree_iterator_release_curr(itor); if (itor->node_copy != NULL) { platform_free(PROCESS_PRIVATE_HEAP_ID, itor->node_copy); @@ -3726,6 +3885,11 @@ btree_pack_create_next_node(btree_pack_req *req, uint64 height, key pivot) PAGE_TYPE_BRANCH, &new_node); btree_pack_node_init_hdr(req->cfg, new_node.hdr, 0, height); + // Record where this node's extent is listed in the mini_allocator meta + // stream, so an iterator can position a prefetch cursor in O(1). Must come + // after init_hdr, which zeroes the header. 
+ new_node.hdr->meta_page_addr = + mini_current_extent_meta_page(&req->mini, NUM_BLOB_BATCHES + height); if (0 < req->num_edges[height]) { btree_node *old_node = btree_pack_get_current_node(req, height); diff --git a/src/btree.h b/src/btree.h index 1aa06a33..602a6b8b 100644 --- a/src/btree.h +++ b/src/btree.h @@ -130,6 +130,24 @@ typedef struct ONDISK btree_pivot_data { btree_pivot_stats stats; } btree_pivot_data; +/* + * Drives extent prefetching for a forward btree_iterator. Reads extent + * addresses ahead of the iterator from the branch's mini_allocator (via a + * mini_meta_cursor, exploiting that extents within a batch are in key order) + * and issues cache_prefetch for them, keeping ~lookahead leaf extents of IO in + * flight. Internal-node extents are skipped; blob extents are prefetched (for + * height-0 scans). Forward-only; disabled on backward moves. + */ +typedef struct btree_prefetch_cursor { + bool32 enabled; + bool32 at_end; // prefetched through the last in-range extent + uint32 lookahead; // K: target leaf extents in flight + uint64 leaf_batch; // mini batch of this iterator's level + bool32 prefetch_blobs; // also prefetch blob extents (height 0) + uint64 prefetched_ahead; // leaf extents prefetched, not yet consumed + mini_meta_cursor meta_cursor; +} btree_prefetch_cursor; + /* * A BTree iterator: */ @@ -155,6 +173,8 @@ typedef struct btree_iterator { uint64 end_addr; int64 end_idx; bool32 end_idx_valid; + + btree_prefetch_cursor prefetch; } btree_iterator; typedef struct btree_pack_req { @@ -316,6 +336,16 @@ btree_iterator_init(cache *cc, bool32 copy_nodes, uint32 height); +/* + * Set the extent-prefetch lookahead (in leaf extents) of an already-initialized + * iterator and re-anchor its prefetch cursor at the current position. A value + * >= 2 enables deep prefetch; <= 1 falls back to the legacy single-extent path. + * The iterator must have been initialized with do_prefetch == TRUE. 
+ */ +void +btree_iterator_set_prefetch_lookahead(btree_iterator *itor, + uint32 prefetch_lookahead); + // clang-format off DEFINE_ASYNC_STATE(btree_iterator_async_state, 5, param, cache *, cc, @@ -332,6 +362,7 @@ DEFINE_ASYNC_STATE(btree_iterator_async_state, 5, param, bool32, do_prefetch, param, bool32, copy_nodes, param, uint32, height, + param, uint32, prefetch_lookahead, param, async_callback_fn, callback, param, void *, callback_arg, local, platform_status, __async_result, diff --git a/src/btree_private.h b/src/btree_private.h index 5b8995b2..f1083b3c 100644 --- a/src/btree_private.h +++ b/src/btree_private.h @@ -35,6 +35,11 @@ struct ONDISK btree_hdr { uint64 prev_addr; uint64 next_addr; uint64 next_extent_addr; + // Address of the mini_allocator meta page that lists this node's extent. + // Stamped at pack time (see btree_pack_create_next_node); lets a forward + // (or, later, backward) prefetch cursor jump straight to this node's + // position in the extent stream instead of scanning from meta_head. + uint64 meta_page_addr; uint64 generation; uint8 height; node_offset next_entry; diff --git a/src/core.c b/src/core.c index cbfb0868..c870dd6a 100644 --- a/src/core.c +++ b/src/core.c @@ -56,6 +56,37 @@ _Static_assert(CORE_NUM_MEMTABLES <= MAX_MEMTABLES, /* Some randomly chosen Splinter super-block checksum seed. */ #define CORE_SUPER_CSUM_SEED (42) +/* + * Minimum extent-prefetch depth for an eligible branch in a range scan. Keeping + * at least this many extents in flight is what makes deep prefetch worthwhile + * compared to the legacy single-extent-ahead path. + */ +#define CORE_MIN_PREFETCH_LOOKAHEAD (2) + +/* + * Per-branch extent-prefetch depth for a range scan. The configured prefetch + * budget (total bytes of read-ahead to keep in flight, ~ the storage's + * bandwidth-delay product) is converted to extents and divided across the + * eligible branches, with a floor of CORE_MIN_PREFETCH_LOOKAHEAD per branch. 
+ * Dividing by the branch count bounds total outstanding read-ahead while still
+ * going deep when few branches dominate (a lone large branch gets the whole
+ * budget). Returns 0 when nothing is eligible to prefetch.
+ *
+ * The per-branch depth saturates at the largest value the uint32 return type
+ * can hold instead of wrapping when the configured budget is enormous.
+ */
+static uint32
+core_prefetch_lookahead(core_handle *spl, uint64 n_eligible)
+{
+   if (n_eligible == 0) {
+      return 0;
+   }
+
+   // Largest depth representable in the return type.
+   const uint64 max_lookahead = (uint32)-1;
+
+   uint64 budget_extents =
+      spl->cfg.prefetch_budget / cache_extent_size(spl->cc);
+   uint64 per_branch = budget_extents / n_eligible;
+   if (per_branch < CORE_MIN_PREFETCH_LOOKAHEAD) {
+      per_branch = CORE_MIN_PREFETCH_LOOKAHEAD;
+   }
+   // Clamp before narrowing: a plain (uint32) cast would keep only the low 32
+   // bits, so a huge budget could come out as 0 or 1, which the btree
+   // iterator treats as "deep prefetch off".
+   if (per_branch > max_lookahead) {
+      per_branch = max_lookahead;
+   }
+   return (uint32)per_branch;
+}
+
 /*
  * core logging functions.
  *
@@ -878,7 +909,8 @@ core_start_btree_iterator_init_async(
    comparison start_key_comparison,
    key start_key,
    bool32 do_prefetch,
-   bool32 copy_nodes)
+   bool32 copy_nodes,
+   uint32 prefetch_lookahead)
 {
    btree_iterator_async_state_init(&ctxt->state,
                                    spl->cc,
@@ -895,6 +927,7 @@ core_start_btree_iterator_init_async(
                                    do_prefetch,
                                    copy_nodes,
                                    0,
+                                   prefetch_lookahead,
                                    core_btree_iterator_init_async_callback,
                                    ctxt);
    ctxt->ready = FALSE;
@@ -1186,6 +1219,18 @@ core_range_iterator_init(core_handle *spl,
       return STATUS_NO_MEMORY;
    }
 
+   // Deep extent-prefetch for the scan: count the branches eligible to prefetch
+   // (compacted, and only when the scan is large enough to be worth it), then
+   // give each a share of the prefetch budget (see core_prefetch_lookahead).
+ uint64 n_prefetch_branches = 0; + for (uint64 branch_no = 0; branch_no < range_itor->num_branches; branch_no++) + { + if (range_itor->compacted[branch_no] && num_tuples > CORE_PREFETCH_MIN) { + n_prefetch_branches++; + } + } + uint32 deep_lookahead = core_prefetch_lookahead(spl, n_prefetch_branches); + uint64 started_inits = 0; for (uint64 i = 0; i < range_itor->num_branches; i++) { uint64 branch_no = range_itor->num_branches - i - 1; @@ -1193,11 +1238,10 @@ core_range_iterator_init(core_handle *spl, uint64 branch_addr = range_itor->branch[branch_no].addr; page_type page_type = range_itor->branch[branch_no].type; bool32 do_prefetch = FALSE; - if (range_itor->compacted[branch_no]) { - do_prefetch = - range_itor->compacted[branch_no] && num_tuples > CORE_PREFETCH_MIN - ? TRUE - : FALSE; + uint32 prefetch_lookahead = 1; + if (range_itor->compacted[branch_no] && num_tuples > CORE_PREFETCH_MIN) { + do_prefetch = TRUE; + prefetch_lookahead = deep_lookahead; } rc = core_start_btree_iterator_init_async( spl, @@ -1212,7 +1256,8 @@ core_range_iterator_init(core_handle *spl, start_key_comparison, start_key, do_prefetch, - branch_no == 0 ? first_memtable_copy_nodes : FALSE); + branch_no == 0 ? 
first_memtable_copy_nodes : FALSE, + prefetch_lookahead); started_inits++; if (!SUCCESS(rc)) { break; @@ -2483,6 +2528,7 @@ core_config_init(core_config *core_cfg, log_config *log_cfg, trunk_config *trunk_node_cfg, uint64 queue_scale_percent, + uint64 prefetch_budget, bool32 use_log, bool32 use_stats, bool32 verbose_logging, @@ -2499,6 +2545,7 @@ core_config_init(core_config *core_cfg, core_cfg->log_cfg = log_cfg; core_cfg->queue_scale_percent = queue_scale_percent; + core_cfg->prefetch_budget = prefetch_budget; core_cfg->use_log = use_log; core_cfg->use_stats = use_stats; core_cfg->verbose_logging_enabled = verbose_logging; diff --git a/src/core.h b/src/core.h index db90f982..0f6439e6 100644 --- a/src/core.h +++ b/src/core.h @@ -28,6 +28,10 @@ * Splinter Configuration structure *---------------------------------------------------------------------- */ +// Default range-scan prefetch budget (total extent read-ahead kept in flight), +// ~1 MiB == 8 extents at the default 128 KiB extent size. +#define CORE_DEFAULT_PREFETCH_BUDGET (1024UL * 1024) + typedef struct core_config { cache_config *cache_cfg; @@ -35,6 +39,11 @@ typedef struct core_config { uint64 queue_scale_percent; // Governs when inserters perform bg tasks. See // task.h + // Total bytes of extent read-ahead a range scan keeps in flight, divided + // across the branches it merges (see core_prefetch_lookahead). Roughly the + // storage's bandwidth-delay product; raise it for higher-latency devices. 
+ uint64 prefetch_budget; + bool32 use_stats; // stats memtable_config mt_cfg; btree_config *btree_cfg; @@ -291,6 +300,7 @@ core_config_init(core_config *trunk_cfg, log_config *log_cfg, trunk_config *trunk_node_cfg, uint64 queue_scale_percent, + uint64 prefetch_budget, bool32 use_log, bool32 use_stats, bool32 verbose_logging, diff --git a/src/mini_allocator.c b/src/mini_allocator.c index 4a4a6170..51cb7bc3 100644 --- a/src/mini_allocator.c +++ b/src/mini_allocator.c @@ -33,6 +33,7 @@ */ typedef struct ONDISK mini_meta_hdr { uint64 next_meta_addr; + uint64 prev_meta_addr; uint64 pos; uint32 num_entries; char entry_buffer[]; @@ -48,11 +49,70 @@ typedef struct ONDISK mini_meta_hdr { * mini_allocator. Currently, this is just the extent address itself. *----------------------------------------------------------------------------- */ +/* + * A meta_entry is packed into a single 8-byte word to keep the extent list + * dense (it avoids ONDISK padding and wastes no space). Extents are at least + * 128 KiB, so an extent address has at least 17 always-zero low bits; we store + * the extent *number* (extent_addr / extent_size) instead of the address, + * which frees more than enough bits to also record the page type and the + * originating mini_allocator batch: + * + * bits [ 0: 7] batch (one of MINI_MAX_BATCHES batches) + * bits [ 8:15] type (page_type) + * bits [16:63] extent number (extent_addr / extent_size) + */ typedef struct ONDISK meta_entry { - uint64 extent_addr; - uint8 type; + uint64 packed; } meta_entry; +#define META_ENTRY_BATCH_BITS (8) +#define META_ENTRY_TYPE_BITS (8) +#define META_ENTRY_EXTENT_BITS (64 - META_ENTRY_BATCH_BITS - META_ENTRY_TYPE_BITS) + +_Static_assert(MINI_MAX_BATCHES <= (1 << META_ENTRY_BATCH_BITS), + "mini_allocator batch number does not fit in a meta_entry"); +_Static_assert(NUM_PAGE_TYPES <= (1 << META_ENTRY_TYPE_BITS), + "page_type does not fit in a meta_entry"); + +static inline uint64 +meta_entry_batch(const meta_entry *entry) +{ + 
return entry->packed & ((1 << META_ENTRY_BATCH_BITS) - 1); +} + +static inline page_type +meta_entry_type(const meta_entry *entry) +{ + return (page_type)((entry->packed >> META_ENTRY_BATCH_BITS) + & ((1 << META_ENTRY_TYPE_BITS) - 1)); +} + +static inline uint64 +meta_entry_extent_addr(cache *cc, const meta_entry *entry) +{ + uint64 extent_number = + entry->packed >> (META_ENTRY_BATCH_BITS + META_ENTRY_TYPE_BITS); + return extent_number * cache_extent_size(cc); +} + +static inline void +meta_entry_pack(cache *cc, + meta_entry *entry, + uint64 extent_addr, + page_type type, + uint64 batch) +{ + uint64 extent_size = cache_extent_size(cc); + uint64 extent_number = extent_addr / extent_size; + debug_assert((extent_addr % extent_size) == 0); + debug_assert(extent_number < (1ULL << META_ENTRY_EXTENT_BITS)); + debug_assert((uint64)type < (1 << META_ENTRY_TYPE_BITS)); + debug_assert(batch < (1 << META_ENTRY_BATCH_BITS)); + entry->packed = + (extent_number << (META_ENTRY_BATCH_BITS + META_ENTRY_TYPE_BITS)) + | ((uint64)type << META_ENTRY_BATCH_BITS) | batch; +} + static meta_entry * first_entry(page_handle *meta_page) { @@ -83,6 +143,7 @@ mini_init_meta_page(mini_allocator *mini, page_handle *meta_page) { mini_meta_hdr *hdr = (mini_meta_hdr *)meta_page->data; hdr->next_meta_addr = 0; + hdr->prev_meta_addr = 0; hdr->pos = offsetof(typeof(*hdr), entry_buffer); hdr->num_entries = 0; } @@ -284,7 +345,8 @@ static bool32 mini_append_entry_to_page(mini_allocator *mini, page_handle *meta_page, uint64 extent_addr, - page_type type) + page_type type, + uint64 batch) { uint64 page_size = cache_page_size(mini->cc); debug_assert(extent_addr != 0); @@ -296,9 +358,8 @@ mini_append_entry_to_page(mini_allocator *mini, return FALSE; } - meta_entry *new_entry = pointer_byte_offset(hdr, hdr->pos); - new_entry->extent_addr = extent_addr; - new_entry->type = type; + meta_entry *new_entry = pointer_byte_offset(hdr, hdr->pos); + meta_entry_pack(mini->cc, new_entry, extent_addr, type, batch); 
hdr->pos += sizeof(meta_entry); hdr->num_entries++; @@ -381,13 +442,22 @@ mini_set_next_meta_addr(mini_allocator *mini, hdr->next_meta_addr = next_meta_addr; } +static void +mini_set_prev_meta_addr(mini_allocator *mini, + page_handle *meta_page, + uint64 prev_meta_addr) +{ + mini_meta_hdr *hdr = (mini_meta_hdr *)meta_page->data; + hdr->prev_meta_addr = prev_meta_addr; +} + static bool32 mini_append_entry(mini_allocator *mini, uint64 batch, uint64 next_addr) { page_handle *meta_page = mini_full_lock_meta_tail(mini); bool32 success; - success = - mini_append_entry_to_page(mini, meta_page, next_addr, mini->types[batch]); + success = mini_append_entry_to_page( + mini, meta_page, next_addr, mini->types[batch], batch); if (!success) { // need to allocate a new meta page uint64 new_meta_tail = mini->meta_tail + cache_page_size(mini->cc); @@ -401,19 +471,27 @@ mini_append_entry(mini_allocator *mini, uint64 batch, uint64 next_addr) mini_set_next_meta_addr(mini, meta_page, new_meta_tail); page_handle *last_meta_page = meta_page; + uint64 last_meta_addr = mini->meta_tail; meta_page = cache_alloc(mini->cc, new_meta_tail, mini->meta_type); mini->meta_tail = new_meta_tail; mini_full_unlock_meta_page(mini, last_meta_page); mini_init_meta_page(mini, meta_page); + // Doubly-link the meta list so a prefetch cursor can scan it backward. + mini_set_prev_meta_addr(mini, meta_page, last_meta_addr); success = mini_append_entry_to_page( - mini, meta_page, next_addr, mini->types[batch]); + mini, meta_page, next_addr, mini->types[batch], batch); if (mini->pinned) { cache_pin(mini->cc, meta_page); } debug_assert(success); } + // Record the meta page that now holds this extent's entry, so btree nodes + // allocated from this extent can point straight at it (see + // mini_current_extent_meta_page). Safe: the caller holds the batch lock, so + // mini->meta_tail cannot advance under us until we unlock the page below. 
+ mini->cur_extent_meta_page[batch] = mini->meta_tail; mini_full_unlock_meta_page(mini, meta_page); return TRUE; } @@ -726,7 +804,10 @@ mini_for_each_meta_page_func(cache *cc, uint64 num_meta_entries = mini_num_entries(meta_page); meta_entry *entry = first_entry(meta_page); for (uint64 i = 0; i < num_meta_entries; i++) { - fef->func(cc, entry->type, entry->extent_addr, fef->arg); + fef->func(cc, + meta_entry_type(entry), + meta_entry_extent_addr(cc, entry), + fef->arg); entry = next_entry(entry); } } @@ -824,6 +905,80 @@ mini_prefetch(cache *cc, page_type type, uint64 meta_head) mini_for_each(cc, meta_head, type, mini_prefetch_extent, NULL); } +/* + *----------------------------------------------------------------------------- + * mini_meta_cursor -- forward cursor over a mini_allocator's extent entries. + *----------------------------------------------------------------------------- + */ +void +mini_meta_cursor_init(mini_meta_cursor *cursor, + cache *cc, + page_type meta_type, + uint64 meta_addr) +{ + cursor->cc = cc; + cursor->meta_type = meta_type; + cursor->meta_page = NULL; + cursor->meta_addr = meta_addr; + cursor->entry_idx = 0; + cursor->num_entries = 0; +} + +void +mini_meta_cursor_deinit(mini_meta_cursor *cursor) +{ + if (cursor->meta_page != NULL) { + cache_unget(cursor->cc, cursor->meta_page); + cursor->meta_page = NULL; + } +} + +bool32 +mini_meta_cursor_next(mini_meta_cursor *cursor, + uint64 *extent_addr, + uint64 *batch) +{ + while (TRUE) { + if (cursor->meta_page == NULL) { + if (cursor->meta_addr == 0) { + return FALSE; + } + cursor->meta_page = + cache_get(cursor->cc, cursor->meta_addr, TRUE, cursor->meta_type); + cursor->num_entries = mini_num_entries(cursor->meta_page); + cursor->entry_idx = 0; + } + + if (cursor->entry_idx < cursor->num_entries) { + meta_entry *entry = first_entry(cursor->meta_page) + cursor->entry_idx; + *extent_addr = meta_entry_extent_addr(cursor->cc, entry); + *batch = meta_entry_batch(entry); + cursor->entry_idx++; + 
return TRUE; + } + + // Exhausted this page; advance to the next one (if any). + uint64 next_meta_addr = mini_get_next_meta_addr(cursor->meta_page); + cache_unget(cursor->cc, cursor->meta_page); + cursor->meta_page = NULL; + cursor->meta_addr = next_meta_addr; + } +} + +bool32 +mini_meta_cursor_seek_extent(mini_meta_cursor *cursor, + uint64 target_extent_addr) +{ + uint64 extent_addr; + uint64 batch; + while (mini_meta_cursor_next(cursor, &extent_addr, &batch)) { + if (extent_addr == target_extent_addr) { + return TRUE; + } + } + return FALSE; +} + static void space_use_add_extent(cache *cc, page_type type, uint64 extent_addr, void *out) { @@ -888,8 +1043,8 @@ mini_print(cache *cc, uint64 meta_head, page_type type) for (uint64 i = 0; i < num_entries; i++) { platform_default_log("| %3lu | %35lu | %s\n", i, - entry->extent_addr, - page_type_str[entry->type]); + meta_entry_extent_addr(cc, entry), + page_type_str[meta_entry_type(entry)]); entry = next_entry(entry); } platform_default_log("|-------------------------------------------|\n"); diff --git a/src/mini_allocator.h b/src/mini_allocator.h index f130ca78..33a02699 100644 --- a/src/mini_allocator.h +++ b/src/mini_allocator.h @@ -45,6 +45,12 @@ typedef struct mini_allocator { volatile uint64 next_addr[MINI_MAX_BATCHES]; uint64 saved_next_addr[MINI_MAX_BATCHES]; uint64 next_extent[MINI_MAX_BATCHES]; + // For each batch, the meta page that holds the entry for the extent the + // batch is currently allocating from. Lets a caller (e.g. the btree) record, + // in each page it allocates, where that page's extent is listed in the meta + // stream, so a prefetch cursor can start there without scanning from + // meta_head. See mini_current_extent_meta_page(). 
+ uint64 cur_extent_meta_page[MINI_MAX_BATCHES]; } mini_allocator; uint64 @@ -107,6 +113,50 @@ mini_unblock_dec_ref(cache *cc, uint64 meta_head); void mini_prefetch(cache *cc, page_type type, uint64 meta_head); +/* + * mini_meta_cursor: a forward cursor over the extent entries of a finalized + * mini_allocator, in allocation order. Entries from all batches are + * interleaved in the stream; the caller filters by batch as needed (each entry + * reports its batch). The btree iterator uses this to read extent addresses + * ahead of itself for prefetching. + * + * The cursor holds a read reference on the meta page it is currently reading; + * call mini_meta_cursor_deinit() to release it. The cursor reads meta pages + * with blocking cache_get(); meta pages are tiny and become hot quickly, so a + * miss is rare, but callers on async paths should be aware it can block. + */ +typedef struct mini_meta_cursor { + cache *cc; + page_type meta_type; + page_handle *meta_page; // currently held meta page, or NULL + uint64 meta_addr; // addr of meta_page, or the next page to load + uint64 entry_idx; // index of the next entry to read on meta_page + uint64 num_entries; // number of entries on meta_page +} mini_meta_cursor; + +void +mini_meta_cursor_init(mini_meta_cursor *cursor, + cache *cc, + page_type meta_type, + uint64 meta_addr); + +void +mini_meta_cursor_deinit(mini_meta_cursor *cursor); + +// Emit the next extent entry (its extent address and originating batch) in +// allocation order. Returns FALSE once the stream is exhausted. +bool32 +mini_meta_cursor_next(mini_meta_cursor *cursor, + uint64 *extent_addr, + uint64 *batch); + +// Advance the cursor until it emits the entry for target_extent_addr, leaving +// the cursor positioned just after it. Returns FALSE if not found before the +// stream ends. 
+bool32 +mini_meta_cursor_seek_extent(mini_meta_cursor *cursor, + uint64 target_extent_addr); + /* Return total bytes allocated by the mini_allocator, including space used by * the mini_allocator itself.*/ uint64 @@ -121,6 +171,17 @@ mini_meta_tail(mini_allocator *mini) return mini->meta_tail; } +/* + * Address of the meta page holding the extent entry for the extent that batch + * is currently allocating from. Valid immediately after an allocation from + * batch (e.g. mini_alloc_page), for the thread that performed it. + */ +static inline uint64 +mini_current_extent_meta_page(mini_allocator *mini, uint64 batch) +{ + return mini->cur_extent_meta_page[batch]; +} + static inline uint64 mini_num_extents(mini_allocator *mini) diff --git a/src/splinterdb.c b/src/splinterdb.c index 10addc51..d4074bc1 100644 --- a/src/splinterdb.c +++ b/src/splinterdb.c @@ -160,6 +160,9 @@ splinterdb_config_set_defaults(splinterdb_config *cfg) if (!cfg->reclaim_threshold) { cfg->reclaim_threshold = UINT64_MAX; } + if (!cfg->prefetch_budget) { + cfg->prefetch_budget = CORE_DEFAULT_PREFETCH_BUDGET; + } } static platform_status @@ -290,6 +293,7 @@ splinterdb_init_config(const splinterdb_config *kvs_cfg, // IN (log_config *)&kvs->log_cfg, &kvs->trunk_node_cfg, cfg.queue_scale_percent, + cfg.prefetch_budget, cfg.use_log, cfg.use_stats, FALSE, diff --git a/tests/config.c b/tests/config.c index 2fe9a5f5..1f2f873d 100644 --- a/tests/config.c +++ b/tests/config.c @@ -90,6 +90,7 @@ config_set_defaults(master_config *cfg) .use_stats = FALSE, .reclaim_threshold = UINT64_MAX, .queue_scale_percent = TEST_CONFIG_DEFAULT_QUEUE_SCALE_PERCENT, + .prefetch_budget = MiB_TO_B(1), .verbose_logging_enabled = FALSE, .verbose_progress = FALSE, @@ -380,6 +381,9 @@ config_parse(master_config *cfg, const uint8 num_config, int argc, char *argv[]) config_set_gib("cache-capacity", cfg, cache_capacity) {} config_set_string("cache-debug-log", cfg, cache_logfile) {} config_set_uint64("queue-scale-percent", cfg, 
queue_scale_percent) {} + config_set_mib("prefetch-budget", cfg, prefetch_budget) {} + config_set_gib("prefetch-budget", cfg, prefetch_budget) {} + config_set_uint64("prefetch-budget-bytes", cfg, prefetch_budget) {} config_set_mib("memtable-capacity", cfg, memtable_capacity) {} config_set_gib("memtable-capacity", cfg, memtable_capacity) {} config_set_uint64("rough-count-height", cfg, btree_rough_count_height) diff --git a/tests/config.h b/tests/config.h index 2c474d83..089f06a1 100644 --- a/tests/config.h +++ b/tests/config.h @@ -96,6 +96,9 @@ typedef struct master_config { platform_log_handle *log_handle; + // prefetch + uint64 prefetch_budget; + // data uint64 key_size; uint64 message_size; diff --git a/tests/functional/btree_test.c b/tests/functional/btree_test.c index 7705a2d1..86b8113f 100644 --- a/tests/functional/btree_test.c +++ b/tests/functional/btree_test.c @@ -35,6 +35,7 @@ typedef struct btree_scan_perf_options { bool32 random_bounds; bool32 memtable_no_copy_nodes; bool32 memtable_copy_nodes; + uint32 prefetch_lookahead; // 0=no prefetch, 1=legacy single-extent, >=2 deep } btree_scan_perf_options; static const char * @@ -365,12 +366,15 @@ test_btree_scan_once(cache *cc, key min_key, key max_key, bool32 copy_nodes, + uint32 prefetch_lookahead, uint64 expected_tuples, uint64 *init_elapsed_ns, uint64 *scan_elapsed_ns, uint64 *tuples_scanned, uint64 *logical_bytes_scanned) { + // prefetch_lookahead 0 => no prefetch; 1 => legacy single-extent; >=2 => deep + bool32 do_prefetch = (prefetch_lookahead >= 1); btree_iterator itor; timestamp start_time = platform_get_timestamp(); platform_status rc = btree_iterator_init(cc, @@ -384,9 +388,12 @@ test_btree_scan_once(cache *cc, max_key, greater_than_or_equal, min_key, - FALSE, + do_prefetch, copy_nodes, 0); + if (SUCCESS(rc) && do_prefetch) { + btree_iterator_set_prefetch_lookahead(&itor, prefetch_lookahead); + } *init_elapsed_ns += platform_timestamp_elapsed(start_time); if (!SUCCESS(rc)) { return rc; @@ -540,6 
+547,7 @@ test_btree_scan_benchmark_tree(cache *cc, min_key, max_key, copy_nodes, + options->prefetch_lookahead, expected_tuples, &init_elapsed_ns, &scan_elapsed_ns, @@ -1961,6 +1969,7 @@ btree_scan_perf_options_default(btree_scan_perf_options *options) .random_bounds = FALSE, .memtable_no_copy_nodes = TRUE, .memtable_copy_nodes = TRUE, + .prefetch_lookahead = 0, }; } @@ -2018,6 +2027,15 @@ btree_scan_perf_parse_args(int argc, platform_free(platform_get_heap_id(), filtered); return STATUS_BAD_PARAM; } + } else if (STRING_EQUALS_LITERAL(argv[i], "--prefetch-lookahead")) { + uint64 lookahead; + if (i + 1 == argc || !try_string_to_uint64(argv[++i], &lookahead)) { + platform_error_log( + "btree_test: failed to parse --prefetch-lookahead\n"); + platform_free(platform_get_heap_id(), filtered); + return STATUS_BAD_PARAM; + } + options->prefetch_lookahead = (uint32)lookahead; } else if (STRING_EQUALS_LITERAL(argv[i], "--random-scan-bounds") || STRING_EQUALS_LITERAL(argv[i], "--random-scan-starts")) { @@ -2087,6 +2105,8 @@ usage(const char *argv0) "for each scan\n"); platform_error_log("\t--memtable-scan-mode choose which memtable " "iterator mode(s) to benchmark (default both)\n"); + platform_error_log("\t--prefetch-lookahead extents to prefetch ahead " + "(0=off, 1=legacy single-extent, >=2 deep; default 0)\n"); config_usage(); } diff --git a/tests/functional/scan_benchmark.c b/tests/functional/scan_benchmark.c index 444acdb8..94554856 100644 --- a/tests/functional/scan_benchmark.c +++ b/tests/functional/scan_benchmark.c @@ -408,6 +408,7 @@ scan_benchmark_make_config(const master_config *master_cfg, .use_stats = master_cfg->use_stats, .reclaim_threshold = master_cfg->reclaim_threshold, .queue_scale_percent = master_cfg->queue_scale_percent, + .prefetch_budget = master_cfg->prefetch_budget, }; if (open_existing) { diff --git a/tests/functional/test.h b/tests/functional/test.h index 15fd7eb8..2cb2de05 100644 --- a/tests/functional/test.h +++ b/tests/functional/test.h @@ 
-305,6 +305,7 @@ test_config_init(system_config *system_cfg, // OUT (log_config *)&system_cfg->log_cfg, &system_cfg->trunk_node_cfg, master_cfg->queue_scale_percent, + master_cfg->prefetch_budget, master_cfg->use_log, master_cfg->use_stats, master_cfg->verbose_logging_enabled, diff --git a/tests/unit/btree_test.c b/tests/unit/btree_test.c index 35f59259..064932d1 100644 --- a/tests/unit/btree_test.c +++ b/tests/unit/btree_test.c @@ -188,8 +188,10 @@ leaf_hdr_tests(btree_config *cfg, btree_scratch *scratch, platform_heap_id hid) * about this number. If you change the size of a btree leaf header * or the size of a btree leafy entry, then this number will need * to be changed, and that's fine. + * (Reduced from 208 to 207 when btree_hdr gained the 8-byte + * meta_page_addr field used by the iterator prefetch cursor.) */ - int nkvs = 208; + int nkvs = 207; btree_init_hdr(cfg, hdr); From 52dda7922d1ee50959601482c86407f5e6f33e63 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Wed, 24 Jun 2026 11:57:11 -0700 Subject: [PATCH 02/15] more async init and prefetch ramp up Signed-off-by: Rob Johnson --- src/btree.c | 222 +++++++++++++++++++++++++++---------------- src/btree.h | 24 ++++- src/cache.h | 19 ++++ src/clockcache.c | 125 +++++++++++++++++++++++- src/mini_allocator.c | 29 ++++-- src/mini_allocator.h | 26 +++-- 6 files changed, 342 insertions(+), 103 deletions(-) diff --git a/src/btree.c b/src/btree.c index 235e1e91..c90ec4e1 100644 --- a/src/btree.c +++ b/src/btree.c @@ -2862,19 +2862,26 @@ btree_extent_base_addr(cache *cc, uint64 addr) } /* - * Issue prefetches until ~lookahead leaf extents are in flight, or we reach the + * Issue prefetches until ~depth leaf extents are in flight, or we reach the * iterator's end extent or the end of the extent stream. Leaf extents count - * toward the lookahead; blob extents in the window are prefetched but not - * counted; internal-node extents are skipped. 
+ * toward the depth; blob extents in the window are prefetched but not counted; + * internal-node extents are skipped. Non-blocking: if a meta page needed to + * read further ahead isn't resident yet, fill stops early (a single-page + * prefetch for it was issued) and resumes on a later boundary. */ static void btree_prefetch_cursor_fill(btree_iterator *itor) { btree_prefetch_cursor *pf = &itor->prefetch; - while (!pf->at_end && pf->prefetched_ahead < pf->lookahead) { - uint64 extent_addr; - uint64 batch; - if (!mini_meta_cursor_next(&pf->meta_cursor, &extent_addr, &batch)) { + while (!pf->at_end && pf->prefetched_ahead < pf->depth) { + uint64 extent_addr; + uint64 batch; + mini_meta_cursor_status status = + mini_meta_cursor_next(&pf->meta_cursor, &extent_addr, &batch); + if (status == MINI_META_CURSOR_WOULD_BLOCK) { + break; // meta page not resident; prefetch issued, retry next boundary. + } + if (status == MINI_META_CURSOR_END) { pf->at_end = TRUE; break; } @@ -2893,52 +2900,55 @@ btree_prefetch_cursor_fill(btree_iterator *itor) } /* - * (Re)initialize the prefetch cursor at the iterator's current leaf and prime - * it. Falls back to disabled (legacy single-extent prefetch) when deep prefetch - * does not apply or the start extent cannot be located. + * Try to position the (PRIMING) cursor at the iterator's current leaf extent. + * Non-blocking: kicks off the meta-page IO (via mini_meta_cursor) and, if the + * page isn't resident yet, leaves the cursor PRIMING to be retried later. Reads + * the *current* leaf's meta_page_addr each call, so it positions correctly even + * if the iterator advanced across extents while priming. Returns TRUE iff the + * cursor just became ACTIVE (positioned and initial fill done). */ -static void -btree_prefetch_cursor_init(btree_iterator *itor) +static bool32 +btree_prefetch_cursor_pump(btree_iterator *itor) { btree_prefetch_cursor *pf = &itor->prefetch; - // Reset any previously-active cursor (e.g. on a seek). 
- if (pf->enabled) { - mini_meta_cursor_deinit(&pf->meta_cursor); - } - pf->enabled = FALSE; - pf->at_end = FALSE; - pf->prefetched_ahead = 0; - pf->leaf_batch = NUM_BLOB_BATCHES + itor->height; - pf->prefetch_blobs = (itor->height == 0); - - // Deep prefetch applies only to forward scans of finalized branches with a - // lookahead of 2+; everything else uses the legacy next_extent_addr path. - if (!itor->do_prefetch || pf->lookahead <= 1 - || itor->page_type != PAGE_TYPE_BRANCH || itor->curr.page == NULL) - { - return; - } - uint64 meta_page_addr = itor->curr.hdr->meta_page_addr; if (meta_page_addr == 0) { - return; // node predates the stamp, or a tiny tree: fall back to legacy. + // Node predates the stamp, or a tiny tree: fall back to legacy. + pf->state = BTREE_PREFETCH_DISABLED; + return FALSE; } + // (Re)anchor the meta cursor at the current leaf's meta page. + mini_meta_cursor_deinit(&pf->meta_cursor); mini_meta_cursor_init( &pf->meta_cursor, itor->cc, itor->page_type, meta_page_addr); - uint64 start_extent = btree_extent_base_addr(itor->cc, itor->curr.addr); - if (!mini_meta_cursor_seek_extent(&pf->meta_cursor, start_extent)) { + uint64 cur_extent = btree_extent_base_addr(itor->cc, itor->curr.addr); + mini_meta_cursor_status status = + mini_meta_cursor_seek_extent(&pf->meta_cursor, cur_extent); + if (status == MINI_META_CURSOR_WOULD_BLOCK) { + return FALSE; // still priming; meta-page prefetch issued by the seek. + } + if (status != MINI_META_CURSOR_ENTRY) { + // Couldn't locate our extent in this meta page: fall back to legacy. mini_meta_cursor_deinit(&pf->meta_cursor); - return; // couldn't locate our extent; fall back to legacy. + pf->state = BTREE_PREFETCH_DISABLED; + return FALSE; } - pf->enabled = TRUE; + + // Positioned. Activate and prime the (ramped) lookahead window. 
+ pf->state = BTREE_PREFETCH_ACTIVE; + pf->at_end = FALSE; + pf->prefetched_ahead = 0; + pf->depth = BTREE_PREFETCH_RAMP_MIN; btree_prefetch_cursor_fill(itor); + return TRUE; } /* - * Called when the iterator crosses forward into a new leaf extent: account for - * the consumed extent and refill the lookahead window. + * Called when the iterator crosses forward into a new leaf extent while the + * cursor is ACTIVE: account for the consumed extent, ramp the depth up toward + * the configured cap, and refill the lookahead window. */ static void btree_prefetch_cursor_on_boundary(btree_iterator *itor) @@ -2947,17 +2957,92 @@ btree_prefetch_cursor_on_boundary(btree_iterator *itor) if (pf->prefetched_ahead > 0) { pf->prefetched_ahead--; } + // Ramp up (slow-start): the scan has proven longer, so read further ahead. + if (pf->depth < pf->lookahead) { + pf->depth *= 2; + if (pf->depth > pf->lookahead) { + pf->depth = pf->lookahead; + } + } btree_prefetch_cursor_fill(itor); } +/* + * (Re)start deep prefetch at the iterator's current leaf (used at init and on + * seek). Non-blocking: kicks off the meta-page IO and leaves the cursor PRIMING + * (to be completed lazily as the scan advances) unless the meta page is already + * resident, in which case it becomes ACTIVE immediately. Falls back to DISABLED + * (legacy single-extent prefetch) when deep prefetch does not apply. + */ +static void +btree_prefetch_cursor_start(btree_iterator *itor) +{ + btree_prefetch_cursor *pf = &itor->prefetch; + + // Reset any previously-active cursor (e.g. on a seek). + mini_meta_cursor_deinit(&pf->meta_cursor); + pf->state = BTREE_PREFETCH_DISABLED; + pf->at_end = FALSE; + pf->prefetched_ahead = 0; + pf->depth = BTREE_PREFETCH_RAMP_MIN; + pf->leaf_batch = NUM_BLOB_BATCHES + itor->height; + pf->prefetch_blobs = (itor->height == 0); + + // Deep prefetch applies only to forward scans of finalized branches with a + // lookahead of 2+; everything else uses the legacy next_extent_addr path. 
+ if (!itor->do_prefetch || pf->lookahead <= 1 + || itor->page_type != PAGE_TYPE_BRANCH || itor->curr.page == NULL) + { + return; + } + + pf->state = BTREE_PREFETCH_PRIMING; + btree_prefetch_cursor_pump(itor); +} + /* Release the cursor's resources and turn it off. */ static void btree_prefetch_cursor_deinit(btree_iterator *itor) { btree_prefetch_cursor *pf = &itor->prefetch; - if (pf->enabled) { - mini_meta_cursor_deinit(&pf->meta_cursor); - pf->enabled = FALSE; + mini_meta_cursor_deinit(&pf->meta_cursor); + pf->state = BTREE_PREFETCH_DISABLED; +} + +/* + * Drive prefetching after the iterator advances one leaf (forward). Pumps the + * non-blocking prime while still PRIMING, then, on an extent-boundary crossing, + * either refills the deep window (ACTIVE) or issues the legacy single-extent + * prefetch (PRIMING/DISABLED). Safe to call on every leaf step; the meta page + * (whose IO was kicked off at init) typically lands during the first extent. + */ +static void +btree_iterator_prefetch_on_advance(btree_iterator *itor, uint64 last_addr) +{ + cache *cc = itor->cc; + btree_prefetch_cursor *pf = &itor->prefetch; + + bool32 positioned_now = FALSE; + if (pf->state == BTREE_PREFETCH_PRIMING) { + positioned_now = btree_prefetch_cursor_pump(itor); + } + + // Only act on prefetching when we just moved from one extent to the next. + if (btree_addrs_share_extent(cc, last_addr, itor->curr.addr)) { + return; + } + + if (pf->state == BTREE_PREFETCH_ACTIVE) { + // If the pump positioned us this round it already anchored at the current + // extent and filled, so don't also advance the window. + if (!positioned_now) { + btree_prefetch_cursor_on_boundary(itor); + } + } else if (itor->do_prefetch && itor->curr.hdr->next_extent_addr != 0 + && !btree_addrs_share_extent(cc, itor->curr.addr, itor->end_addr)) + { + // Legacy single-extent-ahead prefetch (this can't be the last extent). 
+ cache_prefetch(cc, itor->curr.hdr->next_extent_addr, itor->page_type); } } @@ -2970,8 +3055,6 @@ btree_prefetch_cursor_deinit(btree_iterator *itor) static void btree_iterator_next_leaf(btree_iterator *itor) { - cache *cc = itor->cc; - uint64 last_addr = itor->curr.addr; uint64 next_addr = itor->curr.hdr->next_addr; btree_iterator_release_curr(itor); @@ -2979,19 +3062,7 @@ btree_iterator_next_leaf(btree_iterator *itor) itor->idx = 0; itor->curr_min_idx = -1; - // Only act when we just moved from one extent to the next. - if (!btree_addrs_share_extent(cc, last_addr, itor->curr.addr)) { - if (itor->prefetch.enabled) { - // Deep prefetch: refill the lookahead window of leaf extents. - btree_prefetch_cursor_on_boundary(itor); - } else if (itor->do_prefetch && itor->curr.hdr->next_extent_addr != 0 - && !btree_addrs_share_extent( - cc, itor->curr.addr, itor->end_addr)) - { - // Legacy single-extent-ahead prefetch (this can't be the last extent). - cache_prefetch(cc, itor->curr.hdr->next_extent_addr, itor->page_type); - } - } + btree_iterator_prefetch_on_advance(itor, last_addr); } static async_status @@ -3020,25 +3091,9 @@ btree_iterator_next_leaf_async(btree_iterator_async_state *state, uint64 depth) state->itor->idx = 0; state->itor->curr_min_idx = -1; - // Only act when we just moved from one extent to the next. - if (!btree_addrs_share_extent( - state->itor->cc, state->last_addr, state->itor->curr.addr)) - { - if (state->itor->prefetch.enabled) { - // Deep prefetch: refill the lookahead window of leaf extents. - btree_prefetch_cursor_on_boundary(state->itor); - } else if (state->itor->do_prefetch - && state->itor->curr.hdr->next_extent_addr != 0 - && !btree_addrs_share_extent(state->itor->cc, - state->itor->curr.addr, - state->itor->end_addr)) - { - // Legacy single-extent-ahead prefetch (this can't be the last extent). 
- cache_prefetch(state->itor->cc, - state->itor->curr.hdr->next_extent_addr, - state->itor->page_type); - } - } + // Prefetching (pump + boundary refill / legacy) is all non-blocking, so it + // needs no awaits here. + btree_iterator_prefetch_on_advance(state->itor, state->last_addr); async_return(state); } @@ -3524,7 +3579,7 @@ btree_iterator_seek(iterator *base_itor, comparison seek_type, key seek_key) // The iterator may have repositioned; re-anchor the prefetch cursor so a // subsequent forward scan prefetches from the new location. - btree_prefetch_cursor_init(itor); + btree_prefetch_cursor_start(itor); return STATUS_OK; } @@ -3675,12 +3730,13 @@ btree_iterator_init(cache *cc, find_btree_node_and_get_idx_bounds(itor, start_key, start_type); - btree_prefetch_cursor_init(itor); - if (!itor->prefetch.enabled && itor->do_prefetch + btree_prefetch_cursor_start(itor); + // While the deep cursor is priming (or disabled), cover the next extent with + // the legacy single-extent-ahead prefetch via next_extent_addr. + if (itor->prefetch.state != BTREE_PREFETCH_ACTIVE && itor->do_prefetch && itor->curr.hdr->next_extent_addr != 0 && !btree_addrs_share_extent(cc, itor->curr.addr, itor->end_addr)) { - // Legacy single-extent-ahead prefetch (deep cursor not in use). cache_prefetch(cc, itor->curr.hdr->next_extent_addr, itor->page_type); } @@ -3698,7 +3754,7 @@ btree_iterator_set_prefetch_lookahead(btree_iterator *itor, // Re-anchor the cursor at the current position with the new lookahead. This // also issues an initial prefetch (replacing the legacy one-extent prefetch // from init when the cursor engages). 
- btree_prefetch_cursor_init(itor); + btree_prefetch_cursor_start(itor); } async_status @@ -3729,13 +3785,15 @@ btree_iterator_init_async(btree_iterator_async_state *state) async_await_subroutine(state, find_btree_node_and_get_idx_bounds_async); btree_iterator_copy_curr_if_needed(state->itor); - btree_prefetch_cursor_init(state->itor); - if (!state->itor->prefetch.enabled && state->itor->do_prefetch + btree_prefetch_cursor_start(state->itor); + // While the deep cursor is priming (or disabled), cover the next extent with + // the legacy single-extent-ahead prefetch via next_extent_addr. + if (state->itor->prefetch.state != BTREE_PREFETCH_ACTIVE + && state->itor->do_prefetch && state->itor->curr.hdr->next_extent_addr != 0 && !btree_addrs_share_extent( state->cc, state->itor->curr.addr, state->itor->end_addr)) { - // Legacy single-extent-ahead prefetch (deep cursor not in use). cache_prefetch(state->cc, state->itor->curr.hdr->next_extent_addr, state->itor->page_type); diff --git a/src/btree.h b/src/btree.h index 602a6b8b..6456bf07 100644 --- a/src/btree.h +++ b/src/btree.h @@ -134,14 +134,32 @@ typedef struct ONDISK btree_pivot_data { * Drives extent prefetching for a forward btree_iterator. Reads extent * addresses ahead of the iterator from the branch's mini_allocator (via a * mini_meta_cursor, exploiting that extents within a batch are in key order) - * and issues cache_prefetch for them, keeping ~lookahead leaf extents of IO in + * and issues cache_prefetch for them, keeping up to ~depth leaf extents of IO in * flight. Internal-node extents are skipped; blob extents are prefetched (for * height-0 scans). Forward-only; disabled on backward moves. + * + * Priming is non-blocking: the cursor's meta page is fetched lazily (PRIMING + * state) so the iterator's async init never waits on it and the first tuple is + * not delayed. The legacy single-extent-ahead prefetch (via the leaf's + * next_extent_addr) covers the window until the cursor becomes ACTIVE. 
+ * + * Depth ramps up (slow-start) from BTREE_PREFETCH_RAMP_MIN toward `lookahead` as + * the scan proves long, so short scans don't waste bandwidth reading far ahead. */ +typedef enum btree_prefetch_state { + BTREE_PREFETCH_DISABLED = 0, // legacy next_extent_addr path / not applicable + BTREE_PREFETCH_PRIMING, // meta-page IO kicked off; not yet positioned + BTREE_PREFETCH_ACTIVE, // positioned; issuing deep prefetches +} btree_prefetch_state; + +// Initial (and minimum) ramp-up depth; depth doubles toward `lookahead`. +#define BTREE_PREFETCH_RAMP_MIN (1) + typedef struct btree_prefetch_cursor { - bool32 enabled; + btree_prefetch_state state; bool32 at_end; // prefetched through the last in-range extent - uint32 lookahead; // K: target leaf extents in flight + uint32 lookahead; // K: max leaf extents in flight (the cap) + uint32 depth; // current ramp-up depth (<= lookahead) uint64 leaf_batch; // mini batch of this iterator's level bool32 prefetch_blobs; // also prefetch blob extents (height 0) uint64 prefetched_ahead; // leaf extents prefetched, not yet consumed diff --git a/src/cache.h b/src/cache.h index 9e638aea..bc239b7a 100644 --- a/src/cache.h +++ b/src/cache.h @@ -169,6 +169,7 @@ typedef struct cache_ops { page_generic_fn page_lock; page_generic_fn page_unlock; page_prefetch_fn page_prefetch; + page_prefetch_fn page_prefetch_page; page_generic_fn page_mark_dirty; page_generic_fn page_pin; page_generic_fn page_unpin; @@ -408,6 +409,24 @@ cache_prefetch(cache *cc, uint64 addr, page_type type) return cc->ops->page_prefetch(cc, addr, type); } +/* + *---------------------------------------------------------------------- + * cache_prefetch_page + * + * Like cache_prefetch, but loads only the single page at addr rather than the + * whole extent that contains it. Use this for sparse reads (e.g. a single + * mini_allocator meta page) where pulling in the surrounding extent would waste + * bandwidth. 
No notification is provided to the calling thread; it may call + * cache_get when it's ready to block on the arrival of the page. + * + *---------------------------------------------------------------------- + */ +static inline void +cache_prefetch_page(cache *cc, uint64 addr, page_type type) +{ + return cc->ops->page_prefetch_page(cc, addr, type); +} + /* *---------------------------------------------------------------------- * cache_mark_dirty diff --git a/src/clockcache.c b/src/clockcache.c index d9585733..85beccf7 100644 --- a/src/clockcache.c +++ b/src/clockcache.c @@ -1805,7 +1805,10 @@ page_handle * clockcache_get(clockcache *cc, uint64 addr, bool32 blocking, page_type type) { bool32 retry; - page_handle *handle; + // Initialize to NULL so a non-blocking get of a page that is not in cache + // (clockcache_get_internal returns without setting handle) honors the + // documented contract of returning NULL rather than an uninitialized value. + page_handle *handle = NULL; debug_assert(cc->per_thread[platform_get_tid()].enable_sync_get || type == PAGE_TYPE_MEMTABLE); @@ -2667,6 +2670,118 @@ clockcache_prefetch(clockcache *cc, uint64 base_addr, page_type type) } } +/* + *----------------------------------------------------------------------------- + * clockcache_prefetch_page -- + * + * Like clockcache_prefetch, but loads only the single page at addr instead + * of its whole extent. Used for sparse reads (e.g. a mini_allocator meta + * page) where dragging in the surrounding extent would waste bandwidth. 
+ *----------------------------------------------------------------------------- + */ +void +clockcache_prefetch_page(clockcache *cc, uint64 addr, page_type type) +{ + threadid tid = platform_get_tid(); + + debug_assert(addr % clockcache_page_size(cc) == 0); + + while (TRUE) { + uint32 entry_no = clockcache_lookup(cc, addr); + get_rc get_read_rc; + if (entry_no != CC_UNMAPPED_ENTRY) { + get_read_rc = clockcache_try_get_read(cc, entry_no, TRUE); + } else { + get_read_rc = GET_RC_EVICTED; + } + + switch (get_read_rc) { + case GET_RC_SUCCESS: + // already resident: drop the ref we just took and we're done. + clockcache_dec_ref(cc, entry_no, tid); + return; + case GET_RC_CONFLICT: + // someone else is loading or has it locked: nothing to do. + return; + case GET_RC_EVICTED: + { + // not in cache: load just this page. + uint32 free_entry_no = clockcache_get_free_page( + cc, CC_READ_LOADING_STATUS, type, FALSE, TRUE); + clockcache_entry *entry = &cc->entry[free_entry_no]; + entry->page.disk_addr = addr; + entry->type = type; + uint64 lookup_no = clockcache_divide_by_page_size(cc, addr); + + async_io_state *state = TYPED_MALLOC(PROCESS_PRIVATE_HEAP_ID, state); + if (state == NULL) { + platform_error_log("clockcache_prefetch_page: async_io_state " + "allocation failed for page addr %lu, " + "type %u\n", + addr, + type); + clockcache_release_unpublished_entry(entry); + return; + } + state->cc = cc; + platform_status rc = io_async_state_init(state->iostate, + cc->io, + io_async_preadv, + addr, + clockcache_prefetch_callback, + state); + if (!SUCCESS(rc)) { + platform_error_log("clockcache_prefetch_page: " + "io_async_state_init failed for page addr " + "%lu, type %u: %s\n", + addr, + type, + platform_status_to_string(rc)); + clockcache_release_unpublished_entry(entry); + platform_free(PROCESS_PRIVATE_HEAP_ID, state); + return; + } + + if (__sync_bool_compare_and_swap( + &cc->lookup[lookup_no], CC_UNMAPPED_ENTRY, free_entry_no)) + { + rc = 
io_async_state_append_page(state->iostate, + entry->page.data); + if (!SUCCESS(rc)) { + platform_error_log("clockcache_prefetch_page: " + "io_async_state_append_page failed for " + "page addr %lu, entry %u, type %u: %s\n", + addr, + free_entry_no, + type, + platform_status_to_string(rc)); + } + platform_assert_status_ok(rc); + if (cc->cfg->use_stats) { + cc->stats[tid].page_reads[type]++; + cc->stats[tid].prefetches_issued[type]++; + } + clockcache_log(addr, + free_entry_no, + "prefetch_page (load): entry %u addr %lu\n", + free_entry_no, + addr); + io_async_run(state->iostate); + return; + } else { + // someone else started loading this page: release and retry. + clockcache_release_unpublished_entry(entry); + io_async_state_deinit(state->iostate); + platform_free(PROCESS_PRIVATE_HEAP_ID, state); + continue; + } + } + default: + platform_assert(0); + } + } +} + /* *---------------------------------------------------------------------- * clockcache_print -- @@ -3044,6 +3159,13 @@ clockcache_prefetch_virtual(cache *c, uint64 addr, page_type type) clockcache_prefetch(cc, addr, type); } +void +clockcache_prefetch_page_virtual(cache *c, uint64 addr, page_type type) +{ + clockcache *cc = (clockcache *)c; + clockcache_prefetch_page(cc, addr, type); +} + void clockcache_mark_dirty_virtual(cache *c, page_handle *page) { @@ -3245,6 +3367,7 @@ static cache_ops clockcache_ops = { .page_lock = clockcache_lock_virtual, .page_unlock = clockcache_unlock_virtual, .page_prefetch = clockcache_prefetch_virtual, + .page_prefetch_page = clockcache_prefetch_page_virtual, .page_mark_dirty = clockcache_mark_dirty_virtual, .page_pin = clockcache_pin_virtual, .page_unpin = clockcache_unpin_virtual, diff --git a/src/mini_allocator.c b/src/mini_allocator.c index 51cb7bc3..3d584be4 100644 --- a/src/mini_allocator.c +++ b/src/mini_allocator.c @@ -933,7 +933,7 @@ mini_meta_cursor_deinit(mini_meta_cursor *cursor) } } -bool32 +mini_meta_cursor_status mini_meta_cursor_next(mini_meta_cursor *cursor, 
uint64 *extent_addr, uint64 *batch) @@ -941,10 +941,17 @@ mini_meta_cursor_next(mini_meta_cursor *cursor, while (TRUE) { if (cursor->meta_page == NULL) { if (cursor->meta_addr == 0) { - return FALSE; + return MINI_META_CURSOR_END; + } + // Non-blocking: if the meta page isn't resident, kick off a single-page + // prefetch and let the caller retry later. + cursor->meta_page = cache_get( + cursor->cc, cursor->meta_addr, FALSE, cursor->meta_type); + if (cursor->meta_page == NULL) { + cache_prefetch_page( + cursor->cc, cursor->meta_addr, cursor->meta_type); + return MINI_META_CURSOR_WOULD_BLOCK; } - cursor->meta_page = - cache_get(cursor->cc, cursor->meta_addr, TRUE, cursor->meta_type); cursor->num_entries = mini_num_entries(cursor->meta_page); cursor->entry_idx = 0; } @@ -954,7 +961,7 @@ mini_meta_cursor_next(mini_meta_cursor *cursor, *extent_addr = meta_entry_extent_addr(cursor->cc, entry); *batch = meta_entry_batch(entry); cursor->entry_idx++; - return TRUE; + return MINI_META_CURSOR_ENTRY; } // Exhausted this page; advance to the next one (if any). @@ -965,18 +972,22 @@ mini_meta_cursor_next(mini_meta_cursor *cursor, } } -bool32 +mini_meta_cursor_status mini_meta_cursor_seek_extent(mini_meta_cursor *cursor, uint64 target_extent_addr) { uint64 extent_addr; uint64 batch; - while (mini_meta_cursor_next(cursor, &extent_addr, &batch)) { + while (TRUE) { + mini_meta_cursor_status status = + mini_meta_cursor_next(cursor, &extent_addr, &batch); + if (status != MINI_META_CURSOR_ENTRY) { + return status; // END or WOULD_BLOCK + } if (extent_addr == target_extent_addr) { - return TRUE; + return MINI_META_CURSOR_ENTRY; } } - return FALSE; } static void diff --git a/src/mini_allocator.h b/src/mini_allocator.h index 33a02699..944a7e66 100644 --- a/src/mini_allocator.h +++ b/src/mini_allocator.h @@ -121,9 +121,10 @@ mini_prefetch(cache *cc, page_type type, uint64 meta_head); * ahead of itself for prefetching. 
* * The cursor holds a read reference on the meta page it is currently reading; - * call mini_meta_cursor_deinit() to release it. The cursor reads meta pages - * with blocking cache_get(); meta pages are tiny and become hot quickly, so a - * miss is rare, but callers on async paths should be aware it can block. + * call mini_meta_cursor_deinit() to release it. The cursor is non-blocking: it + * reads meta pages with a non-blocking cache_get() and, on a miss, issues a + * single-page prefetch and reports MINI_META_CURSOR_WOULD_BLOCK so the caller + * can do other work and retry later (the meta page lands shortly). */ typedef struct mini_meta_cursor { cache *cc; @@ -134,6 +135,13 @@ typedef struct mini_meta_cursor { uint64 num_entries; // number of entries on meta_page } mini_meta_cursor; +// Result of a non-blocking cursor step. +typedef enum mini_meta_cursor_status { + MINI_META_CURSOR_ENTRY, // produced an entry + MINI_META_CURSOR_END, // stream exhausted + MINI_META_CURSOR_WOULD_BLOCK, // next meta page not resident (prefetch issued) +} mini_meta_cursor_status; + void mini_meta_cursor_init(mini_meta_cursor *cursor, cache *cc, @@ -144,16 +152,18 @@ void mini_meta_cursor_deinit(mini_meta_cursor *cursor); // Emit the next extent entry (its extent address and originating batch) in -// allocation order. Returns FALSE once the stream is exhausted. -bool32 +// allocation order. Non-blocking: returns MINI_META_CURSOR_WOULD_BLOCK (and +// issues a prefetch for it) if the next meta page is not yet resident. +mini_meta_cursor_status mini_meta_cursor_next(mini_meta_cursor *cursor, uint64 *extent_addr, uint64 *batch); // Advance the cursor until it emits the entry for target_extent_addr, leaving -// the cursor positioned just after it. Returns FALSE if not found before the -// stream ends. -bool32 +// the cursor positioned just after it. 
Returns MINI_META_CURSOR_ENTRY if found, +// MINI_META_CURSOR_END if the stream ends first, or MINI_META_CURSOR_WOULD_BLOCK +// if a needed meta page is not yet resident. +mini_meta_cursor_status mini_meta_cursor_seek_extent(mini_meta_cursor *cursor, uint64 target_extent_addr); From a2557e64ecf04058f4e82dfb6009204202bd1b5e Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Thu, 25 Jun 2026 11:41:11 -0700 Subject: [PATCH 03/15] Add deep prefetch for compaction and bidirectional range scans Compaction (trunk_branch_merger): thread prefetch_budget from splinterdb_config through trunk_config into build_merge_itor, which divides the budget across branches (max(2, budget/extent_size/N)) and calls btree_iterator_set_prefetch_lookahead on each. Measured ~1.4x faster full-leaf optimize vs the notification-fixes baseline. Backward scan prefetch: add mini_meta_cursor_prev() which walks the doubly-linked meta-page list in reverse, keeping the current page alive during WOULD_BLOCK so prev_meta_addr stays accessible for retry (unlike the forward cursor which releases on miss). Add fill_backward, pump_backward, on_boundary_backward, and start_backward symmetric to the forward equivalents. btree_iterator_prefetch_on_advance now takes a going_forward parameter; on a direction change it restarts the cursor (resetting the ramp) so both directions get the same slow-start treatment. btree_iterator_prev_leaf/async now call prefetch_on_advance rather than killing the cursor. Measured ~40x improvement for backward scans (baseline had zero prefetch -- prev_leaf was calling btree_prefetch_cursor_deinit on every leaf step). Also add --optimize-only mode to scan_benchmark for measuring cold compaction throughput. Validated: unit_test btree 5/5, splinterdb_quick 33/33 (release + ASAN); backward scan returns correct tuple count; no ASAN errors. 
Co-Authored-By: Claude Sonnet 4.6 --- src/btree.c | 265 ++++++++++++++++++++++-------- src/btree.h | 16 +- src/mini_allocator.c | 43 +++++ src/mini_allocator.h | 11 ++ src/splinterdb.c | 1 + src/trunk.c | 42 ++++- src/trunk.h | 4 + tests/functional/scan_benchmark.c | 55 ++++++- tests/functional/test.h | 1 + 9 files changed, 348 insertions(+), 90 deletions(-) diff --git a/src/btree.c b/src/btree.c index c90ec4e1..57f391dd 100644 --- a/src/btree.c +++ b/src/btree.c @@ -2845,13 +2845,14 @@ btree_iterator_find_end_addr_async(btree_iterator_async_state *state, * ---------------------------------------------------------------------------- * btree_prefetch_cursor -- * - * Forward extent-prefetcher for a btree_iterator. It reads extent addresses - * ahead of the iterator from the branch's mini_allocator (via a - * mini_meta_cursor) and issues cache_prefetch for them, keeping ~lookahead - * leaf extents of IO in flight. Within the iterator's level (batch), the - * extents are in key order, so the cursor advances in lockstep with - * consumption. Internal-node extents are skipped; blob extents are - * prefetched for height-0 scans. See btree_prefetch_cursor in btree.h. + * Bidirectional extent-prefetcher for a btree_iterator. It reads extent + * addresses ahead of (or behind) the iterator from the branch's + * mini_allocator (via a mini_meta_cursor) and issues cache_prefetch for + * them, keeping ~lookahead leaf extents of IO in flight. Within the + * iterator's level (batch), the extents are in key order, so the cursor + * advances in lockstep with consumption. Internal-node extents are skipped; + * blob extents are prefetched for height-0 scans. See btree_prefetch_cursor + * in btree.h. * ---------------------------------------------------------------------------- */ static inline uint64 @@ -2936,7 +2937,8 @@ btree_prefetch_cursor_pump(btree_iterator *itor) return FALSE; } - // Positioned. Activate and prime the (ramped) lookahead window. + // Positioned. 
Activate and prime the lookahead window. Ramp up from RAMP_MIN + // to avoid wasting bandwidth on short scans. pf->state = BTREE_PREFETCH_ACTIVE; pf->at_end = FALSE; pf->prefetched_ahead = 0; @@ -2979,17 +2981,18 @@ btree_prefetch_cursor_start(btree_iterator *itor) { btree_prefetch_cursor *pf = &itor->prefetch; - // Reset any previously-active cursor (e.g. on a seek). + // Reset any previously-active cursor (e.g. on a direction change or seek). mini_meta_cursor_deinit(&pf->meta_cursor); pf->state = BTREE_PREFETCH_DISABLED; + pf->going_forward = TRUE; pf->at_end = FALSE; pf->prefetched_ahead = 0; pf->depth = BTREE_PREFETCH_RAMP_MIN; pf->leaf_batch = NUM_BLOB_BATCHES + itor->height; pf->prefetch_blobs = (itor->height == 0); - // Deep prefetch applies only to forward scans of finalized branches with a - // lookahead of 2+; everything else uses the legacy next_extent_addr path. + // Deep prefetch applies only to finalized branches with a lookahead of 2+; + // everything else uses the legacy next_extent_addr path. if (!itor->do_prefetch || pf->lookahead <= 1 || itor->page_type != PAGE_TYPE_BRANCH || itor->curr.page == NULL) { @@ -3000,6 +3003,135 @@ btree_prefetch_cursor_start(btree_iterator *itor) btree_prefetch_cursor_pump(itor); } +/* + * Issue prefetches backward until ~depth leaf extents are in flight, or the + * stream is exhausted. Non-blocking: stops if a meta page isn't resident yet + * (the WOULD_BLOCK from cursor_prev; a prefetch was already issued). 
+ */ +static void +btree_prefetch_cursor_fill_backward(btree_iterator *itor) +{ + btree_prefetch_cursor *pf = &itor->prefetch; + while (!pf->at_end && pf->prefetched_ahead < pf->depth) { + uint64 extent_addr; + uint64 batch; + mini_meta_cursor_status status = + mini_meta_cursor_prev(&pf->meta_cursor, &extent_addr, &batch); + if (status == MINI_META_CURSOR_WOULD_BLOCK) { + break; + } + if (status == MINI_META_CURSOR_END) { + pf->at_end = TRUE; + break; + } + if (batch == pf->leaf_batch) { + cache_prefetch(itor->cc, extent_addr, itor->page_type); + pf->prefetched_ahead++; + } else if (pf->prefetch_blobs && batch < NUM_BLOB_BATCHES) { + cache_prefetch(itor->cc, extent_addr, PAGE_TYPE_BLOB); + } + // else: internal-node extent -- skip. + } +} + +/* + * Try to position the (PRIMING) backward cursor at the iterator's current leaf. + * Re-anchors via a forward seek (from meta_page_addr to the current extent), + * then consumes the current extent with one cursor_prev so that subsequent + * fill_backward calls emit extents before the current one. Non-blocking: returns + * FALSE and stays PRIMING if any meta page is not yet resident. + */ +static bool32 +btree_prefetch_cursor_pump_backward(btree_iterator *itor) +{ + btree_prefetch_cursor *pf = &itor->prefetch; + + uint64 meta_page_addr = itor->curr.hdr->meta_page_addr; + if (meta_page_addr == 0) { + pf->state = BTREE_PREFETCH_DISABLED; + return FALSE; + } + + // Re-anchor: start a fresh forward seek from the current leaf's meta page. 
+ mini_meta_cursor_deinit(&pf->meta_cursor); + mini_meta_cursor_init( + &pf->meta_cursor, itor->cc, itor->page_type, meta_page_addr); + uint64 cur_extent = btree_extent_base_addr(itor->cc, itor->curr.addr); + mini_meta_cursor_status status = + mini_meta_cursor_seek_extent(&pf->meta_cursor, cur_extent); + if (status == MINI_META_CURSOR_WOULD_BLOCK) { + return FALSE; + } + if (status != MINI_META_CURSOR_ENTRY) { + mini_meta_cursor_deinit(&pf->meta_cursor); + pf->state = BTREE_PREFETCH_DISABLED; + return FALSE; + } + + // Cursor is now just past cur_extent. Wind back one entry to consume + // cur_extent: after seek, entry_idx >= 1, so cursor_prev always succeeds. + uint64 ignored_addr, ignored_batch; + status = mini_meta_cursor_prev(&pf->meta_cursor, &ignored_addr, &ignored_batch); + platform_assert(status == MINI_META_CURSOR_ENTRY); + + // Activated. Prime the backward lookahead window. + pf->state = BTREE_PREFETCH_ACTIVE; + pf->at_end = FALSE; + pf->prefetched_ahead = 0; + pf->depth = BTREE_PREFETCH_RAMP_MIN; + btree_prefetch_cursor_fill_backward(itor); + return TRUE; +} + +/* + * Called when the iterator crosses backward into a new leaf extent while the + * cursor is ACTIVE: account for the consumed extent, ramp the depth, refill. + */ +static void +btree_prefetch_cursor_on_boundary_backward(btree_iterator *itor) +{ + btree_prefetch_cursor *pf = &itor->prefetch; + if (pf->prefetched_ahead > 0) { + pf->prefetched_ahead--; + } + if (pf->depth < pf->lookahead) { + pf->depth *= 2; + if (pf->depth > pf->lookahead) { + pf->depth = pf->lookahead; + } + } + btree_prefetch_cursor_fill_backward(itor); +} + +/* + * (Re)start backward deep prefetch at the iterator's current leaf. Non-blocking: + * kicks off the meta-page IO and leaves the cursor PRIMING (to be completed + * lazily as the scan advances) unless the meta page is already resident. 
+ */ +static void +btree_prefetch_cursor_start_backward(btree_iterator *itor) +{ + btree_prefetch_cursor *pf = &itor->prefetch; + + mini_meta_cursor_deinit(&pf->meta_cursor); + pf->state = BTREE_PREFETCH_DISABLED; + pf->going_forward = FALSE; + pf->at_end = FALSE; + pf->prefetched_ahead = 0; + pf->depth = BTREE_PREFETCH_RAMP_MIN; + pf->leaf_batch = NUM_BLOB_BATCHES + itor->height; + pf->prefetch_blobs = (itor->height == 0); + + if (!itor->do_prefetch || pf->lookahead <= 1 + || itor->page_type != PAGE_TYPE_BRANCH || itor->curr.page == NULL) + { + return; + } + + pf->state = BTREE_PREFETCH_PRIMING; + btree_prefetch_cursor_pump_backward(itor); +} + /* Release the cursor's resources and turn it off. */ static void btree_prefetch_cursor_deinit(btree_iterator *itor) @@ -3010,39 +3142,59 @@ btree_prefetch_cursor_deinit(btree_iterator *itor) } /* - * Drive prefetching after the iterator advances one leaf (forward). Pumps the - * non-blocking prime while still PRIMING, then, on an extent-boundary crossing, - * either refills the deep window (ACTIVE) or issues the legacy single-extent - * prefetch (PRIMING/DISABLED). Safe to call on every leaf step; the meta page - * (whose IO was kicked off at init) typically lands during the first extent. + * Drive prefetching after the iterator advances one leaf, in either direction. + * On a direction change, restarts the cursor in the new direction (resetting + * the ramp). Otherwise pumps the non-blocking prime while PRIMING, then, on an + * extent-boundary crossing, refills the deep window (ACTIVE) or issues the + * legacy single-extent prefetch (forward only; no prev_extent_addr in headers). 
*/ static void -btree_iterator_prefetch_on_advance(btree_iterator *itor, uint64 last_addr) +btree_iterator_prefetch_on_advance(btree_iterator *itor, + uint64 last_addr, + bool32 going_forward) { cache *cc = itor->cc; btree_prefetch_cursor *pf = &itor->prefetch; - bool32 positioned_now = FALSE; - if (pf->state == BTREE_PREFETCH_PRIMING) { - positioned_now = btree_prefetch_cursor_pump(itor); - } - - // Only act on prefetching when we just moved from one extent to the next. - if (btree_addrs_share_extent(cc, last_addr, itor->curr.addr)) { + // Direction change: restart cursor in the new direction, resetting ramp. + if (pf->state != BTREE_PREFETCH_DISABLED && pf->going_forward != going_forward) { + if (going_forward) { + btree_prefetch_cursor_start(itor); + } else { + btree_prefetch_cursor_start_backward(itor); + } return; } - if (pf->state == BTREE_PREFETCH_ACTIVE) { - // If the pump positioned us this round it already anchored at the current - // extent and filled, so don't also advance the window. - if (!positioned_now) { - btree_prefetch_cursor_on_boundary(itor); + if (going_forward) { + bool32 positioned_now = FALSE; + if (pf->state == BTREE_PREFETCH_PRIMING) { + positioned_now = btree_prefetch_cursor_pump(itor); } - } else if (itor->do_prefetch && itor->curr.hdr->next_extent_addr != 0 - && !btree_addrs_share_extent(cc, itor->curr.addr, itor->end_addr)) - { - // Legacy single-extent-ahead prefetch (this can't be the last extent). 
- cache_prefetch(cc, itor->curr.hdr->next_extent_addr, itor->page_type); + if (btree_addrs_share_extent(cc, last_addr, itor->curr.addr)) { + return; + } + if (pf->state == BTREE_PREFETCH_ACTIVE) { + if (!positioned_now) { + btree_prefetch_cursor_on_boundary(itor); + } + } else if (itor->do_prefetch && itor->curr.hdr->next_extent_addr != 0 + && !btree_addrs_share_extent(cc, itor->curr.addr, itor->end_addr)) + { + cache_prefetch(cc, itor->curr.hdr->next_extent_addr, itor->page_type); + } + } else { + bool32 positioned_now = FALSE; + if (pf->state == BTREE_PREFETCH_PRIMING) { + positioned_now = btree_prefetch_cursor_pump_backward(itor); + } + if (btree_addrs_share_extent(cc, last_addr, itor->curr.addr)) { + return; + } + if (pf->state == BTREE_PREFETCH_ACTIVE && !positioned_now) { + btree_prefetch_cursor_on_boundary_backward(itor); + } + // No legacy backward prefetch: leaf headers have no prev_extent_addr. } } @@ -3062,7 +3214,7 @@ btree_iterator_next_leaf(btree_iterator *itor) itor->idx = 0; itor->curr_min_idx = -1; - btree_iterator_prefetch_on_advance(itor, last_addr); + btree_iterator_prefetch_on_advance(itor, last_addr, TRUE); } static async_status @@ -3093,7 +3245,7 @@ btree_iterator_next_leaf_async(btree_iterator_async_state *state, uint64 depth) // Prefetching (pump + boundary refill / legacy) is all non-blocking, so it // needs no awaits here. - btree_iterator_prefetch_on_advance(state->itor, state->last_addr); + btree_iterator_prefetch_on_advance(state->itor, state->last_addr, TRUE); async_return(state); } @@ -3108,10 +3260,7 @@ btree_iterator_prev_leaf(btree_iterator *itor) { const btree_config *cfg = itor->cfg; - // We don't prefetch backward; turn off any forward prefetch cursor. - btree_prefetch_cursor_deinit(itor); - - debug_only uint64 curr_addr = itor->curr.addr; + uint64 last_addr = itor->curr.addr; /* * Copied nodes can have stale prev_addr values. Read the live current node * before moving backward so predecessor splits are not skipped. 
@@ -3125,7 +3274,7 @@ btree_iterator_prev_leaf(btree_iterator *itor) * old curr node and the new one. In this case, we can just walk * forward until we find the leaf whose successor is our old leaf. */ - while (itor->curr.hdr->next_addr != curr_addr) { + while (itor->curr.hdr->next_addr != last_addr) { uint64 next_addr = itor->curr.hdr->next_addr; btree_iterator_release_curr(itor); btree_iterator_get_curr_addr(itor, next_addr); @@ -3146,20 +3295,7 @@ btree_iterator_prev_leaf(btree_iterator *itor) itor->curr_min_idx = 0; } - // FIXME: To prefetch: - // 1. we just moved from one extent to the next - // 2. this can't be the last extent - /* if (itor->do_prefetch */ - /* && !btree_addrs_share_extent(cc, last_addr, itor->curr.addr) */ - /* && itor->curr.hdr->next_extent_addr != 0 */ - /* && !btree_addrs_share_extent(cc, itor->curr.addr, itor->end_addr)) - */ - /* { */ - /* // IO prefetch the next extent */ - /* cache_prefetch(cc, itor->curr.hdr->next_extent_addr, - * itor->page_type); - */ - /* } */ + btree_iterator_prefetch_on_advance(itor, last_addr, FALSE); } static async_status @@ -3167,9 +3303,6 @@ btree_iterator_prev_leaf_async(btree_iterator_async_state *state, uint64 depth) { async_begin(state, depth); - // We don't prefetch backward; turn off any forward prefetch cursor. - btree_prefetch_cursor_deinit(state->itor); - state->curr_addr = state->itor->curr.addr; if (btree_iterator_curr_is_copy(state->itor)) { state->live_curr.addr = state->curr_addr; @@ -3254,20 +3387,8 @@ btree_iterator_prev_leaf_async(btree_iterator_async_state *state, uint64 depth) state->itor->curr_min_idx = 0; } - // FIXME: To prefetch: - // 1. we just moved from one extent to the next - // 2. 
this can't be the last extent - /* if (itor->do_prefetch */ - /* && !btree_addrs_share_extent(cc, last_addr, itor->curr.addr) */ - /* && itor->curr.hdr->next_extent_addr != 0 */ - /* && !btree_addrs_share_extent(cc, itor->curr.addr, itor->end_addr)) - */ - /* { */ - /* // IO prefetch the next extent */ - /* cache_prefetch(cc, itor->curr.hdr->next_extent_addr, - * itor->page_type); - */ - /* } */ + // Prefetching is non-blocking, so no awaits needed here. + btree_iterator_prefetch_on_advance(state->itor, state->curr_addr, FALSE); async_return(state); } diff --git a/src/btree.h b/src/btree.h index 6456bf07..fef42257 100644 --- a/src/btree.h +++ b/src/btree.h @@ -131,12 +131,12 @@ typedef struct ONDISK btree_pivot_data { } btree_pivot_data; /* - * Drives extent prefetching for a forward btree_iterator. Reads extent - * addresses ahead of the iterator from the branch's mini_allocator (via a - * mini_meta_cursor, exploiting that extents within a batch are in key order) - * and issues cache_prefetch for them, keeping up to ~depth leaf extents of IO in - * flight. Internal-node extents are skipped; blob extents are prefetched (for - * height-0 scans). Forward-only; disabled on backward moves. + * Drives extent prefetching for a btree_iterator in either direction. Reads + * extent addresses ahead of (or behind) the iterator from the branch's + * mini_allocator (via a mini_meta_cursor, exploiting that extents within a batch + * are in key order) and issues cache_prefetch for them, keeping up to ~depth leaf + * extents of IO in flight. Internal-node extents are skipped; blob extents are + * prefetched (for height-0 scans). 
* * Priming is non-blocking: the cursor's meta page is fetched lazily (PRIMING * state) so the iterator's async init never waits on it and the first tuple is @@ -145,6 +145,8 @@ typedef struct ONDISK btree_pivot_data { * * Depth ramps up (slow-start) from BTREE_PREFETCH_RAMP_MIN toward `lookahead` as * the scan proves long, so short scans don't waste bandwidth reading far ahead. + * On a direction change the ramp resets so both forward and backward scans get + * the same slow-start treatment. */ typedef enum btree_prefetch_state { BTREE_PREFETCH_DISABLED = 0, // legacy next_extent_addr path / not applicable @@ -158,6 +160,7 @@ typedef enum btree_prefetch_state { typedef struct btree_prefetch_cursor { btree_prefetch_state state; bool32 at_end; // prefetched through the last in-range extent + bool32 going_forward; // current scan direction; reset resets ramp uint32 lookahead; // K: max leaf extents in flight (the cap) uint32 depth; // current ramp-up depth (<= lookahead) uint64 leaf_batch; // mini batch of this iterator's level @@ -390,7 +393,6 @@ DEFINE_ASYNC_STATE(btree_iterator_async_state, 5, local, key, target, local, comparison, position_rule, local, bool32, found, - local, bool32, forward, local, int64, tmp, local, uint64, curr_addr, local, uint64, last_addr, diff --git a/src/mini_allocator.c b/src/mini_allocator.c index 3d584be4..e5771082 100644 --- a/src/mini_allocator.c +++ b/src/mini_allocator.c @@ -990,6 +990,49 @@ mini_meta_cursor_seek_extent(mini_meta_cursor *cursor, } } +mini_meta_cursor_status +mini_meta_cursor_prev(mini_meta_cursor *cursor, + uint64 *extent_addr, + uint64 *batch) +{ + while (TRUE) { + if (cursor->meta_page == NULL) { + return MINI_META_CURSOR_END; + } + + if (cursor->entry_idx > 0) { + cursor->entry_idx--; + meta_entry *entry = first_entry(cursor->meta_page) + cursor->entry_idx; + *extent_addr = meta_entry_extent_addr(cursor->cc, entry); + *batch = meta_entry_batch(entry); + return MINI_META_CURSOR_ENTRY; + } + + // entry_idx == 0: 
exhausted this page going backward. + mini_meta_hdr *hdr = (mini_meta_hdr *)cursor->meta_page->data; + uint64 prev_addr = hdr->prev_meta_addr; + if (prev_addr == 0) { + return MINI_META_CURSOR_END; + } + + // Non-blocking: keep the current page alive so prev_meta_addr remains + // accessible on a WOULD_BLOCK retry — do NOT release before the load. + page_handle *prev_page = + cache_get(cursor->cc, prev_addr, FALSE, cursor->meta_type); + if (prev_page == NULL) { + cache_prefetch_page(cursor->cc, prev_addr, cursor->meta_type); + return MINI_META_CURSOR_WOULD_BLOCK; + } + + cache_unget(cursor->cc, cursor->meta_page); + cursor->meta_page = prev_page; + cursor->meta_addr = prev_addr; + cursor->num_entries = mini_num_entries(cursor->meta_page); + cursor->entry_idx = cursor->num_entries; + // Loop: entry_idx == num_entries > 0, will decrement and read. + } +} + static void space_use_add_extent(cache *cc, page_type type, uint64 extent_addr, void *out) { diff --git a/src/mini_allocator.h b/src/mini_allocator.h index 944a7e66..fb15784d 100644 --- a/src/mini_allocator.h +++ b/src/mini_allocator.h @@ -167,6 +167,17 @@ mini_meta_cursor_status mini_meta_cursor_seek_extent(mini_meta_cursor *cursor, uint64 target_extent_addr); +// Emit the previous extent entry (reverse allocation order). The cursor must +// have been positioned by mini_meta_cursor_seek_extent() or a prior call to +// mini_meta_cursor_prev() — calling on a freshly-initialized cursor returns END. +// Non-blocking: if the previous meta page isn't resident, issues a single-page +// prefetch and returns MINI_META_CURSOR_WOULD_BLOCK; the current page is kept +// alive so the retry can follow prev_meta_addr without re-reading. 
+mini_meta_cursor_status +mini_meta_cursor_prev(mini_meta_cursor *cursor, + uint64 *extent_addr, + uint64 *batch); + /* Return total bytes allocated by the mini_allocator, including space used by * the mini_allocator itself.*/ uint64 diff --git a/src/splinterdb.c b/src/splinterdb.c index d4074bc1..86868aa7 100644 --- a/src/splinterdb.c +++ b/src/splinterdb.c @@ -284,6 +284,7 @@ splinterdb_init_config(const splinterdb_config *kvs_cfg, // IN cfg.memtable_capacity, cfg.fanout, cfg.btree_rough_count_height, + cfg.prefetch_budget, cfg.use_stats); rc = core_config_init(&kvs->trunk_cfg, diff --git a/src/trunk.c b/src/trunk.c index d92e5200..f1f98fe6 100644 --- a/src/trunk.c +++ b/src/trunk.c @@ -2308,16 +2308,20 @@ static void trunk_branch_merger_init(trunk_branch_merger *merger, platform_heap_id hid, const data_config *data_cfg, + cache *cc, + uint64 prefetch_budget, key min_key, key max_key, uint64 height) { - merger->hid = hid; - merger->data_cfg = data_cfg; - merger->min_key = min_key; - merger->max_key = max_key; - merger->height = height; - merger->merge_itor = NULL; + merger->hid = hid; + merger->data_cfg = data_cfg; + merger->cc = cc; + merger->prefetch_budget = prefetch_budget; + merger->min_key = min_key; + merger->max_key = max_key; + merger->height = height; + merger->merge_itor = NULL; vector_init(&merger->itors, hid); } @@ -2441,9 +2445,27 @@ trunk_branch_merger_build_merge_itor(trunk_branch_merger *merger, { platform_assert(merger->merge_itor == NULL); + // A compaction/leaf-split merge reads each input branch end to end, so give + // the branches deep prefetch from the shared budget (Little's law: split the + // read-ahead budget across the branches being merged). Min 2 mirrors the + // range-scan path (see core_prefetch_lookahead). 
+ uint64 num_branches = vector_length(&merger->itors); + if (num_branches > 0 && merger->prefetch_budget > 0 && merger->cc != NULL) { + uint64 budget_extents = + merger->prefetch_budget / cache_extent_size(merger->cc); + uint64 lookahead = budget_extents / num_branches; + if (lookahead < 2) { + lookahead = 2; + } + for (uint64 i = 0; i < num_branches; i++) { + btree_iterator *itor = (btree_iterator *)vector_get(&merger->itors, i); + btree_iterator_set_prefetch_lookahead(itor, (uint32)lookahead); + } + } + return merge_iterator_create(merger->hid, merger->data_cfg, - vector_length(&merger->itors), + num_branches, vector_data(&merger->itors), merge_mode, TRUE, @@ -3944,6 +3966,8 @@ bundle_compaction_task(task *arg) trunk_branch_merger_init(&merger, PROCESS_PRIVATE_HEAP_ID, context->cfg->data_cfg, + context->cc, + context->cfg->prefetch_budget, key_buffer_key(&state->key), key_buffer_key(&state->ubkey), 0); @@ -4549,6 +4573,8 @@ leaf_split_select_pivots(trunk_context *context, trunk_branch_merger_init(&merger, PROCESS_PRIVATE_HEAP_ID, context->cfg->data_cfg, + context->cc, + context->cfg->prefetch_budget, min_key, max_key, context->cfg->branch_rough_count_height); @@ -6625,6 +6651,7 @@ trunk_config_init(trunk_config *config, uint64 incorporation_size_kv_bytes, uint64 target_fanout, uint64 branch_rough_count_height, + uint64 prefetch_budget, bool32 use_stats) { config->data_cfg = data_cfg; @@ -6633,6 +6660,7 @@ trunk_config_init(trunk_config *config, config->incorporation_size_kv_bytes = incorporation_size_kv_bytes; config->target_fanout = target_fanout; config->branch_rough_count_height = branch_rough_count_height; + config->prefetch_budget = prefetch_budget; config->use_stats = use_stats; } diff --git a/src/trunk.h b/src/trunk.h index c4c82956..ffa65c69 100644 --- a/src/trunk.h +++ b/src/trunk.h @@ -26,6 +26,7 @@ typedef struct trunk_config { uint64 incorporation_size_kv_bytes; uint64 target_fanout; uint64 branch_rough_count_height; + uint64 prefetch_budget; // bytes 
of read-ahead per merge (~BDP) bool32 use_stats; } trunk_config; @@ -183,6 +184,8 @@ typedef struct trunk_ondisk_node_handle { typedef struct trunk_branch_merger { platform_heap_id hid; const data_config *data_cfg; + cache *cc; // for deep-prefetch budget sizing + uint64 prefetch_budget; // bytes of read-ahead across the merge key min_key; key max_key; uint64 height; @@ -202,6 +205,7 @@ trunk_config_init(trunk_config *config, uint64 incorporation_size_kv_bytes, uint64 target_fanout, uint64 branch_rough_count_height, + uint64 prefetch_budget, bool32 use_stats); platform_status diff --git a/tests/functional/scan_benchmark.c b/tests/functional/scan_benchmark.c index 94554856..ab78531f 100644 --- a/tests/functional/scan_benchmark.c +++ b/tests/functional/scan_benchmark.c @@ -18,6 +18,7 @@ typedef enum scan_benchmark_mode { SCAN_BENCHMARK_LOAD_AND_SCAN, SCAN_BENCHMARK_INIT_ONLY, SCAN_BENCHMARK_SCAN_ONLY, + SCAN_BENCHMARK_OPTIMIZE_ONLY, } scan_benchmark_mode; typedef struct scan_benchmark_options { @@ -117,6 +118,10 @@ scan_benchmark_parse_args(int argc, return STATUS_BAD_PARAM; } options->mode = SCAN_BENCHMARK_SCAN_ONLY; + } else if (STRING_EQUALS_LITERAL(argv[i], "--optimize-only")) { + // Open an existing DB (cold cache) and time a blocking full-leaf + // optimize -- a compaction-throughput benchmark. + options->mode = SCAN_BENCHMARK_OPTIMIZE_ONLY; } else if (STRING_EQUALS_LITERAL(argv[i], "--random-load-order")) { options->random_load_order = TRUE; } else if (STRING_EQUALS_LITERAL(argv[i], "--splinter-random-keys")) { @@ -488,6 +493,42 @@ scan_benchmark_load_database(const splinterdb_config *cfg, return rc; } +/* + * Compaction-throughput benchmark: open an existing DB (cold cache) and time a + * single blocking full-leaf optimize over the whole key range. With data > cache + * and O_DIRECT, the branch reads done by compaction's merge iterators are cold, + * so this exercises the btree-iterator prefetch path under compaction. 
+ */ +static int +scan_benchmark_run_optimize(const splinterdb_config *cfg) +{ + splinterdb *kvs = NULL; + int rc = splinterdb_open(cfg, &kvs); + if (rc != 0) { + return rc; + } + + io_reset_stats((io_handle *)splinterdb_get_io_handle(kvs)); + + splinterdb_notification notification; + splinterdb_notification_init_blocking(&notification); + + platform_default_log("scan_benchmark: running blocking full-leaf optimize\n"); + timestamp start_time = platform_get_timestamp(); + rc = splinterdb_optimize(kvs, NULL_SLICE, NULL_SLICE, TRUE, &notification); + uint64 elapsed_ns = platform_timestamp_elapsed(start_time); + splinterdb_notification_deinit(&notification); + + platform_default_log("optimize complete: rc=%d, %.3fs elapsed\n", + rc, + (double)elapsed_ns / BILLION); + io_print_stats((io_handle *)splinterdb_get_io_handle(kvs), + Platform_default_log_handle); + + splinterdb_close(&kvs); + return rc; +} + static int scan_benchmark_run_scan(const splinterdb_config *cfg, bool print_lookup_stats, @@ -783,7 +824,9 @@ scan_benchmark(int argc, char *argv[]) goto out; } - if (options.mode != SCAN_BENCHMARK_SCAN_ONLY && master_cfg.num_inserts == 0) + if (options.mode != SCAN_BENCHMARK_SCAN_ONLY + && options.mode != SCAN_BENCHMARK_OPTIMIZE_ONLY + && master_cfg.num_inserts == 0) { platform_error_log( "scan_benchmark: --num-inserts must be set for load modes\n"); @@ -819,7 +862,9 @@ scan_benchmark(int argc, char *argv[]) options.backwards_scan, master_cfg.seed); - if (options.mode != SCAN_BENCHMARK_SCAN_ONLY) { + if (options.mode == SCAN_BENCHMARK_LOAD_AND_SCAN + || options.mode == SCAN_BENCHMARK_INIT_ONLY) + { scan_benchmark_make_config(&master_cfg, &default_data_cfg, &cfg, FALSE); rc = scan_benchmark_load_database(&cfg, master_cfg.num_inserts, @@ -833,8 +878,10 @@ scan_benchmark(int argc, char *argv[]) } scan_benchmark_make_config(&master_cfg, &default_data_cfg, &cfg, TRUE); - if (options.scan_count == 1 && options.scan_length == 0 - && !options.random_scan_starts) + if (options.mode == 
SCAN_BENCHMARK_OPTIMIZE_ONLY) { + rc = scan_benchmark_run_optimize(&cfg); + } else if (options.scan_count == 1 && options.scan_length == 0 + && !options.random_scan_starts) { rc = scan_benchmark_run_scan(&cfg, master_cfg.use_stats, diff --git a/tests/functional/test.h b/tests/functional/test.h index 2cb2de05..82ab07ce 100644 --- a/tests/functional/test.h +++ b/tests/functional/test.h @@ -296,6 +296,7 @@ test_config_init(system_config *system_cfg, // OUT master_cfg->memtable_capacity, master_cfg->fanout, master_cfg->btree_rough_count_height, + master_cfg->prefetch_budget, master_cfg->use_stats); rc = core_config_init(&system_cfg->splinter_cfg, From 90ae80b7cbb19ac8dcfd71d7f8c735dcb84ec623 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Thu, 25 Jun 2026 16:47:37 -0700 Subject: [PATCH 04/15] style checks Signed-off-by: Rob Johnson --- include/splinterdb/splinterdb.h | 9 +- src/btree.c | 274 ++++++++---------------------- src/btree.h | 30 ++-- src/btree_private.h | 12 +- src/clockcache.c | 84 ++++----- src/core.c | 47 +---- src/core.h | 6 +- src/mini_allocator.c | 68 ++++++-- src/mini_allocator.h | 31 ++-- src/task.c | 13 +- src/task.h | 2 + src/trunk.c | 39 ++--- src/trunk.h | 4 +- tests/functional/btree_test.c | 3 +- tests/functional/scan_benchmark.c | 9 +- 15 files changed, 259 insertions(+), 372 deletions(-) diff --git a/include/splinterdb/splinterdb.h b/include/splinterdb/splinterdb.h index cd0f5a18..abb36455 100644 --- a/include/splinterdb/splinterdb.h +++ b/include/splinterdb/splinterdb.h @@ -147,10 +147,11 @@ typedef struct splinterdb_config { // latencies. uint64 queue_scale_percent; - // Total bytes of extent read-ahead a range scan keeps in flight, divided - // across the branches it merges. Roughly the storage's bandwidth-delay - // product (bandwidth x latency); raise it for higher-latency devices such - // as networked/cloud volumes. Zero selects a default suited to local SSDs. 
+ // Soft byte budget for extent read-ahead in range scans and compactions, + // divided across the branches being merged. Roughly the storage's + // bandwidth-delay product (bandwidth x latency); raise it for higher-latency + // devices such as networked/cloud volumes. Zero selects a default suited to + // local SSDs. uint64 prefetch_budget; } splinterdb_config; diff --git a/src/btree.c b/src/btree.c index 57f391dd..87e25831 100644 --- a/src/btree.c +++ b/src/btree.c @@ -2862,13 +2862,25 @@ btree_extent_base_addr(cache *cc, uint64 addr) return allocator_config_extent_base_addr(allocator_get_config(al), addr); } +static inline mini_meta_cursor_status +btree_prefetch_cursor_step(btree_prefetch_cursor *pf, + uint64 *extent_addr, + uint64 *batch) +{ + if (pf->going_forward) { + return mini_meta_cursor_next(&pf->meta_cursor, extent_addr, batch); + } + return mini_meta_cursor_prev(&pf->meta_cursor, extent_addr, batch); +} + /* - * Issue prefetches until ~depth leaf extents are in flight, or we reach the - * iterator's end extent or the end of the extent stream. Leaf extents count - * toward the depth; blob extents in the window are prefetched but not counted; - * internal-node extents are skipped. Non-blocking: if a meta page needed to - * read further ahead isn't resident yet, fill stops early (a single-page - * prefetch for it was issued) and resumes on a later boundary. + * Issue prefetches until ~depth leaf extents are in flight, or the stream is + * exhausted. Leaf extents count toward the depth; blob extents in the window + * are prefetched but not counted; internal-node extents are skipped. Forward + * fill stops at end_addr. Backward fill stops at the beginning of the extent + * stream. Non-blocking: if a meta page needed to read further ahead isn't + * resident yet, fill stops early (a single-page prefetch was issued) and + * resumes later. 
*/ static void btree_prefetch_cursor_fill(btree_iterator *itor) @@ -2878,9 +2890,9 @@ btree_prefetch_cursor_fill(btree_iterator *itor) uint64 extent_addr; uint64 batch; mini_meta_cursor_status status = - mini_meta_cursor_next(&pf->meta_cursor, &extent_addr, &batch); + btree_prefetch_cursor_step(pf, &extent_addr, &batch); if (status == MINI_META_CURSOR_WOULD_BLOCK) { - break; // meta page not resident; prefetch issued, retry next boundary. + break; } if (status == MINI_META_CURSOR_END) { pf->at_end = TRUE; @@ -2889,24 +2901,24 @@ btree_prefetch_cursor_fill(btree_iterator *itor) if (batch == pf->leaf_batch) { cache_prefetch(itor->cc, extent_addr, itor->page_type); pf->prefetched_ahead++; - // Never prefetch past the extent that contains end_addr. - if (btree_addrs_share_extent(itor->cc, extent_addr, itor->end_addr)) { + if (pf->going_forward + && btree_addrs_share_extent(itor->cc, extent_addr, itor->end_addr)) + { pf->at_end = TRUE; } } else if (pf->prefetch_blobs && batch < NUM_BLOB_BATCHES) { cache_prefetch(itor->cc, extent_addr, PAGE_TYPE_BLOB); } - // else: internal-node extent (batch > leaf_batch) -- skip. } } /* * Try to position the (PRIMING) cursor at the iterator's current leaf extent. - * Non-blocking: kicks off the meta-page IO (via mini_meta_cursor) and, if the - * page isn't resident yet, leaves the cursor PRIMING to be retried later. Reads - * the *current* leaf's meta_page_addr each call, so it positions correctly even - * if the iterator advanced across extents while priming. Returns TRUE iff the - * cursor just became ACTIVE (positioned and initial fill done). + * Non-blocking: kicks off meta-page IO and leaves the cursor PRIMING when the + * page is not resident yet. Reads the current leaf's meta_page_addr every call, + * so it positions correctly even if the iterator advanced while priming. For + * backward scans, consumes the current extent after seeking so fill starts with + * the previous extent. Returns TRUE iff the cursor just became ACTIVE. 
*/ static bool32 btree_prefetch_cursor_pump(btree_iterator *itor) @@ -2920,7 +2932,6 @@ btree_prefetch_cursor_pump(btree_iterator *itor) return FALSE; } - // (Re)anchor the meta cursor at the current leaf's meta page. mini_meta_cursor_deinit(&pf->meta_cursor); mini_meta_cursor_init( &pf->meta_cursor, itor->cc, itor->page_type, meta_page_addr); @@ -2928,17 +2939,22 @@ btree_prefetch_cursor_pump(btree_iterator *itor) mini_meta_cursor_status status = mini_meta_cursor_seek_extent(&pf->meta_cursor, cur_extent); if (status == MINI_META_CURSOR_WOULD_BLOCK) { - return FALSE; // still priming; meta-page prefetch issued by the seek. + return FALSE; } if (status != MINI_META_CURSOR_ENTRY) { - // Couldn't locate our extent in this meta page: fall back to legacy. + // Couldn't locate our extent from this stamp; fall back to legacy. mini_meta_cursor_deinit(&pf->meta_cursor); pf->state = BTREE_PREFETCH_DISABLED; return FALSE; } - // Positioned. Activate and prime the lookahead window. Ramp up from RAMP_MIN - // to avoid wasting bandwidth on short scans. + if (!pf->going_forward) { + uint64 ignored_addr, ignored_batch; + status = + mini_meta_cursor_prev(&pf->meta_cursor, &ignored_addr, &ignored_batch); + platform_assert(status == MINI_META_CURSOR_ENTRY); + } + pf->state = BTREE_PREFETCH_ACTIVE; pf->at_end = FALSE; pf->prefetched_ahead = 0; @@ -2948,9 +2964,9 @@ btree_prefetch_cursor_pump(btree_iterator *itor) } /* - * Called when the iterator crosses forward into a new leaf extent while the - * cursor is ACTIVE: account for the consumed extent, ramp the depth up toward - * the configured cap, and refill the lookahead window. + * Called when the iterator crosses into a new leaf extent while the cursor is + * ACTIVE: account for the consumed extent, ramp the depth toward the configured + * cap, and refill the lookahead window. 
*/ static void btree_prefetch_cursor_on_boundary(btree_iterator *itor) @@ -2959,7 +2975,6 @@ btree_prefetch_cursor_on_boundary(btree_iterator *itor) if (pf->prefetched_ahead > 0) { pf->prefetched_ahead--; } - // Ramp up (slow-start): the scan has proven longer, so read further ahead. if (pf->depth < pf->lookahead) { pf->depth *= 2; if (pf->depth > pf->lookahead) { @@ -2970,29 +2985,25 @@ btree_prefetch_cursor_on_boundary(btree_iterator *itor) } /* - * (Re)start deep prefetch at the iterator's current leaf (used at init and on - * seek). Non-blocking: kicks off the meta-page IO and leaves the cursor PRIMING - * (to be completed lazily as the scan advances) unless the meta page is already - * resident, in which case it becomes ACTIVE immediately. Falls back to DISABLED - * (legacy single-extent prefetch) when deep prefetch does not apply. + * (Re)start deep prefetch at the iterator's current leaf. Non-blocking: kicks + * off meta-page IO and leaves the cursor PRIMING unless the meta page is + * already resident. Falls back to DISABLED (legacy single-extent prefetch) when + * deep prefetch does not apply. */ static void -btree_prefetch_cursor_start(btree_iterator *itor) +btree_prefetch_cursor_start(btree_iterator *itor, bool32 going_forward) { btree_prefetch_cursor *pf = &itor->prefetch; - // Reset any previously-active cursor (e.g. on a direction change or seek). mini_meta_cursor_deinit(&pf->meta_cursor); pf->state = BTREE_PREFETCH_DISABLED; - pf->going_forward = TRUE; + pf->going_forward = going_forward; pf->at_end = FALSE; pf->prefetched_ahead = 0; pf->depth = BTREE_PREFETCH_RAMP_MIN; pf->leaf_batch = NUM_BLOB_BATCHES + itor->height; pf->prefetch_blobs = (itor->height == 0); - // Deep prefetch applies only to finalized branches with a lookahead of 2+; - // everything else uses the legacy next_extent_addr path. 
if (!itor->do_prefetch || pf->lookahead <= 1 || itor->page_type != PAGE_TYPE_BRANCH || itor->curr.page == NULL) { @@ -3003,135 +3014,6 @@ btree_prefetch_cursor_start(btree_iterator *itor) btree_prefetch_cursor_pump(itor); } -/* - * Issue prefetches backward until ~depth leaf extents are in flight, or the - * stream is exhausted. Non-blocking: stops if a meta page isn't resident yet - * (the WOULD_BLOCK from cursor_prev; a prefetch was already issued). - */ -static void -btree_prefetch_cursor_fill_backward(btree_iterator *itor) -{ - btree_prefetch_cursor *pf = &itor->prefetch; - while (!pf->at_end && pf->prefetched_ahead < pf->depth) { - uint64 extent_addr; - uint64 batch; - mini_meta_cursor_status status = - mini_meta_cursor_prev(&pf->meta_cursor, &extent_addr, &batch); - if (status == MINI_META_CURSOR_WOULD_BLOCK) { - break; - } - if (status == MINI_META_CURSOR_END) { - pf->at_end = TRUE; - break; - } - if (batch == pf->leaf_batch) { - cache_prefetch(itor->cc, extent_addr, itor->page_type); - pf->prefetched_ahead++; - } else if (pf->prefetch_blobs && batch < NUM_BLOB_BATCHES) { - cache_prefetch(itor->cc, extent_addr, PAGE_TYPE_BLOB); - } - // else: internal-node extent -- skip. - } -} - -/* - * Try to position the (PRIMING) backward cursor at the iterator's current leaf. - * Re-anchors via a forward seek (from meta_page_addr to the current extent), - * then consumes the current extent with one cursor_prev so that subsequent - * fill_backward calls emit extents before the current one. Non-blocking: returns - * FALSE and stays PRIMING if any meta page is not yet resident. - */ -static bool32 -btree_prefetch_cursor_pump_backward(btree_iterator *itor) -{ - btree_prefetch_cursor *pf = &itor->prefetch; - - uint64 meta_page_addr = itor->curr.hdr->meta_page_addr; - if (meta_page_addr == 0) { - pf->state = BTREE_PREFETCH_DISABLED; - return FALSE; - } - - // Re-anchor: start a fresh forward seek from the current leaf's meta page. 
- mini_meta_cursor_deinit(&pf->meta_cursor); - mini_meta_cursor_init( - &pf->meta_cursor, itor->cc, itor->page_type, meta_page_addr); - uint64 cur_extent = btree_extent_base_addr(itor->cc, itor->curr.addr); - mini_meta_cursor_status status = - mini_meta_cursor_seek_extent(&pf->meta_cursor, cur_extent); - if (status == MINI_META_CURSOR_WOULD_BLOCK) { - return FALSE; - } - if (status != MINI_META_CURSOR_ENTRY) { - mini_meta_cursor_deinit(&pf->meta_cursor); - pf->state = BTREE_PREFETCH_DISABLED; - return FALSE; - } - - // Cursor is now just past cur_extent. Wind back one entry to consume - // cur_extent: after seek, entry_idx >= 1, so cursor_prev always succeeds. - uint64 ignored_addr, ignored_batch; - status = mini_meta_cursor_prev(&pf->meta_cursor, &ignored_addr, &ignored_batch); - platform_assert(status == MINI_META_CURSOR_ENTRY); - - // Activated. Prime the backward lookahead window. - pf->state = BTREE_PREFETCH_ACTIVE; - pf->at_end = FALSE; - pf->prefetched_ahead = 0; - pf->depth = BTREE_PREFETCH_RAMP_MIN; - btree_prefetch_cursor_fill_backward(itor); - return TRUE; -} - -/* - * Called when the iterator crosses backward into a new leaf extent while the - * cursor is ACTIVE: account for the consumed extent, ramp the depth, refill. - */ -static void -btree_prefetch_cursor_on_boundary_backward(btree_iterator *itor) -{ - btree_prefetch_cursor *pf = &itor->prefetch; - if (pf->prefetched_ahead > 0) { - pf->prefetched_ahead--; - } - if (pf->depth < pf->lookahead) { - pf->depth *= 2; - if (pf->depth > pf->lookahead) { - pf->depth = pf->lookahead; - } - } - btree_prefetch_cursor_fill_backward(itor); -} - -/* - * (Re)start backward deep prefetch at the iterator's current leaf. Non-blocking: - * kicks off the meta-page IO and leaves the cursor PRIMING (to be completed - * lazily as the scan advances) unless the meta page is already resident. 
- */ -static void -btree_prefetch_cursor_start_backward(btree_iterator *itor) -{ - btree_prefetch_cursor *pf = &itor->prefetch; - - mini_meta_cursor_deinit(&pf->meta_cursor); - pf->state = BTREE_PREFETCH_DISABLED; - pf->going_forward = FALSE; - pf->at_end = FALSE; - pf->prefetched_ahead = 0; - pf->depth = BTREE_PREFETCH_RAMP_MIN; - pf->leaf_batch = NUM_BLOB_BATCHES + itor->height; - pf->prefetch_blobs = (itor->height == 0); - - if (!itor->do_prefetch || pf->lookahead <= 1 - || itor->page_type != PAGE_TYPE_BRANCH || itor->curr.page == NULL) - { - return; - } - - pf->state = BTREE_PREFETCH_PRIMING; - btree_prefetch_cursor_pump_backward(itor); -} - /* Release the cursor's resources and turn it off. */ static void btree_prefetch_cursor_deinit(btree_iterator *itor) @@ -3157,43 +3039,30 @@ btree_iterator_prefetch_on_advance(btree_iterator *itor, btree_prefetch_cursor *pf = &itor->prefetch; // Direction change: restart cursor in the new direction, resetting ramp. - if (pf->state != BTREE_PREFETCH_DISABLED && pf->going_forward != going_forward) { - if (going_forward) { - btree_prefetch_cursor_start(itor); - } else { - btree_prefetch_cursor_start_backward(itor); - } + if (pf->state != BTREE_PREFETCH_DISABLED + && pf->going_forward != going_forward) + { + btree_prefetch_cursor_start(itor, going_forward); return; } - if (going_forward) { - bool32 positioned_now = FALSE; - if (pf->state == BTREE_PREFETCH_PRIMING) { - positioned_now = btree_prefetch_cursor_pump(itor); - } - if (btree_addrs_share_extent(cc, last_addr, itor->curr.addr)) { - return; - } - if (pf->state == BTREE_PREFETCH_ACTIVE) { - if (!positioned_now) { - btree_prefetch_cursor_on_boundary(itor); - } - } else if (itor->do_prefetch && itor->curr.hdr->next_extent_addr != 0 - && !btree_addrs_share_extent(cc, itor->curr.addr, itor->end_addr)) - { - cache_prefetch(cc, itor->curr.hdr->next_extent_addr, itor->page_type); - } - } else { - bool32 positioned_now = FALSE; - if (pf->state == BTREE_PREFETCH_PRIMING) { - 
positioned_now = btree_prefetch_cursor_pump_backward(itor); - } - if (btree_addrs_share_extent(cc, last_addr, itor->curr.addr)) { - return; - } - if (pf->state == BTREE_PREFETCH_ACTIVE && !positioned_now) { - btree_prefetch_cursor_on_boundary_backward(itor); + bool32 positioned_now = FALSE; + if (pf->state == BTREE_PREFETCH_PRIMING) { + positioned_now = btree_prefetch_cursor_pump(itor); + } + if (btree_addrs_share_extent(cc, last_addr, itor->curr.addr)) { + return; + } + if (pf->state == BTREE_PREFETCH_ACTIVE) { + if (!positioned_now) { + btree_prefetch_cursor_on_boundary(itor); } + } else if (going_forward && itor->do_prefetch + && itor->curr.hdr->next_extent_addr != 0 + && !btree_addrs_share_extent(cc, itor->curr.addr, itor->end_addr)) + { + cache_prefetch(cc, itor->curr.hdr->next_extent_addr, itor->page_type); + } else if (!going_forward) { // No legacy backward prefetch: leaf headers have no prev_extent_addr. } } @@ -3700,7 +3569,7 @@ btree_iterator_seek(iterator *base_itor, comparison seek_type, key seek_key) // The iterator may have repositioned; re-anchor the prefetch cursor so a // subsequent forward scan prefetches from the new location. - btree_prefetch_cursor_start(itor); + btree_prefetch_cursor_start(itor, TRUE); return STATUS_OK; } @@ -3851,7 +3720,7 @@ btree_iterator_init(cache *cc, find_btree_node_and_get_idx_bounds(itor, start_key, start_type); - btree_prefetch_cursor_start(itor); + btree_prefetch_cursor_start(itor, TRUE); // While the deep cursor is priming (or disabled), cover the next extent with // the legacy single-extent-ahead prefetch via next_extent_addr. if (itor->prefetch.state != BTREE_PREFETCH_ACTIVE && itor->do_prefetch @@ -3871,11 +3740,14 @@ void btree_iterator_set_prefetch_lookahead(btree_iterator *itor, uint32 prefetch_lookahead) { + platform_assert(itor != NULL); + platform_assert(itor->do_prefetch); + itor->prefetch.lookahead = prefetch_lookahead; // Re-anchor the cursor at the current position with the new lookahead. 
This // also issues an initial prefetch (replacing the legacy one-extent prefetch // from init when the cursor engages). - btree_prefetch_cursor_start(itor); + btree_prefetch_cursor_start(itor, TRUE); } async_status @@ -3906,7 +3778,7 @@ btree_iterator_init_async(btree_iterator_async_state *state) async_await_subroutine(state, find_btree_node_and_get_idx_bounds_async); btree_iterator_copy_curr_if_needed(state->itor); - btree_prefetch_cursor_start(state->itor); + btree_prefetch_cursor_start(state->itor, TRUE); // While the deep cursor is priming (or disabled), cover the next extent with // the legacy single-extent-ahead prefetch via next_extent_addr. if (state->itor->prefetch.state != BTREE_PREFETCH_ACTIVE diff --git a/src/btree.h b/src/btree.h index fef42257..7629d0ad 100644 --- a/src/btree.h +++ b/src/btree.h @@ -133,20 +133,20 @@ typedef struct ONDISK btree_pivot_data { /* * Drives extent prefetching for a btree_iterator in either direction. Reads * extent addresses ahead of (or behind) the iterator from the branch's - * mini_allocator (via a mini_meta_cursor, exploiting that extents within a batch - * are in key order) and issues cache_prefetch for them, keeping up to ~depth leaf - * extents of IO in flight. Internal-node extents are skipped; blob extents are - * prefetched (for height-0 scans). + * mini_allocator (via a mini_meta_cursor, exploiting that extents within a + * batch are in key order) and issues cache_prefetch for them, keeping up to + * ~depth leaf extents of IO in flight. Internal-node extents are skipped; blob + * extents are prefetched (for height-0 scans). * * Priming is non-blocking: the cursor's meta page is fetched lazily (PRIMING * state) so the iterator's async init never waits on it and the first tuple is * not delayed. The legacy single-extent-ahead prefetch (via the leaf's * next_extent_addr) covers the window until the cursor becomes ACTIVE. 
* - * Depth ramps up (slow-start) from BTREE_PREFETCH_RAMP_MIN toward `lookahead` as - * the scan proves long, so short scans don't waste bandwidth reading far ahead. - * On a direction change the ramp resets so both forward and backward scans get - * the same slow-start treatment. + * Depth ramps up (slow-start) from BTREE_PREFETCH_RAMP_MIN toward `lookahead` + * as the scan proves long, so short scans don't waste bandwidth reading far + * ahead. On a direction change the ramp resets so both forward and backward + * scans get the same slow-start treatment. */ typedef enum btree_prefetch_state { BTREE_PREFETCH_DISABLED = 0, // legacy next_extent_addr path / not applicable @@ -159,13 +159,13 @@ typedef enum btree_prefetch_state { typedef struct btree_prefetch_cursor { btree_prefetch_state state; - bool32 at_end; // prefetched through the last in-range extent - bool32 going_forward; // current scan direction; reset resets ramp - uint32 lookahead; // K: max leaf extents in flight (the cap) - uint32 depth; // current ramp-up depth (<= lookahead) - uint64 leaf_batch; // mini batch of this iterator's level - bool32 prefetch_blobs; // also prefetch blob extents (height 0) - uint64 prefetched_ahead; // leaf extents prefetched, not yet consumed + bool32 at_end; // prefetched through the last in-range extent + bool32 going_forward; // current scan direction; reset resets ramp + uint32 lookahead; // K: max leaf extents in flight (the cap) + uint32 depth; // current ramp-up depth (<= lookahead) + uint64 leaf_batch; // mini batch of this iterator's level + bool32 prefetch_blobs; // also prefetch blob extents (height 0) + uint64 prefetched_ahead; // leaf extents prefetched, not yet consumed mini_meta_cursor meta_cursor; } btree_prefetch_cursor; diff --git a/src/btree_private.h b/src/btree_private.h index f1083b3c..0d99375b 100644 --- a/src/btree_private.h +++ b/src/btree_private.h @@ -32,13 +32,13 @@ typedef node_offset table_entry; * 
************************************************************************* */ struct ONDISK btree_hdr { - uint64 prev_addr; - uint64 next_addr; - uint64 next_extent_addr; + uint64 prev_addr; + uint64 next_addr; + uint64 next_extent_addr; // Address of the mini_allocator meta page that lists this node's extent. - // Stamped at pack time (see btree_pack_create_next_node); lets a forward - // (or, later, backward) prefetch cursor jump straight to this node's - // position in the extent stream instead of scanning from meta_head. + // Stamped at pack time (see btree_pack_create_next_node); lets the + // bidirectional prefetch cursor jump straight to this node's position in the + // extent stream instead of scanning from meta_head. uint64 meta_page_addr; uint64 generation; uint8 height; diff --git a/src/clockcache.c b/src/clockcache.c index 85beccf7..4473c781 100644 --- a/src/clockcache.c +++ b/src/clockcache.c @@ -1804,7 +1804,7 @@ clockcache_get_internal(clockcache *cc, // IN page_handle * clockcache_get(clockcache *cc, uint64 addr, bool32 blocking, page_type type) { - bool32 retry; + bool32 retry; // Initialize to NULL so a non-blocking get of a page that is not in cache // (clockcache_get_internal returns without setting handle) honors the // documented contract of returning NULL rather than an uninitialized value. 
@@ -2684,7 +2684,9 @@ clockcache_prefetch_page(clockcache *cc, uint64 addr, page_type type) { threadid tid = platform_get_tid(); - debug_assert(addr % clockcache_page_size(cc) == 0); + platform_assert(cc != NULL); + platform_assert(PAGE_TYPE_FIRST <= type && type < NUM_PAGE_TYPES); + platform_assert(addr % clockcache_page_size(cc) == 0); while (TRUE) { uint32 entry_no = clockcache_lookup(cc, addr); @@ -2713,7 +2715,8 @@ clockcache_prefetch_page(clockcache *cc, uint64 addr, page_type type) entry->type = type; uint64 lookup_no = clockcache_divide_by_page_size(cc, addr); - async_io_state *state = TYPED_MALLOC(PROCESS_PRIVATE_HEAP_ID, state); + async_io_state *state = + TYPED_MALLOC(PROCESS_PRIVATE_HEAP_ID, state); if (state == NULL) { platform_error_log("clockcache_prefetch_page: async_io_state " "allocation failed for page addr %lu, " @@ -2723,13 +2726,14 @@ clockcache_prefetch_page(clockcache *cc, uint64 addr, page_type type) clockcache_release_unpublished_entry(entry); return; } - state->cc = cc; - platform_status rc = io_async_state_init(state->iostate, - cc->io, - io_async_preadv, - addr, - clockcache_prefetch_callback, - state); + state->cc = cc; + platform_status rc = + io_async_state_init(state->iostate, + cc->io, + io_async_preadv, + addr, + clockcache_prefetch_callback, + state); if (!SUCCESS(rc)) { platform_error_log("clockcache_prefetch_page: " "io_async_state_init failed for page addr " @@ -2745,8 +2749,8 @@ clockcache_prefetch_page(clockcache *cc, uint64 addr, page_type type) if (__sync_bool_compare_and_swap( &cc->lookup[lookup_no], CC_UNMAPPED_ENTRY, free_entry_no)) { - rc = io_async_state_append_page(state->iostate, - entry->page.data); + rc = + io_async_state_append_page(state->iostate, entry->page.data); if (!SUCCESS(rc)) { platform_error_log("clockcache_prefetch_page: " "io_async_state_append_page failed for " @@ -3361,35 +3365,35 @@ static cache_ops clockcache_ops = { .page_get_async = clockcache_get_async_virtual, .page_get_async_result = 
clockcache_get_async_state_result_virtual, - .page_unget = clockcache_unget_virtual, - .page_try_claim = clockcache_try_claim_virtual, - .page_unclaim = clockcache_unclaim_virtual, - .page_lock = clockcache_lock_virtual, - .page_unlock = clockcache_unlock_virtual, - .page_prefetch = clockcache_prefetch_virtual, + .page_unget = clockcache_unget_virtual, + .page_try_claim = clockcache_try_claim_virtual, + .page_unclaim = clockcache_unclaim_virtual, + .page_lock = clockcache_lock_virtual, + .page_unlock = clockcache_unlock_virtual, + .page_prefetch = clockcache_prefetch_virtual, .page_prefetch_page = clockcache_prefetch_page_virtual, - .page_mark_dirty = clockcache_mark_dirty_virtual, - .page_pin = clockcache_pin_virtual, - .page_unpin = clockcache_unpin_virtual, - .page_sync = clockcache_page_sync_virtual, - .extent_sync = clockcache_extent_sync_virtual, - .flush = clockcache_flush_virtual, - .evict = clockcache_evict_all_virtual, - .cleanup = clockcache_wait_virtual, - .in_use = clockcache_in_use_virtual, - .assert_ungot = clockcache_assert_ungot_virtual, - .assert_free = clockcache_assert_no_locks_held_virtual, - .print = clockcache_print_virtual, - .print_stats = clockcache_print_stats_virtual, - .io_stats = clockcache_io_stats_virtual, - .reset_stats = clockcache_reset_stats_virtual, - .validate_page = clockcache_validate_page_virtual, - .count_dirty = clockcache_count_dirty_virtual, - .page_get_read_ref = clockcache_get_read_ref_virtual, - .cache_present = clockcache_present_virtual, - .enable_sync_get = clockcache_enable_sync_get_virtual, - .get_allocator = clockcache_get_allocator_virtual, - .get_config = clockcache_get_config_virtual, + .page_mark_dirty = clockcache_mark_dirty_virtual, + .page_pin = clockcache_pin_virtual, + .page_unpin = clockcache_unpin_virtual, + .page_sync = clockcache_page_sync_virtual, + .extent_sync = clockcache_extent_sync_virtual, + .flush = clockcache_flush_virtual, + .evict = clockcache_evict_all_virtual, + .cleanup = 
clockcache_wait_virtual, + .in_use = clockcache_in_use_virtual, + .assert_ungot = clockcache_assert_ungot_virtual, + .assert_free = clockcache_assert_no_locks_held_virtual, + .print = clockcache_print_virtual, + .print_stats = clockcache_print_stats_virtual, + .io_stats = clockcache_io_stats_virtual, + .reset_stats = clockcache_reset_stats_virtual, + .validate_page = clockcache_validate_page_virtual, + .count_dirty = clockcache_count_dirty_virtual, + .page_get_read_ref = clockcache_get_read_ref_virtual, + .cache_present = clockcache_present_virtual, + .enable_sync_get = clockcache_enable_sync_get_virtual, + .get_allocator = clockcache_get_allocator_virtual, + .get_config = clockcache_get_config_virtual, }; /* diff --git a/src/core.c b/src/core.c index c870dd6a..4cf7ab3d 100644 --- a/src/core.c +++ b/src/core.c @@ -13,6 +13,7 @@ #include "platform_sleep.h" #include "platform_time.h" #include "platform_util.h" +#include "prefetch.h" #include "poison.h" #define LATENCYHISTO_SIZE 15 @@ -56,37 +57,6 @@ _Static_assert(CORE_NUM_MEMTABLES <= MAX_MEMTABLES, /* Some randomly chosen Splinter super-block checksum seed. */ #define CORE_SUPER_CSUM_SEED (42) -/* - * Minimum extent-prefetch depth for an eligible branch in a range scan. Keeping - * at least this many extents in flight is what makes deep prefetch worthwhile - * compared to the legacy single-extent-ahead path. - */ -#define CORE_MIN_PREFETCH_LOOKAHEAD (2) - -/* - * Per-branch extent-prefetch depth for a range scan. The configured prefetch - * budget (total bytes of read-ahead to keep in flight, ~ the storage's - * bandwidth-delay product) is converted to extents and divided across the - * eligible branches, with a floor of CORE_MIN_PREFETCH_LOOKAHEAD per branch. - * Dividing by the branch count bounds total outstanding read-ahead while still - * going deep when few branches dominate (a lone large branch gets the whole - * budget). Returns 0 when nothing is eligible to prefetch. 
- */ -static uint32 -core_prefetch_lookahead(core_handle *spl, uint64 n_eligible) -{ - if (n_eligible == 0) { - return 0; - } - uint64 budget_extents = - spl->cfg.prefetch_budget / cache_extent_size(spl->cc); - uint64 per_branch = budget_extents / n_eligible; - if (per_branch < CORE_MIN_PREFETCH_LOOKAHEAD) { - per_branch = CORE_MIN_PREFETCH_LOOKAHEAD; - } - return (uint32)per_branch; -} - /* * core logging functions. * @@ -1221,7 +1191,7 @@ core_range_iterator_init(core_handle *spl, // Deep extent-prefetch for the scan: count the branches eligible to prefetch // (compacted, and only when the scan is large enough to be worth it), then - // give each a share of the prefetch budget (see core_prefetch_lookahead). + // give each a soft share of the prefetch budget. uint64 n_prefetch_branches = 0; for (uint64 branch_no = 0; branch_no < range_itor->num_branches; branch_no++) { @@ -1229,15 +1199,16 @@ core_range_iterator_init(core_handle *spl, n_prefetch_branches++; } } - uint32 deep_lookahead = core_prefetch_lookahead(spl, n_prefetch_branches); + uint32 deep_lookahead = prefetch_budget_to_extent_lookahead( + spl->cc, spl->cfg.prefetch_budget, n_prefetch_branches); uint64 started_inits = 0; for (uint64 i = 0; i < range_itor->num_branches; i++) { - uint64 branch_no = range_itor->num_branches - i - 1; - btree_iterator *btree_itor = &range_itor->btree_itor[branch_no]; - uint64 branch_addr = range_itor->branch[branch_no].addr; - page_type page_type = range_itor->branch[branch_no].type; - bool32 do_prefetch = FALSE; + uint64 branch_no = range_itor->num_branches - i - 1; + btree_iterator *btree_itor = &range_itor->btree_itor[branch_no]; + uint64 branch_addr = range_itor->branch[branch_no].addr; + page_type page_type = range_itor->branch[branch_no].type; + bool32 do_prefetch = FALSE; uint32 prefetch_lookahead = 1; if (range_itor->compacted[branch_no] && num_tuples > CORE_PREFETCH_MIN) { do_prefetch = TRUE; diff --git a/src/core.h b/src/core.h index 0f6439e6..440a4829 100644 --- 
a/src/core.h +++ b/src/core.h @@ -39,9 +39,9 @@ typedef struct core_config { uint64 queue_scale_percent; // Governs when inserters perform bg tasks. See // task.h - // Total bytes of extent read-ahead a range scan keeps in flight, divided - // across the branches it merges (see core_prefetch_lookahead). Roughly the - // storage's bandwidth-delay product; raise it for higher-latency devices. + // Soft byte budget for range-scan extent read-ahead, divided across the + // branches being merged. Roughly the storage's bandwidth-delay product; + // raise it for higher-latency devices. uint64 prefetch_budget; bool32 use_stats; // stats diff --git a/src/mini_allocator.c b/src/mini_allocator.c index e5771082..4984365c 100644 --- a/src/mini_allocator.c +++ b/src/mini_allocator.c @@ -46,7 +46,7 @@ typedef struct ONDISK mini_meta_hdr { * meta_entry -- Disk-resident structure * * Metadata for each extent stored in the extent list for a - * mini_allocator. Currently, this is just the extent address itself. + * mini_allocator. 
*----------------------------------------------------------------------------- */ /* @@ -65,9 +65,10 @@ typedef struct ONDISK meta_entry { uint64 packed; } meta_entry; -#define META_ENTRY_BATCH_BITS (8) -#define META_ENTRY_TYPE_BITS (8) -#define META_ENTRY_EXTENT_BITS (64 - META_ENTRY_BATCH_BITS - META_ENTRY_TYPE_BITS) +#define META_ENTRY_BATCH_BITS (8) +#define META_ENTRY_TYPE_BITS (8) +#define META_ENTRY_EXTENT_BITS \ + (64 - META_ENTRY_BATCH_BITS - META_ENTRY_TYPE_BITS) _Static_assert(MINI_MAX_BATCHES <= (1 << META_ENTRY_BATCH_BITS), "mini_allocator batch number does not fit in a meta_entry"); @@ -102,12 +103,17 @@ meta_entry_pack(cache *cc, page_type type, uint64 batch) { + platform_assert(cc != NULL); + platform_assert(entry != NULL); + uint64 extent_size = cache_extent_size(cc); uint64 extent_number = extent_addr / extent_size; - debug_assert((extent_addr % extent_size) == 0); - debug_assert(extent_number < (1ULL << META_ENTRY_EXTENT_BITS)); - debug_assert((uint64)type < (1 << META_ENTRY_TYPE_BITS)); - debug_assert(batch < (1 << META_ENTRY_BATCH_BITS)); + platform_assert(extent_addr != 0); + platform_assert((extent_addr % extent_size) == 0); + platform_assert(extent_number < (1ULL << META_ENTRY_EXTENT_BITS)); + platform_assert(PAGE_TYPE_FIRST <= type && type < NUM_PAGE_TYPES); + platform_assert(batch < MINI_MAX_BATCHES); + entry->packed = (extent_number << (META_ENTRY_BATCH_BITS + META_ENTRY_TYPE_BITS)) | ((uint64)type << META_ENTRY_BATCH_BITS) | batch; @@ -348,9 +354,15 @@ mini_append_entry_to_page(mini_allocator *mini, page_type type, uint64 batch) { + platform_assert(mini != NULL); + platform_assert(mini->cc != NULL); + platform_assert(meta_page != NULL); + platform_assert(PAGE_TYPE_FIRST <= type && type < NUM_PAGE_TYPES); + platform_assert(batch < mini->num_batches); + uint64 page_size = cache_page_size(mini->cc); - debug_assert(extent_addr != 0); - debug_assert((extent_addr % page_size) == 0); + platform_assert(extent_addr != 0); + 
platform_assert((extent_addr % page_size) == 0); mini_meta_hdr *hdr = (mini_meta_hdr *)meta_page->data; @@ -907,7 +919,7 @@ mini_prefetch(cache *cc, page_type type, uint64 meta_head) /* *----------------------------------------------------------------------------- - * mini_meta_cursor -- forward cursor over a mini_allocator's extent entries. + * mini_meta_cursor -- cursor over a mini_allocator's extent entries. *----------------------------------------------------------------------------- */ void @@ -916,6 +928,11 @@ mini_meta_cursor_init(mini_meta_cursor *cursor, page_type meta_type, uint64 meta_addr) { + platform_assert(cursor != NULL); + platform_assert(cc != NULL); + platform_assert(PAGE_TYPE_FIRST <= meta_type && meta_type < NUM_PAGE_TYPES); + platform_assert(meta_addr != 0); + cursor->cc = cc; cursor->meta_type = meta_type; cursor->meta_page = NULL; @@ -927,6 +944,8 @@ mini_meta_cursor_init(mini_meta_cursor *cursor, void mini_meta_cursor_deinit(mini_meta_cursor *cursor) { + platform_assert(cursor != NULL); + if (cursor->meta_page != NULL) { cache_unget(cursor->cc, cursor->meta_page); cursor->meta_page = NULL; @@ -938,15 +957,22 @@ mini_meta_cursor_next(mini_meta_cursor *cursor, uint64 *extent_addr, uint64 *batch) { + platform_assert(cursor != NULL); + platform_assert(cursor->cc != NULL); + platform_assert(extent_addr != NULL); + platform_assert(batch != NULL); + platform_assert(PAGE_TYPE_FIRST <= cursor->meta_type + && cursor->meta_type < NUM_PAGE_TYPES); + while (TRUE) { if (cursor->meta_page == NULL) { if (cursor->meta_addr == 0) { return MINI_META_CURSOR_END; } - // Non-blocking: if the meta page isn't resident, kick off a single-page - // prefetch and let the caller retry later. - cursor->meta_page = cache_get( - cursor->cc, cursor->meta_addr, FALSE, cursor->meta_type); + // Non-blocking: if the meta page isn't resident, kick off a + // single-page prefetch and let the caller retry later. 
+ cursor->meta_page = + cache_get(cursor->cc, cursor->meta_addr, FALSE, cursor->meta_type); if (cursor->meta_page == NULL) { cache_prefetch_page( cursor->cc, cursor->meta_addr, cursor->meta_type); @@ -976,6 +1002,11 @@ mini_meta_cursor_status mini_meta_cursor_seek_extent(mini_meta_cursor *cursor, uint64 target_extent_addr) { + platform_assert(cursor != NULL); + platform_assert(cursor->cc != NULL); + platform_assert(target_extent_addr != 0); + platform_assert(target_extent_addr % cache_extent_size(cursor->cc) == 0); + uint64 extent_addr; uint64 batch; while (TRUE) { @@ -995,6 +1026,13 @@ mini_meta_cursor_prev(mini_meta_cursor *cursor, uint64 *extent_addr, uint64 *batch) { + platform_assert(cursor != NULL); + platform_assert(cursor->cc != NULL); + platform_assert(extent_addr != NULL); + platform_assert(batch != NULL); + platform_assert(PAGE_TYPE_FIRST <= cursor->meta_type + && cursor->meta_type < NUM_PAGE_TYPES); + while (TRUE) { if (cursor->meta_page == NULL) { return MINI_META_CURSOR_END; diff --git a/src/mini_allocator.h b/src/mini_allocator.h index fb15784d..4e35a5b8 100644 --- a/src/mini_allocator.h +++ b/src/mini_allocator.h @@ -50,7 +50,7 @@ typedef struct mini_allocator { // in each page it allocates, where that page's extent is listed in the meta // stream, so a prefetch cursor can start there without scanning from // meta_head. See mini_current_extent_meta_page(). - uint64 cur_extent_meta_page[MINI_MAX_BATCHES]; + uint64 cur_extent_meta_page[MINI_MAX_BATCHES]; } mini_allocator; uint64 @@ -114,11 +114,11 @@ void mini_prefetch(cache *cc, page_type type, uint64 meta_head); /* - * mini_meta_cursor: a forward cursor over the extent entries of a finalized - * mini_allocator, in allocation order. Entries from all batches are - * interleaved in the stream; the caller filters by batch as needed (each entry - * reports its batch). The btree iterator uses this to read extent addresses - * ahead of itself for prefetching. 
+ * mini_meta_cursor: a non-blocking cursor over the extent entries of a + * finalized mini_allocator. Entries from all batches are interleaved in + * allocation order; the caller filters by batch as needed (each entry reports + * its batch). The btree iterator uses this to read extent addresses ahead of or + * behind itself for prefetching. * * The cursor holds a read reference on the meta page it is currently reading; * call mini_meta_cursor_deinit() to release it. The cursor is non-blocking: it @@ -139,7 +139,8 @@ typedef struct mini_meta_cursor { typedef enum mini_meta_cursor_status { MINI_META_CURSOR_ENTRY, // produced an entry MINI_META_CURSOR_END, // stream exhausted - MINI_META_CURSOR_WOULD_BLOCK, // next meta page not resident (prefetch issued) + MINI_META_CURSOR_WOULD_BLOCK, // next meta page not resident (prefetch + // issued) } mini_meta_cursor_status; void @@ -161,18 +162,18 @@ mini_meta_cursor_next(mini_meta_cursor *cursor, // Advance the cursor until it emits the entry for target_extent_addr, leaving // the cursor positioned just after it. Returns MINI_META_CURSOR_ENTRY if found, -// MINI_META_CURSOR_END if the stream ends first, or MINI_META_CURSOR_WOULD_BLOCK -// if a needed meta page is not yet resident. +// MINI_META_CURSOR_END if the stream ends first, or +// MINI_META_CURSOR_WOULD_BLOCK if a needed meta page is not yet resident. mini_meta_cursor_status mini_meta_cursor_seek_extent(mini_meta_cursor *cursor, uint64 target_extent_addr); // Emit the previous extent entry (reverse allocation order). The cursor must // have been positioned by mini_meta_cursor_seek_extent() or a prior call to -// mini_meta_cursor_prev() — calling on a freshly-initialized cursor returns END. -// Non-blocking: if the previous meta page isn't resident, issues a single-page -// prefetch and returns MINI_META_CURSOR_WOULD_BLOCK; the current page is kept -// alive so the retry can follow prev_meta_addr without re-reading. 
+// mini_meta_cursor_prev() — calling on a freshly-initialized cursor returns +// END. Non-blocking: if the previous meta page isn't resident, issues a +// single-page prefetch and returns MINI_META_CURSOR_WOULD_BLOCK; the current +// page is kept alive so the retry can follow prev_meta_addr without re-reading. mini_meta_cursor_status mini_meta_cursor_prev(mini_meta_cursor *cursor, uint64 *extent_addr, @@ -200,6 +201,10 @@ mini_meta_tail(mini_allocator *mini) static inline uint64 mini_current_extent_meta_page(mini_allocator *mini, uint64 batch) { + platform_assert(mini != NULL); + platform_assert(batch < mini->num_batches); + platform_assert(mini->cur_extent_meta_page[batch] != 0); + return mini->cur_extent_meta_page[batch]; } diff --git a/src/task.c b/src/task.c index dc298c14..f4e741db 100644 --- a/src/task.c +++ b/src/task.c @@ -40,9 +40,8 @@ task_tracker_list_init(task_tracker_list *list) void task_tracker_add(task_tracker *tracker) { - if (tracker != NULL) { - __sync_fetch_and_add(&tracker->outstanding, 1); - } + platform_assert(tracker != NULL); + __sync_fetch_and_add(&tracker->outstanding, 1); } static uint64 @@ -67,10 +66,6 @@ task_tracker_done(task_tracker *tracker, platform_status status, task_tracker_list *completed) { - if (tracker == NULL) { - return; - } - uint64 old_outstanding = tracker_done_common(tracker, status); if (old_outstanding == 1 && tracker->callback != NULL) { @@ -85,10 +80,6 @@ task_tracker_done(task_tracker *tracker, void task_tracker_done_but_not_last(task_tracker *tracker, platform_status status) { - if (tracker == NULL) { - return; - } - uint64 old_outstanding = tracker_done_common(tracker, status); platform_assert(1 < old_outstanding); } diff --git a/src/task.h b/src/task.h index df1e463e..cdb37886 100644 --- a/src/task.h +++ b/src/task.h @@ -58,6 +58,8 @@ typedef void (*task_tracker_callback)(task_tracker *tracker); * outstanding reference owned by the launcher. 
Call task_tracker_add() before * publishing each new unit of work, including follow-up work published by a * tracked task. Each unit must call task_tracker_done() exactly once. + * Pass a non-NULL tracker to the task_tracker_* functions; callers that + * support untracked work should check for NULL at the call site. * * The final caller of task_tracker_done() links the tracker onto the supplied * completion list. Call task_tracker_notify_all() after dropping any locks diff --git a/src/trunk.c b/src/trunk.c index f1f98fe6..a80f2867 100644 --- a/src/trunk.c +++ b/src/trunk.c @@ -19,6 +19,7 @@ #include "data_internal.h" #include "task.h" #include "notification.h" +#include "prefetch.h" #include "poison.h" typedef VECTOR(routing_filter) routing_filter_vector; @@ -2446,20 +2447,14 @@ trunk_branch_merger_build_merge_itor(trunk_branch_merger *merger, platform_assert(merger->merge_itor == NULL); // A compaction/leaf-split merge reads each input branch end to end, so give - // the branches deep prefetch from the shared budget (Little's law: split the - // read-ahead budget across the branches being merged). Min 2 mirrors the - // range-scan path (see core_prefetch_lookahead). + // the branches a soft share of the read-ahead budget. 
uint64 num_branches = vector_length(&merger->itors); - if (num_branches > 0 && merger->prefetch_budget > 0 && merger->cc != NULL) { - uint64 budget_extents = - merger->prefetch_budget / cache_extent_size(merger->cc); - uint64 lookahead = budget_extents / num_branches; - if (lookahead < 2) { - lookahead = 2; - } + uint32 lookahead = prefetch_budget_to_extent_lookahead( + merger->cc, merger->prefetch_budget, num_branches); + if (lookahead > 0) { for (uint64 i = 0; i < num_branches; i++) { btree_iterator *itor = (btree_iterator *)vector_get(&merger->itors, i); - btree_iterator_set_prefetch_lookahead(itor, (uint32)lookahead); + btree_iterator_set_prefetch_lookahead(itor, lookahead); } } @@ -2798,10 +2793,12 @@ bundle_compaction_destroy(bundle_compaction *compaction, PAGE_TYPE_BRANCH); } - task_tracker_done( - compaction->tracker, - bundle_compaction_notify_status(compaction, maplet_compaction_rc), - completed); + if (compaction->tracker != NULL) { + task_tracker_done( + compaction->tracker, + bundle_compaction_notify_status(compaction, maplet_compaction_rc), + completed); + } platform_free(context->hid, compaction); } @@ -4123,7 +4120,9 @@ enqueue_bundle_compaction(trunk_context *context, rc = STATUS_NO_MEMORY; goto next; } - task_tracker_add(tracker); + if (tracker != NULL) { + task_tracker_add(tracker); + } trunk_pivot_state_incref(state); @@ -4134,8 +4133,8 @@ enqueue_bundle_compaction(trunk_context *context, &bc->tsk, bundle_compaction_task, FALSE); - // Upon success, the trunk_pivot_state_incref and task_tracker_add are - // passed to the task + // Upon success, the trunk_pivot_state_incref and optional + // task_tracker_add are passed to the task. 
if (!SUCCESS(rc)) { trunk_pivot_state_decref(state); // undoes trunk_pivot_state_incref @@ -5746,7 +5745,9 @@ trunk_flush_cleanup(trunk_context *context, task_tracker *tracker) &context->tasks, context, tracker, &completed); incorporation_tasks_deinit(&context->tasks, context); trunk_modification_end(context); - task_tracker_done(tracker, rc, &completed); + if (tracker != NULL) { + task_tracker_done(tracker, rc, &completed); + } task_tracker_notify_all(&completed); } diff --git a/src/trunk.h b/src/trunk.h index ffa65c69..ee1e60a0 100644 --- a/src/trunk.h +++ b/src/trunk.h @@ -26,7 +26,7 @@ typedef struct trunk_config { uint64 incorporation_size_kv_bytes; uint64 target_fanout; uint64 branch_rough_count_height; - uint64 prefetch_budget; // bytes of read-ahead per merge (~BDP) + uint64 prefetch_budget; // soft read-ahead bytes per merge bool32 use_stats; } trunk_config; @@ -185,7 +185,7 @@ typedef struct trunk_branch_merger { platform_heap_id hid; const data_config *data_cfg; cache *cc; // for deep-prefetch budget sizing - uint64 prefetch_budget; // bytes of read-ahead across the merge + uint64 prefetch_budget; // soft read-ahead bytes across the merge key min_key; key max_key; uint64 height; diff --git a/tests/functional/btree_test.c b/tests/functional/btree_test.c index 86b8113f..c4e0e29b 100644 --- a/tests/functional/btree_test.c +++ b/tests/functional/btree_test.c @@ -373,7 +373,8 @@ test_btree_scan_once(cache *cc, uint64 *tuples_scanned, uint64 *logical_bytes_scanned) { - // prefetch_lookahead 0 => no prefetch; 1 => legacy single-extent; >=2 => deep + // prefetch_lookahead 0 => no prefetch; 1 => legacy single-extent; >=2 => + // deep bool32 do_prefetch = (prefetch_lookahead >= 1); btree_iterator itor; timestamp start_time = platform_get_timestamp(); diff --git a/tests/functional/scan_benchmark.c b/tests/functional/scan_benchmark.c index ab78531f..dc8f2a9e 100644 --- a/tests/functional/scan_benchmark.c +++ b/tests/functional/scan_benchmark.c @@ -495,9 +495,9 @@ 
scan_benchmark_load_database(const splinterdb_config *cfg, /* * Compaction-throughput benchmark: open an existing DB (cold cache) and time a - * single blocking full-leaf optimize over the whole key range. With data > cache - * and O_DIRECT, the branch reads done by compaction's merge iterators are cold, - * so this exercises the btree-iterator prefetch path under compaction. + * single blocking full-leaf optimize over the whole key range. With data > + * cache and O_DIRECT, the branch reads done by compaction's merge iterators are + * cold, so this exercises the btree-iterator prefetch path under compaction. */ static int scan_benchmark_run_optimize(const splinterdb_config *cfg) @@ -513,7 +513,8 @@ scan_benchmark_run_optimize(const splinterdb_config *cfg) splinterdb_notification notification; splinterdb_notification_init_blocking(&notification); - platform_default_log("scan_benchmark: running blocking full-leaf optimize\n"); + platform_default_log( + "scan_benchmark: running blocking full-leaf optimize\n"); timestamp start_time = platform_get_timestamp(); rc = splinterdb_optimize(kvs, NULL_SLICE, NULL_SLICE, TRUE, &notification); uint64 elapsed_ns = platform_timestamp_elapsed(start_time); From a3a4d6bfd27e5c73f6040ab303e477969ace6141 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Thu, 25 Jun 2026 23:53:43 -0700 Subject: [PATCH 05/15] more cleanups Signed-off-by: Rob Johnson --- src/btree.c | 25 +---- src/btree.h | 18 ++-- src/core.c | 7 +- src/task.h | 2 - src/trunk.c | 191 +++++++++++++++++++-------------- src/trunk.h | 12 --- tests/functional/btree_test.c | 28 +++-- tests/unit/btree_stress_test.c | 9 +- 8 files changed, 153 insertions(+), 139 deletions(-) diff --git a/src/btree.c b/src/btree.c index 87e25831..997cbb26 100644 --- a/src/btree.c +++ b/src/btree.c @@ -3694,11 +3694,9 @@ btree_iterator_init(cache *cc, key start_key, bool32 do_prefetch, bool32 copy_nodes, - uint32 height) + uint32 height, + uint32 prefetch_lookahead) { - // The synchronous init defaults
to the legacy single-extent-ahead prefetch. - // Callers wanting deep prefetch call btree_iterator_set_prefetch_lookahead() - // afterward (the async path threads a lookahead in directly). platform_status rc = btree_iterator_init_common(cc, cfg, itor, @@ -3712,7 +3710,7 @@ btree_iterator_init(cache *cc, do_prefetch, copy_nodes, height, - 1, + prefetch_lookahead, &start_key); if (!SUCCESS(rc)) { return rc; @@ -3736,20 +3734,6 @@ btree_iterator_init(cache *cc, return STATUS_OK; } -void -btree_iterator_set_prefetch_lookahead(btree_iterator *itor, - uint32 prefetch_lookahead) -{ - platform_assert(itor != NULL); - platform_assert(itor->do_prefetch); - - itor->prefetch.lookahead = prefetch_lookahead; - // Re-anchor the cursor at the current position with the new lookahead. This - // also issues an initial prefetch (replacing the legacy one-extent prefetch - // from init when the cursor engages). - btree_prefetch_cursor_start(itor, TRUE); -} - async_status btree_iterator_init_async(btree_iterator_async_state *state) { @@ -4210,7 +4194,8 @@ btree_count_in_range_by_iterator(cache *cc, min_key, TRUE, FALSE, - 0); + 0, + 1); platform_assert_status_ok(rc); memset(stats, 0, sizeof(*stats)); diff --git a/src/btree.h b/src/btree.h index 7629d0ad..2c695a4c 100644 --- a/src/btree.h +++ b/src/btree.h @@ -341,6 +341,11 @@ btree_lookup_and_merge_async(btree_lookup_async_state *state); async_status btree_lookup_async(btree_lookup_async_state *state); +/* + * prefetch_lookahead is measured in leaf extents. Values <= 1 use the legacy + * single-extent prefetch path; values >= 2 enable deep extent prefetch. Ignored + * unless do_prefetch is TRUE. 
+ */ platform_status btree_iterator_init(cache *cc, const btree_config *cfg, @@ -355,17 +360,8 @@ btree_iterator_init(cache *cc, key start_key, bool32 do_prefetch, bool32 copy_nodes, - uint32 height); - -/* - * Set the extent-prefetch lookahead (in leaf extents) of an already-initialized - * iterator and re-anchor its prefetch cursor at the current position. A value - * >= 2 enables deep prefetch; <= 1 falls back to the legacy single-extent path. - * The iterator must have been initialized with do_prefetch == TRUE. - */ -void -btree_iterator_set_prefetch_lookahead(btree_iterator *itor, - uint32 prefetch_lookahead); + uint32 height, + uint32 prefetch_lookahead); // clang-format off DEFINE_ASYNC_STATE(btree_iterator_async_state, 5, diff --git a/src/core.c b/src/core.c index 4cf7ab3d..24394384 100644 --- a/src/core.c +++ b/src/core.c @@ -391,7 +391,8 @@ core_memtable_iterator_init(core_handle *spl, start_key, FALSE, FALSE, - 0); + 0, + 1); } static void @@ -1200,7 +1201,9 @@ core_range_iterator_init(core_handle *spl, } } uint32 deep_lookahead = prefetch_budget_to_extent_lookahead( - spl->cc, spl->cfg.prefetch_budget, n_prefetch_branches); + cache_extent_size(spl->cc), + spl->cfg.prefetch_budget, + n_prefetch_branches); uint64 started_inits = 0; for (uint64 i = 0; i < range_itor->num_branches; i++) { diff --git a/src/task.h b/src/task.h index cdb37886..df1e463e 100644 --- a/src/task.h +++ b/src/task.h @@ -58,8 +58,6 @@ typedef void (*task_tracker_callback)(task_tracker *tracker); * outstanding reference owned by the launcher. Call task_tracker_add() before * publishing each new unit of work, including follow-up work published by a * tracked task. Each unit must call task_tracker_done() exactly once. - * Pass a non-NULL tracker to the task_tracker_* functions; callers that - * support untracked work should check for NULL at the call site. * * The final caller of task_tracker_done() links the tracker onto the supplied * completion list. 
Call task_tracker_notify_all() after dropping any locks diff --git a/src/trunk.c b/src/trunk.c index a80f2867..a3a16e87 100644 --- a/src/trunk.c +++ b/src/trunk.c @@ -72,6 +72,17 @@ typedef enum bundle_compaction_phase { typedef VECTOR(trunk_branch_info) trunk_branch_info_vector; +typedef struct trunk_branch_merger { + platform_heap_id hid; + const trunk_config *cfg; + key min_key; + key max_key; + uint64 height; + trunk_branch_info_vector branches; + merge_iterator *merge_itor; + iterator_vector itors; +} trunk_branch_merger; + typedef struct bundle_compaction { struct bundle_compaction *next; task tsk; // bundle_comaction_task @@ -2308,67 +2319,42 @@ serialize_nodes(trunk_context *context, static void trunk_branch_merger_init(trunk_branch_merger *merger, platform_heap_id hid, - const data_config *data_cfg, - cache *cc, - uint64 prefetch_budget, + const trunk_config *cfg, key min_key, key max_key, uint64 height) { - merger->hid = hid; - merger->data_cfg = data_cfg; - merger->cc = cc; - merger->prefetch_budget = prefetch_budget; - merger->min_key = min_key; - merger->max_key = max_key; - merger->height = height; - merger->merge_itor = NULL; + platform_assert(cfg != NULL); + platform_assert(cfg->data_cfg != NULL); + platform_assert(cfg->btree_cfg != NULL); + platform_assert(cfg->btree_cfg->cache_cfg != NULL); + + merger->hid = hid; + merger->cfg = cfg; + merger->min_key = min_key; + merger->max_key = max_key; + merger->height = height; + merger->merge_itor = NULL; + vector_init(&merger->branches, hid); vector_init(&merger->itors, hid); } static platform_status trunk_branch_merger_add_branch(trunk_branch_merger *merger, - cache *cc, - const btree_config *btree_cfg, uint64 addr, page_type type) { - btree_iterator *iter = TYPED_MALLOC(merger->hid, iter); - if (iter == NULL) { - platform_error_log( - "%s():%d: platform_malloc() failed", __func__, __LINE__); - return STATUS_NO_MEMORY; - } - platform_status rc = btree_iterator_init(cc, - btree_cfg, - iter, - addr, - type, 
- greater_than_or_equal, - merger->min_key, - less_than, - merger->max_key, - greater_than_or_equal, - merger->min_key, - TRUE, - FALSE, - merger->height); - if (!SUCCESS(rc)) { - platform_error_log("%s():%d: btree_iterator_init() failed: %s", - __func__, - __LINE__, - platform_status_to_string(rc)); - platform_free(merger->hid, iter); - return rc; - } - rc = vector_append(&merger->itors, (iterator *)iter); + platform_assert(merger != NULL); + platform_assert(addr != 0); + platform_assert(PAGE_TYPE_FIRST <= type && type < NUM_PAGE_TYPES); + + trunk_branch_info branch = {addr, type}; + platform_status rc = vector_append(&merger->branches, branch); if (!SUCCESS(rc)) { platform_error_log("%s():%d: vector_append() failed: %s", __func__, __LINE__, platform_status_to_string(rc)); - btree_iterator_deinit(iter); - platform_free(merger->hid, iter); } return rc; } @@ -2376,13 +2362,14 @@ trunk_branch_merger_add_branch(trunk_branch_merger *merger, static platform_status trunk_branch_merger_add_branches(trunk_branch_merger *merger, - cache *cc, - const btree_config *btree_cfg, uint64 num_branches, const trunk_branch_info *branches) { + platform_assert(merger != NULL); + platform_assert(branches != NULL || num_branches == 0); + platform_status rc = vector_ensure_capacity( - &merger->itors, vector_length(&merger->itors) + num_branches); + &merger->branches, vector_length(&merger->branches) + num_branches); if (!SUCCESS(rc)) { platform_error_log("%s():%d: vector_ensure_capacity() failed: %s", __func__, @@ -2393,7 +2380,7 @@ trunk_branch_merger_add_branches(trunk_branch_merger *merger, for (uint64 i = 0; i < num_branches; i++) { rc = trunk_branch_merger_add_branch( - merger, cc, btree_cfg, branches[i].addr, branches[i].type); + merger, branches[i].addr, branches[i].type); if (!SUCCESS(rc)) { platform_error_log("%s():%d: btree_merger_add_branch() failed: %s", __func__, @@ -2407,13 +2394,14 @@ trunk_branch_merger_add_branches(trunk_branch_merger *merger, static platform_status 
trunk_branch_merger_add_bundle(trunk_branch_merger *merger, - cache *cc, - const btree_config *btree_cfg, const bundle *routed) { + platform_assert(merger != NULL); + platform_assert(routed != NULL); + platform_status rc = vector_ensure_capacity( - &merger->itors, - vector_length(&merger->itors) + bundle_num_branches(routed)); + &merger->branches, + vector_length(&merger->branches) + bundle_num_branches(routed)); if (!SUCCESS(rc)) { platform_error_log("%s():%d: vector_ensure_capacity() failed: %s", __func__, @@ -2425,8 +2413,6 @@ trunk_branch_merger_add_bundle(trunk_branch_merger *merger, for (uint64 i = 0; i < bundle_num_branches(routed); i++) { branch_ref bref = vector_get(&routed->branches, i); rc = trunk_branch_merger_add_branch(merger, - cc, - btree_cfg, branch_ref_addr(bref), bundle_branch_type(routed)); if (!SUCCESS(rc)) { @@ -2442,24 +2428,79 @@ trunk_branch_merger_add_bundle(trunk_branch_merger *merger, static platform_status trunk_branch_merger_build_merge_itor(trunk_branch_merger *merger, + cache *cc, merge_behavior merge_mode) { + platform_assert(merger != NULL); + platform_assert(cc != NULL); platform_assert(merger->merge_itor == NULL); // A compaction/leaf-split merge reads each input branch end to end, so give // the branches a soft share of the read-ahead budget. 
- uint64 num_branches = vector_length(&merger->itors); + uint64 num_branches = vector_length(&merger->branches); + uint64 extent_size = cache_config_extent_size( + merger->cfg->btree_cfg->cache_cfg); uint32 lookahead = prefetch_budget_to_extent_lookahead( - merger->cc, merger->prefetch_budget, num_branches); - if (lookahead > 0) { - for (uint64 i = 0; i < num_branches; i++) { - btree_iterator *itor = (btree_iterator *)vector_get(&merger->itors, i); - btree_iterator_set_prefetch_lookahead(itor, lookahead); + extent_size, merger->cfg->prefetch_budget, num_branches); + if (lookahead == 0) { + lookahead = 1; + } + + platform_status rc = vector_ensure_capacity(&merger->itors, num_branches); + if (!SUCCESS(rc)) { + platform_error_log("%s():%d: vector_ensure_capacity() failed: %s", + __func__, + __LINE__, + platform_status_to_string(rc)); + return rc; + } + + for (uint64 i = 0; i < num_branches; i++) { + trunk_branch_info branch = vector_get(&merger->branches, i); + btree_iterator *itor = TYPED_MALLOC(merger->hid, itor); + if (itor == NULL) { + platform_error_log( + "%s():%d: platform_malloc() failed", __func__, __LINE__); + return STATUS_NO_MEMORY; + } + rc = btree_iterator_init(cc, + merger->cfg->btree_cfg, + itor, + branch.addr, + branch.type, + greater_than_or_equal, + merger->min_key, + less_than, + merger->max_key, + greater_than_or_equal, + merger->min_key, + TRUE, + FALSE, + merger->height, + lookahead); + if (!SUCCESS(rc)) { + platform_error_log("%s():%d: btree_iterator_init() failed: %s", + __func__, + __LINE__, + platform_status_to_string(rc)); + platform_free(merger->hid, itor); + return rc; + } + + rc = vector_append(&merger->itors, (iterator *)itor); + if (!SUCCESS(rc)) { + platform_error_log("%s():%d: vector_append() failed: %s", + __func__, + __LINE__, + platform_status_to_string(rc)); + btree_iterator_deinit(itor); + platform_free(merger->hid, itor); + return rc; } } return merge_iterator_create(merger->hid, - merger->data_cfg, + merger->cfg->data_cfg, 
num_branches, vector_data(&merger->itors), merge_mode, @@ -2470,7 +2511,7 @@ trunk_branch_merger_build_merge_itor(trunk_branch_merger *merger, static platform_status trunk_branch_merger_deinit(trunk_branch_merger *merger) { - platform_status rc; + platform_status rc = STATUS_OK; if (merger->merge_itor != NULL) { rc = merge_iterator_destroy(merger->hid, &merger->merge_itor); } @@ -2481,6 +2522,7 @@ trunk_branch_merger_deinit(trunk_branch_merger *merger) platform_free(merger->hid, itor); } vector_deinit(&merger->itors); + vector_deinit(&merger->branches); return rc; } @@ -3962,15 +4004,11 @@ bundle_compaction_task(task *arg) trunk_branch_merger merger; trunk_branch_merger_init(&merger, PROCESS_PRIVATE_HEAP_ID, - context->cfg->data_cfg, - context->cc, - context->cfg->prefetch_budget, + context->cfg, key_buffer_key(&state->key), key_buffer_key(&state->ubkey), 0); rc = trunk_branch_merger_add_branches(&merger, - context->cc, - context->cfg->btree_cfg, vector_length(&bc->input.branches), vector_data(&bc->input.branches)); if (!SUCCESS(rc)) { @@ -3997,7 +4035,9 @@ bundle_compaction_task(task *arg) goto cleanup_branch_merger; } - rc = trunk_branch_merger_build_merge_itor(&merger, bc->input.merge_mode); + rc = trunk_branch_merger_build_merge_itor(&merger, + context->cc, + bc->input.merge_mode); if (!SUCCESS(rc)) { platform_error_log( "branch_merger_build_merge_itor failed for state: %p bc: %p: %s\n", @@ -4571,17 +4611,13 @@ leaf_split_select_pivots(trunk_context *context, trunk_branch_merger merger; trunk_branch_merger_init(&merger, PROCESS_PRIVATE_HEAP_ID, - context->cfg->data_cfg, - context->cc, - context->cfg->prefetch_budget, + context->cfg, min_key, max_key, context->cfg->branch_rough_count_height); - rc = trunk_branch_merger_add_bundle(&merger, - context->cc, - context->cfg->btree_cfg, - vector_get_ptr(&leaf->pivot_bundles, 0)); + rc = trunk_branch_merger_add_bundle( + &merger, vector_get_ptr(&leaf->pivot_bundles, 0)); if (!SUCCESS(rc)) { 
platform_error_log("leaf_split_select_pivots: " "branch_merger_add_bundle failed: %d\n", @@ -4594,8 +4630,7 @@ leaf_split_select_pivots(trunk_context *context, bundle_num++) { bundle *bndl = vector_get_ptr(&leaf->inflight_bundles, bundle_num); - rc = trunk_branch_merger_add_bundle( - &merger, context->cc, context->cfg->btree_cfg, bndl); + rc = trunk_branch_merger_add_bundle(&merger, bndl); if (!SUCCESS(rc)) { platform_error_log("leaf_split_select_pivots: " "branch_merger_add_bundle failed: %d\n", @@ -4604,7 +4639,7 @@ leaf_split_select_pivots(trunk_context *context, } } - rc = trunk_branch_merger_build_merge_itor(&merger, MERGE_RAW); + rc = trunk_branch_merger_build_merge_itor(&merger, context->cc, MERGE_RAW); if (!SUCCESS(rc)) { platform_error_log("leaf_split_select_pivots: " "branch_merger_build_merge_itor failed: %d\n", diff --git a/src/trunk.h b/src/trunk.h index ee1e60a0..7df20d09 100644 --- a/src/trunk.h +++ b/src/trunk.h @@ -181,18 +181,6 @@ typedef struct trunk_ondisk_node_handle { page_handle *inflight_bundle_page; } trunk_ondisk_node_handle; -typedef struct trunk_branch_merger { - platform_heap_id hid; - const data_config *data_cfg; - cache *cc; // for deep-prefetch budget sizing - uint64 prefetch_budget; // soft read-ahead bytes across the merge - key min_key; - key max_key; - uint64 height; - merge_iterator *merge_itor; - iterator_vector itors; -} trunk_branch_merger; - /******************************** * Lifecycle ********************************/ diff --git a/tests/functional/btree_test.c b/tests/functional/btree_test.c index c4e0e29b..c465f150 100644 --- a/tests/functional/btree_test.c +++ b/tests/functional/btree_test.c @@ -391,10 +391,8 @@ test_btree_scan_once(cache *cc, min_key, do_prefetch, copy_nodes, - 0); - if (SUCCESS(rc) && do_prefetch) { - btree_iterator_set_prefetch_lookahead(&itor, prefetch_lookahead); - } + 0, + prefetch_lookahead); *init_elapsed_ns += platform_timestamp_elapsed(start_time); if (!SUCCESS(rc)) { return rc; @@ -693,7 
+691,8 @@ test_btree_scan_perf(cache *cc, NEGATIVE_INFINITY_KEY, FALSE, FALSE, - 0); + 0, + 1); if (!SUCCESS(rc)) { goto out; } @@ -1090,7 +1089,8 @@ test_btree_basic(cache *cc, NEGATIVE_INFINITY_KEY, FALSE, FALSE, - 0); + 0, + 1); platform_assert_status_ok(rc); platform_default_log("btree iterator init time %luns\n", platform_timestamp_elapsed(start_time)); @@ -1275,7 +1275,8 @@ test_btree_create_packed_trees(cache *cc, NEGATIVE_INFINITY_KEY, FALSE, FALSE, - 0); + 0, + 1); platform_assert_status_ok(rc); btree_pack_req req; @@ -1333,7 +1334,8 @@ test_count_tuples_in_range(cache *cc, low_key, TRUE, FALSE, - 0); + 0, + 1); if (!SUCCESS(rc)) { return rc; } @@ -1432,7 +1434,8 @@ test_btree_print_all_keys(cache *cc, low_key, TRUE, FALSE, - 0); + 0, + 1); platform_assert_status_ok(rc); while (iterator_can_curr(&itor.super)) { key curr_key; @@ -1511,7 +1514,8 @@ test_btree_merge_basic(cache *cc, lo, TRUE, FALSE, - 0); + 0, + 1); platform_assert_status_ok(rc); itor_arr[tree_no] = &btree_itor_arr[tree_no].super; } @@ -1734,6 +1738,7 @@ test_btree_rough_iterator(cache *cc, NEGATIVE_INFINITY_KEY, TRUE, TRUE, + 1, 1); platform_assert_status_ok(rc); if (iterator_can_curr(&rough_btree_itor[tree_no].super)) { @@ -1900,7 +1905,8 @@ test_btree_merge_perf(cache *cc, min_key, TRUE, FALSE, - 0); + 0, + 1); platform_assert_status_ok(rc); itor_arr[tree_no] = &btree_itor_arr[tree_no].super; } diff --git a/tests/unit/btree_stress_test.c b/tests/unit/btree_stress_test.c index 0182208c..0ff2224f 100644 --- a/tests/unit/btree_stress_test.c +++ b/tests/unit/btree_stress_test.c @@ -709,7 +709,8 @@ iterator_tests(cache *cc, start_key, FALSE, FALSE, - 0); + 0, + 1); ASSERT_TRUE(SUCCESS(rc)); iterator *iter = (iterator *)&dbiter; @@ -762,7 +763,8 @@ iterator_seek_tests(cache *cc, start_key, FALSE, FALSE, - 0); + 0, + 1); ASSERT_TRUE(SUCCESS(rc)); iterator *iter = (iterator *)&dbiter; @@ -810,7 +812,8 @@ pack_tests(cache *cc, NEGATIVE_INFINITY_KEY, FALSE, FALSE, - 0); + 0, + 1); 
ASSERT_TRUE(SUCCESS(rc)); rc = STATUS_TEST_FAILED; From a9b04ae84421882ff80473b77b236b22ef3769c8 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Thu, 25 Jun 2026 23:56:05 -0700 Subject: [PATCH 06/15] add prefetch.h Signed-off-by: Rob Johnson --- src/prefetch.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 src/prefetch.h diff --git a/src/prefetch.h b/src/prefetch.h new file mode 100644 index 00000000..3706c621 --- /dev/null +++ b/src/prefetch.h @@ -0,0 +1,45 @@ +// Copyright 2026 VMware, Inc. +// SPDX-License-Identifier: Apache-2.0 + +/* + * prefetch.h -- + * + * Shared helpers for read-ahead policy. + */ + +#pragma once + +#include "platform_assert.h" + +/* + * Minimum deep-prefetch depth for one eligible stream. Keeping at least this + * many extents in flight is what makes deep prefetch worthwhile compared to the + * legacy single-extent-ahead path. + */ +#define PREFETCH_MIN_EXTENT_LOOKAHEAD (2) + +/* + * Convert a soft byte budget into a per-stream extent lookahead. The budget is + * divided across the streams, but each active stream gets at least + * PREFETCH_MIN_EXTENT_LOOKAHEAD extents. With many streams, that minimum can + * intentionally exceed the byte budget; the budget is a read-ahead target, not + * a hard cap. 
+ */ +static inline uint32 +prefetch_budget_to_extent_lookahead(uint64 extent_size, + uint64 prefetch_budget, + uint64 num_streams) +{ + platform_assert(extent_size != 0); + + if (prefetch_budget == 0 || num_streams == 0) { + return 0; + } + + uint64 budget_extents = prefetch_budget / extent_size; + uint64 per_stream = budget_extents / num_streams; + if (per_stream < PREFETCH_MIN_EXTENT_LOOKAHEAD) { + per_stream = PREFETCH_MIN_EXTENT_LOOKAHEAD; + } + return (uint32)per_stream; +} From 3c280956960ba67b16d1daf4847d925687c2501e Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Thu, 25 Jun 2026 23:58:37 -0700 Subject: [PATCH 07/15] formatting Signed-off-by: Rob Johnson --- src/core.c | 8 ++++---- src/trunk.c | 20 +++++++++----------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/src/core.c b/src/core.c index 24394384..f663a9f9 100644 --- a/src/core.c +++ b/src/core.c @@ -1200,10 +1200,10 @@ core_range_iterator_init(core_handle *spl, n_prefetch_branches++; } } - uint32 deep_lookahead = prefetch_budget_to_extent_lookahead( - cache_extent_size(spl->cc), - spl->cfg.prefetch_budget, - n_prefetch_branches); + uint32 deep_lookahead = + prefetch_budget_to_extent_lookahead(cache_extent_size(spl->cc), + spl->cfg.prefetch_budget, + n_prefetch_branches); uint64 started_inits = 0; for (uint64 i = 0; i < range_itor->num_branches; i++) { diff --git a/src/trunk.c b/src/trunk.c index a3a16e87..c1b1d2e7 100644 --- a/src/trunk.c +++ b/src/trunk.c @@ -2412,9 +2412,8 @@ trunk_branch_merger_add_bundle(trunk_branch_merger *merger, for (uint64 i = 0; i < bundle_num_branches(routed); i++) { branch_ref bref = vector_get(&routed->branches, i); - rc = trunk_branch_merger_add_branch(merger, - branch_ref_addr(bref), - bundle_branch_type(routed)); + rc = trunk_branch_merger_add_branch( + merger, branch_ref_addr(bref), bundle_branch_type(routed)); if (!SUCCESS(rc)) { platform_error_log("%s():%d: btree_merger_add_branch() failed: %s", __func__, @@ -2438,9 +2437,9 @@ 
trunk_branch_merger_build_merge_itor(trunk_branch_merger *merger, // A compaction/leaf-split merge reads each input branch end to end, so give // the branches a soft share of the read-ahead budget. uint64 num_branches = vector_length(&merger->branches); - uint64 extent_size = cache_config_extent_size( - merger->cfg->btree_cfg->cache_cfg); - uint32 lookahead = prefetch_budget_to_extent_lookahead( + uint64 extent_size = + cache_config_extent_size(merger->cfg->btree_cfg->cache_cfg); + uint32 lookahead = prefetch_budget_to_extent_lookahead( extent_size, merger->cfg->prefetch_budget, num_branches); if (lookahead == 0) { lookahead = 1; @@ -4035,9 +4034,8 @@ bundle_compaction_task(task *arg) goto cleanup_branch_merger; } - rc = trunk_branch_merger_build_merge_itor(&merger, - context->cc, - bc->input.merge_mode); + rc = trunk_branch_merger_build_merge_itor( + &merger, context->cc, bc->input.merge_mode); if (!SUCCESS(rc)) { platform_error_log( "branch_merger_build_merge_itor failed for state: %p bc: %p: %s\n", @@ -4616,8 +4614,8 @@ leaf_split_select_pivots(trunk_context *context, max_key, context->cfg->branch_rough_count_height); - rc = trunk_branch_merger_add_bundle( - &merger, vector_get_ptr(&leaf->pivot_bundles, 0)); + rc = trunk_branch_merger_add_bundle(&merger, + vector_get_ptr(&leaf->pivot_bundles, 0)); if (!SUCCESS(rc)) { platform_error_log("leaf_split_select_pivots: " "branch_merger_add_bundle failed: %d\n", From ccb19d4a974a718f284a99371c3532e2fd1f198f Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Fri, 26 Jun 2026 12:55:24 -0700 Subject: [PATCH 08/15] cleanups Signed-off-by: Rob Johnson --- src/btree.c | 262 +++++++++++++++++++++++---------- src/btree.h | 16 +- src/btree_private.h | 136 ++++++++++++++++- src/core.c | 10 +- src/prefetch.h | 4 +- src/trunk.c | 1 - tests/functional/btree_test.c | 22 +-- tests/unit/btree_stress_test.c | 9 +- tests/unit/btree_test.c | 5 +- 9 files changed, 332 insertions(+), 133 deletions(-) diff --git a/src/btree.c b/src/btree.c 
index 997cbb26..01f48c33 100644 --- a/src/btree.c +++ b/src/btree.c @@ -2661,13 +2661,15 @@ static inline uint64 btree_iterator_curr_live_prev_addr(btree_iterator *itor) { if (!btree_iterator_curr_is_copy(itor)) { - return itor->curr.hdr->prev_addr; + return btree_hdr_prev_addr( + itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type); } btree_node live_curr; live_curr.addr = itor->curr.addr; btree_node_get(itor->cc, itor->cfg, &live_curr, itor->page_type); - uint64 prev_addr = live_curr.hdr->prev_addr; + uint64 prev_addr = + btree_hdr_prev_addr(itor->cfg, live_curr.hdr, live_curr.addr, itor->page_type); btree_node_unget(itor->cc, itor->cfg, &live_curr); return prev_addr; } @@ -2686,16 +2688,25 @@ btree_iterator_end_key_beyond_curr(btree_iterator *itor) uint64 num_entries = btree_num_entries(itor->curr.hdr); if (key_is_positive_infinity(itor->max_key)) { - return itor->curr.hdr->next_addr != 0; + return btree_hdr_next_addr( + itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type) + != 0; } if (num_entries == 0 || itor->height > btree_height(itor->curr.hdr)) { - return num_entries == 0 && itor->curr.hdr->next_addr != 0; + return num_entries == 0 + && btree_hdr_next_addr(itor->cfg, + itor->curr.hdr, + itor->curr.addr, + itor->page_type) + != 0; } key last_key = btree_iterator_get_node_key(itor, itor->curr.hdr, num_entries - 1); return btree_key_compare(itor->cfg, itor->max_key, last_key) > 0 - && itor->curr.hdr->next_addr != 0; + && btree_hdr_next_addr( + itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type) + != 0; } static void @@ -2927,7 +2938,7 @@ btree_prefetch_cursor_pump(btree_iterator *itor) uint64 meta_page_addr = itor->curr.hdr->meta_page_addr; if (meta_page_addr == 0) { - // Node predates the stamp, or a tiny tree: fall back to legacy. + // No extent list is available; use the header extent links. 
pf->state = BTREE_PREFETCH_DISABLED; return FALSE; } @@ -2942,7 +2953,7 @@ btree_prefetch_cursor_pump(btree_iterator *itor) return FALSE; } if (status != MINI_META_CURSOR_ENTRY) { - // Couldn't locate our extent from this stamp; fall back to legacy. + // No extent list entry is available; use the header extent links. mini_meta_cursor_deinit(&pf->meta_cursor); pf->state = BTREE_PREFETCH_DISABLED; return FALSE; @@ -2984,11 +2995,17 @@ btree_prefetch_cursor_on_boundary(btree_iterator *itor) btree_prefetch_cursor_fill(itor); } +static inline bool32 +btree_iterator_prefetch_enabled(btree_iterator *itor) +{ + return itor->prefetch.lookahead > 0; +} + /* * (Re)start deep prefetch at the iterator's current leaf. Non-blocking: kicks * off meta-page IO and leaves the cursor PRIMING unless the meta page is - * already resident. Falls back to DISABLED (legacy single-extent prefetch) when - * deep prefetch does not apply. + * already resident. Leaves the deep cursor DISABLED when deep prefetch does not + * apply; the header extent links may still prefetch one extent. */ static void btree_prefetch_cursor_start(btree_iterator *itor, bool32 going_forward) @@ -3004,7 +3021,7 @@ btree_prefetch_cursor_start(btree_iterator *itor, bool32 going_forward) pf->leaf_batch = NUM_BLOB_BATCHES + itor->height; pf->prefetch_blobs = (itor->height == 0); - if (!itor->do_prefetch || pf->lookahead <= 1 + if (!btree_iterator_prefetch_enabled(itor) || pf->lookahead <= 1 || itor->page_type != PAGE_TYPE_BRANCH || itor->curr.page == NULL) { return; @@ -3026,9 +3043,10 @@ btree_prefetch_cursor_deinit(btree_iterator *itor) /* * Drive prefetching after the iterator advances one leaf, in either direction. * On a direction change, restarts the cursor in the new direction (resetting - * the ramp). 
Otherwise pumps the non-blocking prime while PRIMING, then, on an - * extent-boundary crossing, refills the deep window (ACTIVE) or issues the - * legacy single-extent prefetch (forward only; no prev_extent_addr in headers). + * the ramp). Otherwise pumps the non-blocking prime while PRIMING. Deep + * prefetch refills after an extent-boundary crossing; header-link prefetch runs + * after a crossing or direction restart, when the adjacent extent link is useful + * and not just a duplicate prefetch for the current extent. */ static void btree_iterator_prefetch_on_advance(btree_iterator *itor, @@ -3037,33 +3055,41 @@ btree_iterator_prefetch_on_advance(btree_iterator *itor, { cache *cc = itor->cc; btree_prefetch_cursor *pf = &itor->prefetch; + bool32 restarted = FALSE; // Direction change: restart cursor in the new direction, resetting ramp. if (pf->state != BTREE_PREFETCH_DISABLED && pf->going_forward != going_forward) { btree_prefetch_cursor_start(itor, going_forward); - return; + restarted = TRUE; } bool32 positioned_now = FALSE; if (pf->state == BTREE_PREFETCH_PRIMING) { positioned_now = btree_prefetch_cursor_pump(itor); } - if (btree_addrs_share_extent(cc, last_addr, itor->curr.addr)) { - return; - } + bool32 crossed_extent = !btree_addrs_share_extent(cc, last_addr, itor->curr.addr); if (pf->state == BTREE_PREFETCH_ACTIVE) { - if (!positioned_now) { + if (crossed_extent && !positioned_now && !restarted) { btree_prefetch_cursor_on_boundary(itor); } - } else if (going_forward && itor->do_prefetch - && itor->curr.hdr->next_extent_addr != 0 - && !btree_addrs_share_extent(cc, itor->curr.addr, itor->end_addr)) + } else if (btree_iterator_prefetch_enabled(itor) + && (crossed_extent || restarted)) { - cache_prefetch(cc, itor->curr.hdr->next_extent_addr, itor->page_type); - } else if (!going_forward) { - // No legacy backward prefetch: leaf headers have no prev_extent_addr. + uint64 extent_addr = + going_forward + ? 
btree_hdr_next_extent_addr( + itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type) + : btree_hdr_prev_extent_addr( + itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type); + + if (extent_addr != 0 + && (!going_forward + || !btree_addrs_share_extent(cc, itor->curr.addr, itor->end_addr))) + { + cache_prefetch(cc, extent_addr, itor->page_type); + } } } @@ -3077,7 +3103,8 @@ static void btree_iterator_next_leaf(btree_iterator *itor) { uint64 last_addr = itor->curr.addr; - uint64 next_addr = itor->curr.hdr->next_addr; + uint64 next_addr = + btree_hdr_next_addr(itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type); btree_iterator_release_curr(itor); btree_iterator_get_curr_addr(itor, next_addr); itor->idx = 0; @@ -3092,7 +3119,10 @@ btree_iterator_next_leaf_async(btree_iterator_async_state *state, uint64 depth) async_begin(state, depth); state->last_addr = state->itor->curr.addr; - state->next_addr = state->itor->curr.hdr->next_addr; + state->next_addr = btree_hdr_next_addr(state->itor->cfg, + state->itor->curr.hdr, + state->itor->curr.addr, + state->itor->page_type); btree_iterator_release_curr(state->itor); state->itor->curr.addr = state->next_addr; @@ -3112,8 +3142,7 @@ btree_iterator_next_leaf_async(btree_iterator_async_state *state, uint64 depth) state->itor->idx = 0; state->itor->curr_min_idx = -1; - // Prefetching (pump + boundary refill / legacy) is all non-blocking, so it - // needs no awaits here. + // Prefetching is all non-blocking, so it needs no awaits here. btree_iterator_prefetch_on_advance(state->itor, state->last_addr, TRUE); async_return(state); @@ -3143,8 +3172,12 @@ btree_iterator_prev_leaf(btree_iterator *itor) * old curr node and the new one. In this case, we can just walk * forward until we find the leaf whose successor is our old leaf. 
*/ - while (itor->curr.hdr->next_addr != last_addr) { - uint64 next_addr = itor->curr.hdr->next_addr; + while (btree_hdr_next_addr( + cfg, itor->curr.hdr, itor->curr.addr, itor->page_type) + != last_addr) + { + uint64 next_addr = + btree_hdr_next_addr(cfg, itor->curr.hdr, itor->curr.addr, itor->page_type); btree_iterator_release_curr(itor); btree_iterator_get_curr_addr(itor, next_addr); } @@ -3160,7 +3193,10 @@ btree_iterator_prev_leaf(btree_iterator *itor) itor->curr_min_idx = find_key_in_node( itor, itor->curr.hdr, itor->min_key, itor->min_key_comparison, NULL); } - if (itor->curr.hdr->prev_addr == 0 && itor->curr_min_idx == -1) { + if (btree_hdr_prev_addr(cfg, itor->curr.hdr, itor->curr.addr, itor->page_type) + == 0 + && itor->curr_min_idx == -1) + { itor->curr_min_idx = 0; } @@ -3186,10 +3222,16 @@ btree_iterator_prev_leaf_async(btree_iterator_async_state *state, uint64 depth) state->live_curr.page = cache_get_async_state_result(&state->cache_get_state); state->live_curr.hdr = (btree_hdr *)state->live_curr.page->data; - state->prev_addr = state->live_curr.hdr->prev_addr; + state->prev_addr = btree_hdr_prev_addr(state->itor->cfg, + state->live_curr.hdr, + state->live_curr.addr, + state->itor->page_type); btree_node_unget(state->itor->cc, state->itor->cfg, &state->live_curr); } else { - state->prev_addr = state->itor->curr.hdr->prev_addr; + state->prev_addr = btree_hdr_prev_addr(state->itor->cfg, + state->itor->curr.hdr, + state->itor->curr.addr, + state->itor->page_type); } btree_iterator_release_curr(state->itor); state->itor->curr.addr = state->prev_addr; @@ -3212,8 +3254,16 @@ btree_iterator_prev_leaf_async(btree_iterator_async_state *state, uint64 depth) * old curr node and the new one. In this case, we can just walk * forward until we find the leaf whose successor is our old leaf. 
*/ - while (state->itor->curr.hdr->next_addr != state->curr_addr) { - state->next_addr = state->itor->curr.hdr->next_addr; + while (btree_hdr_next_addr(state->itor->cfg, + state->itor->curr.hdr, + state->itor->curr.addr, + state->itor->page_type) + != state->curr_addr) + { + state->next_addr = btree_hdr_next_addr(state->itor->cfg, + state->itor->curr.hdr, + state->itor->curr.addr, + state->itor->page_type); btree_iterator_release_curr(state->itor); state->itor->curr.addr = state->next_addr; @@ -3251,7 +3301,12 @@ btree_iterator_prev_leaf_async(btree_iterator_async_state *state, uint64 depth) state->itor->min_key_comparison, NULL); } - if (state->itor->curr.hdr->prev_addr == 0 && state->itor->curr_min_idx == -1) + if (btree_hdr_prev_addr(state->itor->cfg, + state->itor->curr.hdr, + state->itor->curr.addr, + state->itor->page_type) + == 0 + && state->itor->curr_min_idx == -1) { state->itor->curr_min_idx = 0; } @@ -3416,7 +3471,11 @@ find_btree_node_and_get_idx_bounds(btree_iterator *itor, itor->curr_min_idx = !found && tmp == 0 ? tmp - 1 : tmp; // if min_key is not within the current node but there is no previous node // then set curr_min_idx to 0 - if (itor->curr_min_idx == -1 && itor->curr.hdr->prev_addr == 0) { + if (itor->curr_min_idx == -1 + && btree_hdr_prev_addr( + itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type) + == 0) + { itor->curr_min_idx = 0; } @@ -3506,7 +3565,12 @@ find_btree_node_and_get_idx_bounds_async(btree_iterator_async_state *state, !state->found && state->tmp == 0 ? 
state->tmp - 1 : state->tmp; // if min_key is not within the current node but there is no previous node // then set curr_min_idx to 0 - if (state->itor->curr_min_idx == -1 && state->itor->curr.hdr->prev_addr == 0) + if (state->itor->curr_min_idx == -1 + && btree_hdr_prev_addr(state->itor->cfg, + state->itor->curr.hdr, + state->itor->curr.addr, + state->itor->page_type) + == 0) { state->itor->curr_min_idx = 0; } @@ -3624,7 +3688,6 @@ btree_iterator_init_common(cache *cc, comparison max_key_comparison, key max_key, key start_key, - bool32 do_prefetch, bool32 copy_nodes, uint32 height, uint32 prefetch_lookahead, @@ -3657,7 +3720,6 @@ btree_iterator_init_common(cache *cc, itor->cc = cc; itor->cfg = cfg; itor->root_addr = root_addr; - itor->do_prefetch = do_prefetch; itor->height = height; itor->copy_nodes = copy_nodes; itor->min_key_comparison = min_key_comparison; @@ -3692,7 +3754,6 @@ btree_iterator_init(cache *cc, key max_key, comparison start_type, key start_key, - bool32 do_prefetch, bool32 copy_nodes, uint32 height, uint32 prefetch_lookahead) @@ -3707,7 +3768,6 @@ btree_iterator_init(cache *cc, max_key_comparison, max_key, start_key, - do_prefetch, copy_nodes, height, prefetch_lookahead, @@ -3719,13 +3779,15 @@ btree_iterator_init(cache *cc, find_btree_node_and_get_idx_bounds(itor, start_key, start_type); btree_prefetch_cursor_start(itor, TRUE); - // While the deep cursor is priming (or disabled), cover the next extent with - // the legacy single-extent-ahead prefetch via next_extent_addr. - if (itor->prefetch.state != BTREE_PREFETCH_ACTIVE && itor->do_prefetch - && itor->curr.hdr->next_extent_addr != 0 + // While the deep cursor is priming or disabled, keep the next forward extent + // warm when the leaf header names one. 
+ uint64 next_extent_addr = btree_hdr_next_extent_addr( + itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type); + if (itor->prefetch.state != BTREE_PREFETCH_ACTIVE + && btree_iterator_prefetch_enabled(itor) && next_extent_addr != 0 && !btree_addrs_share_extent(cc, itor->curr.addr, itor->end_addr)) { - cache_prefetch(cc, itor->curr.hdr->next_extent_addr, itor->page_type); + cache_prefetch(cc, next_extent_addr, itor->page_type); } debug_assert(!iterator_can_curr((iterator *)itor) @@ -3749,7 +3811,6 @@ btree_iterator_init_async(btree_iterator_async_state *state) state->max_key_comparison, state->max_key, state->start_key, - state->do_prefetch, state->copy_nodes, state->height, state->prefetch_lookahead, @@ -3763,17 +3824,18 @@ btree_iterator_init_async(btree_iterator_async_state *state) btree_iterator_copy_curr_if_needed(state->itor); btree_prefetch_cursor_start(state->itor, TRUE); - // While the deep cursor is priming (or disabled), cover the next extent with - // the legacy single-extent-ahead prefetch via next_extent_addr. + // While the deep cursor is priming or disabled, keep the next forward extent + // warm when the leaf header names one. 
+ uint64 next_extent_addr = btree_hdr_next_extent_addr(state->itor->cfg, + state->itor->curr.hdr, + state->itor->curr.addr, + state->itor->page_type); if (state->itor->prefetch.state != BTREE_PREFETCH_ACTIVE - && state->itor->do_prefetch - && state->itor->curr.hdr->next_extent_addr != 0 + && btree_iterator_prefetch_enabled(state->itor) && next_extent_addr != 0 && !btree_addrs_share_extent( state->cc, state->itor->curr.addr, state->itor->end_addr)) { - cache_prefetch(state->cc, - state->itor->curr.hdr->next_extent_addr, - state->itor->page_type); + cache_prefetch(state->cc, next_extent_addr, state->itor->page_type); } debug_assert(!iterator_can_curr((iterator *)state->itor) @@ -3804,16 +3866,14 @@ btree_iterator_deinit(btree_iterator *itor) * B-tree packing functions ****************************/ -// generation number isn't used in packed btrees +// Branch nodes use the header union for meta_page_addr. static inline void btree_pack_node_init_hdr(const btree_config *cfg, btree_hdr *hdr, - uint64 next_extent, uint8 height) { btree_init_hdr(cfg, hdr); - hdr->next_extent_addr = next_extent; - hdr->height = height; + hdr->height = height; } static inline void @@ -3823,6 +3883,7 @@ btree_pack_setup_start(btree_pack_req *req) ZERO_ARRAY(req->edge); ZERO_ARRAY(req->edge_stats); ZERO_ARRAY(req->num_edges); + ZERO_ARRAY(req->level_has_nodes); // we create a root here, but we won't build it with the rest // of the tree, we'll copy into it at the end @@ -3861,14 +3922,12 @@ btree_pack_create_next_node(btree_pack_req *req, uint64 height, key pivot); static inline void btree_pack_link_node(btree_pack_req *req, uint64 height, - uint64 offset, - uint64 next_extent_addr) + uint64 offset) { btree_node *edge = &req->edge[height][offset]; btree_pivot_stats *edge_stats = &req->edge_stats[height][offset]; key pivot = height ? 
btree_get_pivot(req->cfg, edge->hdr, 0) : btree_get_tuple_key(req->cfg, edge->hdr, 0); - edge->hdr->next_extent_addr = next_extent_addr; btree_node_unlock(req->cc, req->cfg, edge); btree_node_unclaim(req->cc, req->cfg, edge); // Cannot fully unlock edge yet because the key "pivot" may point into it. @@ -3900,10 +3959,17 @@ btree_pack_link_node(btree_pack_req *req, static inline void btree_pack_link_extent(btree_pack_req *req, uint64 height, + bool32 last_extent_in_level, uint64 next_extent_addr) { + debug_assert(0 < req->num_edges[height]); + if (last_extent_in_level) { + btree_hdr_set_last_in_level( + req->edge[height][req->num_edges[height] - 1].hdr); + } for (int i = 0; i < req->num_edges[height]; i++) { - btree_pack_link_node(req, height, i, next_extent_addr); + req->edge[height][i].hdr->next_addr = next_extent_addr; + btree_pack_link_node(req, height, i); } req->num_edges[height] = 0; } @@ -3919,19 +3985,26 @@ btree_pack_create_next_node(btree_pack_req *req, uint64 height, key pivot) &node_next_extent, PAGE_TYPE_BRANCH, &new_node); - btree_pack_node_init_hdr(req->cfg, new_node.hdr, 0, height); + btree_pack_node_init_hdr(req->cfg, new_node.hdr, height); // Record where this node's extent is listed in the mini_allocator meta // stream, so an iterator can position a prefetch cursor in O(1). Must come // after init_hdr, which zeroes the header. 
new_node.hdr->meta_page_addr = mini_current_extent_meta_page(&req->mini, NUM_BLOB_BATCHES + height); + if (!req->level_has_nodes[height]) { + btree_hdr_set_first_in_level(new_node.hdr); + } + if (0 < req->num_edges[height]) { btree_node *old_node = btree_pack_get_current_node(req, height); - old_node->hdr->next_addr = new_node.addr; - new_node.hdr->prev_addr = old_node->addr; + new_node.hdr->prev_addr = old_node->hdr->prev_addr; if (!btree_addrs_share_extent(req->cc, old_node->addr, new_node.addr)) { - btree_pack_link_extent(req, height, new_node.addr); + debug_assert(btree_page_is_last_in_extent(req->cfg, old_node->addr)); + debug_assert(btree_page_is_first_in_extent(req->cfg, new_node.addr)); + new_node.hdr->prev_addr = + btree_extent_base_addr(req->cc, old_node->addr); + btree_pack_link_extent(req, height, FALSE, new_node.addr); } } @@ -3941,6 +4014,7 @@ btree_pack_create_next_node(btree_pack_req *req, uint64 height, key pivot) req->edge[height][req->num_edges[height]] = new_node; req->num_edges[height]++; + req->level_has_nodes[height] = TRUE; debug_assert(btree_pack_get_current_node_stats(req, height)->num_kvs == 0); return &req->edge[height][req->num_edges[height] - 1]; } @@ -4028,18 +4102,21 @@ btree_pack_post_loop(btree_pack_req *req, key last_key) int h = 0; while (h < req->height || 1 < req->num_edges[h]) { - btree_pack_link_extent(req, h, 0); + btree_pack_link_extent(req, h, TRUE, 0); h++; } + btree_hdr_set_last_in_level(req->edge[req->height][0].hdr); root.addr = req->root_addr; btree_node_get(cc, cfg, &root, PAGE_TYPE_BRANCH); debug_only bool32 success = btree_node_claim(cc, cfg, &root); debug_assert(success); btree_node_lock(cc, cfg, &root); memmove(root.hdr, req->edge[req->height][0].hdr, btree_page_size(cfg)); - // fix the root next extent - root.hdr->next_extent_addr = 0; + root.hdr->prev_addr = 0; + root.hdr->next_addr = 0; + btree_hdr_set_first_in_level(root.hdr); + btree_hdr_set_last_in_level(root.hdr); btree_node_full_unlock(cc, cfg, &root); 
btree_node_full_unlock(cc, cfg, &req->edge[req->height][0]); @@ -4192,7 +4269,6 @@ btree_count_in_range_by_iterator(cache *cc, max_key, greater_than_or_equal, min_key, - TRUE, FALSE, 0, 1); @@ -4298,10 +4374,24 @@ btree_print_index_node(platform_log_handle *log_handle, log_handle, "** Page type: %s, INDEX NODE \n", page_type_str[type]); platform_log(log_handle, "** Header ptr: %p\n", hdr); platform_log(log_handle, "** addr: %lu \n", addr); - platform_log(log_handle, "** next_addr: %lu \n", hdr->next_addr); - platform_log( - log_handle, "** next_extent_addr: %lu \n", hdr->next_extent_addr); - platform_log(log_handle, "** generation: %lu \n", hdr->generation); + platform_log(log_handle, + "** prev_addr: %lu \n", + btree_hdr_prev_addr(cfg, hdr, addr, type)); + platform_log(log_handle, + "** next_addr: %lu \n", + btree_hdr_next_addr(cfg, hdr, addr, type)); + if (type == PAGE_TYPE_BRANCH) { + platform_log(log_handle, "** flags: %u \n", hdr->flags); + platform_log(log_handle, "** meta_page_addr: %lu \n", hdr->meta_page_addr); + platform_log(log_handle, + "** prev_extent_addr: %lu \n", + btree_hdr_prev_extent_addr(cfg, hdr, addr, type)); + platform_log(log_handle, + "** next_extent_addr: %lu \n", + btree_hdr_next_extent_addr(cfg, hdr, addr, type)); + } else { + platform_log(log_handle, "** generation: %lu \n", hdr->generation); + } platform_log(log_handle, "** height: %u \n", btree_height(hdr)); platform_log(log_handle, "** next_entry: %u \n", hdr->next_entry); platform_log(log_handle, "** num_entries: %u \n", btree_num_entries(hdr)); @@ -4345,10 +4435,24 @@ btree_print_leaf_node(platform_log_handle *log_handle, log_handle, "** Page type: %s, LEAF NODE \n", page_type_str[type]); platform_log(log_handle, "** hdrptr: %p\n", hdr); platform_log(log_handle, "** addr: %lu \n", addr); - platform_log(log_handle, "** next_addr: %lu \n", hdr->next_addr); - platform_log( - log_handle, "** next_extent_addr: %lu \n", hdr->next_extent_addr); - platform_log(log_handle, "** generation: 
%lu \n", hdr->generation); + platform_log(log_handle, + "** prev_addr: %lu \n", + btree_hdr_prev_addr(cfg, hdr, addr, type)); + platform_log(log_handle, + "** next_addr: %lu \n", + btree_hdr_next_addr(cfg, hdr, addr, type)); + if (type == PAGE_TYPE_BRANCH) { + platform_log(log_handle, "** flags: %u \n", hdr->flags); + platform_log(log_handle, "** meta_page_addr: %lu \n", hdr->meta_page_addr); + platform_log(log_handle, + "** prev_extent_addr: %lu \n", + btree_hdr_prev_extent_addr(cfg, hdr, addr, type)); + platform_log(log_handle, + "** next_extent_addr: %lu \n", + btree_hdr_next_extent_addr(cfg, hdr, addr, type)); + } else { + platform_log(log_handle, "** generation: %lu \n", hdr->generation); + } platform_log(log_handle, "** height: %u \n", btree_height(hdr)); platform_log(log_handle, "** next_entry: %u \n", hdr->next_entry); platform_log(log_handle, "** num_entries: %u \n", btree_num_entries(hdr)); diff --git a/src/btree.h b/src/btree.h index 2c695a4c..48795fb0 100644 --- a/src/btree.h +++ b/src/btree.h @@ -140,8 +140,8 @@ typedef struct ONDISK btree_pivot_data { * * Priming is non-blocking: the cursor's meta page is fetched lazily (PRIMING * state) so the iterator's async init never waits on it and the first tuple is - * not delayed. The legacy single-extent-ahead prefetch (via the leaf's - * next_extent_addr) covers the window until the cursor becomes ACTIVE. + * not delayed. The leaf header's extent links are used for prefetching until + * the cursor becomes ACTIVE. * * Depth ramps up (slow-start) from BTREE_PREFETCH_RAMP_MIN toward `lookahead` * as the scan proves long, so short scans don't waste bandwidth reading far @@ -149,7 +149,7 @@ typedef struct ONDISK btree_pivot_data { * scans get the same slow-start treatment. 
*/ typedef enum btree_prefetch_state { - BTREE_PREFETCH_DISABLED = 0, // legacy next_extent_addr path / not applicable + BTREE_PREFETCH_DISABLED = 0, // deep prefetch inactive BTREE_PREFETCH_PRIMING, // meta-page IO kicked off; not yet positioned BTREE_PREFETCH_ACTIVE, // positioned; issuing deep prefetches } btree_prefetch_state; @@ -176,7 +176,6 @@ typedef struct btree_iterator { iterator super; cache *cc; const btree_config *cfg; - bool32 do_prefetch; uint32 height; page_type page_type; // Active memtable iterators copy nodes here and release page locks. @@ -212,6 +211,7 @@ typedef struct btree_pack_req { btree_node edge[BTREE_MAX_HEIGHT][MAX_PAGES_PER_EXTENT]; btree_pivot_stats edge_stats[BTREE_MAX_HEIGHT][MAX_PAGES_PER_EXTENT]; uint32 num_edges[BTREE_MAX_HEIGHT]; + bool32 level_has_nodes[BTREE_MAX_HEIGHT]; merge_accumulator blob_buffer; mini_allocator mini; @@ -342,9 +342,9 @@ async_status btree_lookup_async(btree_lookup_async_state *state); /* - * prefetch_lookahead is measured in leaf extents. Values <= 1 use the legacy - * single-extent prefetch path; values >= 2 enable deep extent prefetch. Ignored - * unless do_prefetch is TRUE. + * prefetch_lookahead is measured in leaf extents. 0 disables prefetch, 1 + * prefetches at most the next extent, and values >= 2 enable deep extent + * prefetch. 
*/ platform_status btree_iterator_init(cache *cc, @@ -358,7 +358,6 @@ btree_iterator_init(cache *cc, key max_key, comparison start_type, key start_key, - bool32 do_prefetch, bool32 copy_nodes, uint32 height, uint32 prefetch_lookahead); @@ -376,7 +375,6 @@ DEFINE_ASYNC_STATE(btree_iterator_async_state, 5, param, key, max_key, param, comparison, start_type, param, key, start_key, - param, bool32, do_prefetch, param, bool32, copy_nodes, param, uint32, height, param, uint32, prefetch_lookahead, diff --git a/src/btree_private.h b/src/btree_private.h index 0d99375b..6f692eb2 100644 --- a/src/btree_private.h +++ b/src/btree_private.h @@ -32,21 +32,31 @@ typedef node_offset table_entry; * ************************************************************************* */ struct ONDISK btree_hdr { + /* + * Memtables store literal previous/next node addresses. Branches compute + * same-extent neighbors from the page address; every branch page in an + * extent stores the adjacent previous/next extent base addresses for its + * level. FIRST_IN_LEVEL and LAST_IN_LEVEL stop logical neighbor computation + * at level edges. + */ uint64 prev_addr; uint64 next_addr; - uint64 next_extent_addr; - // Address of the mini_allocator meta page that lists this node's extent. - // Stamped at pack time (see btree_pack_create_next_node); lets the - // bidirectional prefetch cursor jump straight to this node's position in the - // extent stream instead of scanning from meta_head. - uint64 meta_page_addr; - uint64 generation; + union { + // Branch: mini_allocator meta page that lists this node's extent. + uint64 meta_page_addr; + // Memtable: generation used to detect stale copied nodes. 
+ uint64 generation; + }; uint8 height; + uint8 flags; node_offset next_entry; table_index num_entries; table_entry offsets[]; }; +#define BTREE_HDR_FIRST_IN_LEVEL (1 << 0) +#define BTREE_HDR_LAST_IN_LEVEL (1 << 1) + /* * ************************************************************************* * BTree Node index entries: Disk-resident structure @@ -172,6 +182,118 @@ btree_extent_size(const btree_config *cfg) return cache_config_extent_size(cfg->cache_cfg); } +static inline bool32 +btree_page_is_first_in_extent(const btree_config *cfg, uint64 addr) +{ + return addr % btree_extent_size(cfg) == 0; +} + +static inline bool32 +btree_page_is_last_in_extent(const btree_config *cfg, uint64 addr) +{ + return (addr + btree_page_size(cfg)) % btree_extent_size(cfg) == 0; +} + +static inline void +btree_hdr_set_first_in_level(btree_hdr *hdr) +{ + hdr->flags |= BTREE_HDR_FIRST_IN_LEVEL; +} + +static inline void +btree_hdr_set_last_in_level(btree_hdr *hdr) +{ + hdr->flags |= BTREE_HDR_LAST_IN_LEVEL; +} + +static inline bool32 +btree_hdr_is_first_in_level(const btree_hdr *hdr) +{ + return hdr->flags & BTREE_HDR_FIRST_IN_LEVEL; +} + +static inline bool32 +btree_hdr_is_last_in_level(const btree_hdr *hdr) +{ + return hdr->flags & BTREE_HDR_LAST_IN_LEVEL; +} + +static inline uint64 +btree_hdr_next_addr(const btree_config *cfg, + const btree_hdr *hdr, + uint64 addr, + page_type type) +{ + if (type == PAGE_TYPE_MEMTABLE) { + return hdr->next_addr; + } + platform_assert(type == PAGE_TYPE_BRANCH); + + if (btree_hdr_is_last_in_level(hdr)) { + return 0; + } + if (!btree_page_is_last_in_extent(cfg, addr)) { + return addr + btree_page_size(cfg); + } + debug_assert(hdr->next_addr != 0); + return hdr->next_addr; +} + +static inline uint64 +btree_hdr_prev_addr(const btree_config *cfg, + const btree_hdr *hdr, + uint64 addr, + page_type type) +{ + if (type == PAGE_TYPE_MEMTABLE) { + return hdr->prev_addr; + } + platform_assert(type == PAGE_TYPE_BRANCH); + + if (btree_hdr_is_first_in_level(hdr)) { 
+ return 0; + } + if (!btree_page_is_first_in_extent(cfg, addr)) { + return addr - btree_page_size(cfg); + } + debug_assert(hdr->prev_addr != 0); + return hdr->prev_addr + btree_extent_size(cfg) - btree_page_size(cfg); +} + +static inline uint64 +btree_hdr_next_extent_addr(const btree_config *cfg, + const btree_hdr *hdr, + uint64 addr, + page_type type) +{ + (void)cfg; + (void)addr; + + if (type != PAGE_TYPE_BRANCH || btree_hdr_is_last_in_level(hdr) + || hdr->next_addr == 0) + { + return 0; + } + return hdr->next_addr; +} + +static inline uint64 +btree_hdr_prev_extent_addr(const btree_config *cfg, + const btree_hdr *hdr, + uint64 addr, + page_type type) +{ + (void)cfg; + (void)addr; + + if (type != PAGE_TYPE_BRANCH || btree_hdr_is_first_in_level(hdr) + || hdr->prev_addr == 0) + { + return 0; + } + return hdr->prev_addr; +} + static inline void btree_init_hdr(const btree_config *cfg, btree_hdr *hdr) { diff --git a/src/core.c b/src/core.c index f663a9f9..f1cf0ab1 100644 --- a/src/core.c +++ b/src/core.c @@ -390,9 +390,8 @@ core_memtable_iterator_init(core_handle *spl, start_key_comparison, start_key, FALSE, - FALSE, 0, - 1); + 0); } static void @@ -879,7 +878,6 @@ core_start_btree_iterator_init_async( key max_key, comparison start_key_comparison, key start_key, - bool32 do_prefetch, bool32 copy_nodes, uint32 prefetch_lookahead) { @@ -895,7 +893,6 @@ core_start_btree_iterator_init_async( max_key, start_key_comparison, start_key, - do_prefetch, copy_nodes, 0, prefetch_lookahead, @@ -1211,10 +1208,8 @@ core_range_iterator_init(core_handle *spl, btree_iterator *btree_itor = &range_itor->btree_itor[branch_no]; uint64 branch_addr = range_itor->branch[branch_no].addr; page_type page_type = range_itor->branch[branch_no].type; - bool32 do_prefetch = FALSE; - uint32 prefetch_lookahead = 1; + uint32 prefetch_lookahead = 0; if (range_itor->compacted[branch_no] && num_tuples > CORE_PREFETCH_MIN) { - do_prefetch = TRUE; prefetch_lookahead = deep_lookahead; } rc = 
core_start_btree_iterator_init_async( @@ -1229,7 +1224,6 @@ core_range_iterator_init(core_handle *spl, key_buffer_key(&range_itor->local_max_key), start_key_comparison, start_key, - do_prefetch, branch_no == 0 ? first_memtable_copy_nodes : FALSE, prefetch_lookahead); started_inits++; diff --git a/src/prefetch.h b/src/prefetch.h index 3706c621..329aa9dd 100644 --- a/src/prefetch.h +++ b/src/prefetch.h @@ -12,9 +12,7 @@ #include "platform_assert.h" /* - * Minimum deep-prefetch depth for one eligible stream. Keeping at least this - * many extents in flight is what makes deep prefetch worthwhile compared to the - * legacy single-extent-ahead path. + * Minimum deep-prefetch depth for one eligible stream. */ #define PREFETCH_MIN_EXTENT_LOOKAHEAD (2) diff --git a/src/trunk.c b/src/trunk.c index c1b1d2e7..133139a5 100644 --- a/src/trunk.c +++ b/src/trunk.c @@ -2473,7 +2473,6 @@ trunk_branch_merger_build_merge_itor(trunk_branch_merger *merger, merger->max_key, greater_than_or_equal, merger->min_key, - TRUE, FALSE, merger->height, lookahead); diff --git a/tests/functional/btree_test.c b/tests/functional/btree_test.c index c465f150..2511e615 100644 --- a/tests/functional/btree_test.c +++ b/tests/functional/btree_test.c @@ -35,7 +35,7 @@ typedef struct btree_scan_perf_options { bool32 random_bounds; bool32 memtable_no_copy_nodes; bool32 memtable_copy_nodes; - uint32 prefetch_lookahead; // 0=no prefetch, 1=legacy single-extent, >=2 deep + uint32 prefetch_lookahead; // 0=off, 1=next extent, >=2 deep } btree_scan_perf_options; static const char * @@ -373,9 +373,6 @@ test_btree_scan_once(cache *cc, uint64 *tuples_scanned, uint64 *logical_bytes_scanned) { - // prefetch_lookahead 0 => no prefetch; 1 => legacy single-extent; >=2 => - // deep - bool32 do_prefetch = (prefetch_lookahead >= 1); btree_iterator itor; timestamp start_time = platform_get_timestamp(); platform_status rc = btree_iterator_init(cc, @@ -389,7 +386,6 @@ test_btree_scan_once(cache *cc, max_key, 
greater_than_or_equal, min_key, - do_prefetch, copy_nodes, 0, prefetch_lookahead); @@ -690,9 +686,8 @@ test_btree_scan_perf(cache *cc, greater_than_or_equal, NEGATIVE_INFINITY_KEY, FALSE, - FALSE, 0, - 1); + 0); if (!SUCCESS(rc)) { goto out; } @@ -1088,9 +1083,8 @@ test_btree_basic(cache *cc, greater_than_or_equal, NEGATIVE_INFINITY_KEY, FALSE, - FALSE, 0, - 1); + 0); platform_assert_status_ok(rc); platform_default_log("btree iterator init time %luns\n", platform_timestamp_elapsed(start_time)); @@ -1274,9 +1268,8 @@ test_btree_create_packed_trees(cache *cc, greater_than_or_equal, NEGATIVE_INFINITY_KEY, FALSE, - FALSE, 0, - 1); + 0); platform_assert_status_ok(rc); btree_pack_req req; @@ -1332,7 +1325,6 @@ test_count_tuples_in_range(cache *cc, high_key, greater_than_or_equal, low_key, - TRUE, FALSE, 0, 1); @@ -1432,7 +1424,6 @@ test_btree_print_all_keys(cache *cc, high_key, greater_than_or_equal, low_key, - TRUE, FALSE, 0, 1); @@ -1512,7 +1503,6 @@ test_btree_merge_basic(cache *cc, hi, greater_than_or_equal, lo, - TRUE, FALSE, 0, 1); @@ -1737,7 +1727,6 @@ test_btree_rough_iterator(cache *cc, greater_than_or_equal, NEGATIVE_INFINITY_KEY, TRUE, - TRUE, 1, 1); platform_assert_status_ok(rc); @@ -1903,7 +1892,6 @@ test_btree_merge_perf(cache *cc, max_key, greater_than_or_equal, min_key, - TRUE, FALSE, 0, 1); @@ -2113,7 +2101,7 @@ usage(const char *argv0) platform_error_log("\t--memtable-scan-mode choose which memtable " "iterator mode(s) to benchmark (default both)\n"); platform_error_log("\t--prefetch-lookahead extents to prefetch ahead " - "(0=off, 1=legacy single-extent, >=2 deep; default 0)\n"); + "(0=off, 1=next extent, >=2 deep; default 0)\n"); config_usage(); } diff --git a/tests/unit/btree_stress_test.c b/tests/unit/btree_stress_test.c index 0ff2224f..96dcfbf5 100644 --- a/tests/unit/btree_stress_test.c +++ b/tests/unit/btree_stress_test.c @@ -708,9 +708,8 @@ iterator_tests(cache *cc, greater_than_or_equal, start_key, FALSE, - FALSE, 0, - 1); + 0); 
ASSERT_TRUE(SUCCESS(rc)); iterator *iter = (iterator *)&dbiter; @@ -762,9 +761,8 @@ iterator_seek_tests(cache *cc, greater_than_or_equal, start_key, FALSE, - FALSE, 0, - 1); + 0); ASSERT_TRUE(SUCCESS(rc)); iterator *iter = (iterator *)&dbiter; @@ -811,9 +809,8 @@ pack_tests(cache *cc, greater_than_or_equal, NEGATIVE_INFINITY_KEY, FALSE, - FALSE, 0, - 1); + 0); ASSERT_TRUE(SUCCESS(rc)); rc = STATUS_TEST_FAILED; diff --git a/tests/unit/btree_test.c b/tests/unit/btree_test.c index 064932d1..e811728c 100644 --- a/tests/unit/btree_test.c +++ b/tests/unit/btree_test.c @@ -188,10 +188,9 @@ leaf_hdr_tests(btree_config *cfg, btree_scratch *scratch, platform_heap_id hid) * about this number. If you change the size of a btree leaf header * or the size of a btree leafy entry, then this number will need * to be changed, and that's fine. - * (Reduced from 208 to 207 when btree_hdr gained the 8-byte - * meta_page_addr field used by the iterator prefetch cursor.) + * (The header currently has room for 208 such entries.) 
*/ - int nkvs = 207; + int nkvs = 208; btree_init_hdr(cfg, hdr); From 8b328054239c8ea884109ef4c85ff8f278198c1c Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Fri, 26 Jun 2026 12:56:23 -0700 Subject: [PATCH 09/15] formatting Signed-off-by: Rob Johnson --- src/btree.c | 59 ++++++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/src/btree.c b/src/btree.c index 01f48c33..d2f5b8e3 100644 --- a/src/btree.c +++ b/src/btree.c @@ -2668,8 +2668,8 @@ btree_iterator_curr_live_prev_addr(btree_iterator *itor) btree_node live_curr; live_curr.addr = itor->curr.addr; btree_node_get(itor->cc, itor->cfg, &live_curr, itor->page_type); - uint64 prev_addr = - btree_hdr_prev_addr(itor->cfg, live_curr.hdr, live_curr.addr, itor->page_type); + uint64 prev_addr = btree_hdr_prev_addr( + itor->cfg, live_curr.hdr, live_curr.addr, itor->page_type); btree_node_unget(itor->cc, itor->cfg, &live_curr); return prev_addr; } @@ -2694,10 +2694,8 @@ btree_iterator_end_key_beyond_curr(btree_iterator *itor) } if (num_entries == 0 || itor->height > btree_height(itor->curr.hdr)) { return num_entries == 0 - && btree_hdr_next_addr(itor->cfg, - itor->curr.hdr, - itor->curr.addr, - itor->page_type) + && btree_hdr_next_addr( + itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type) != 0; } @@ -3045,16 +3043,16 @@ btree_prefetch_cursor_deinit(btree_iterator *itor) * On a direction change, restarts the cursor in the new direction (resetting * the ramp). Otherwise pumps the non-blocking prime while PRIMING. Deep * prefetch refills after an extent-boundary crossing; header-link prefetch runs - * after a crossing or direction restart, when the adjacent extent link is useful - * and not just a duplicate prefetch for the current extent. + * after a crossing or direction restart, when the adjacent extent link is + * useful and not just a duplicate prefetch for the current extent. 
*/ static void btree_iterator_prefetch_on_advance(btree_iterator *itor, uint64 last_addr, bool32 going_forward) { - cache *cc = itor->cc; - btree_prefetch_cursor *pf = &itor->prefetch; + cache *cc = itor->cc; + btree_prefetch_cursor *pf = &itor->prefetch; bool32 restarted = FALSE; // Direction change: restart cursor in the new direction, resetting ramp. @@ -3069,7 +3067,8 @@ btree_iterator_prefetch_on_advance(btree_iterator *itor, if (pf->state == BTREE_PREFETCH_PRIMING) { positioned_now = btree_prefetch_cursor_pump(itor); } - bool32 crossed_extent = !btree_addrs_share_extent(cc, last_addr, itor->curr.addr); + bool32 crossed_extent = + !btree_addrs_share_extent(cc, last_addr, itor->curr.addr); if (pf->state == BTREE_PREFETCH_ACTIVE) { if (crossed_extent && !positioned_now && !restarted) { btree_prefetch_cursor_on_boundary(itor); @@ -3086,7 +3085,8 @@ btree_iterator_prefetch_on_advance(btree_iterator *itor, if (extent_addr != 0 && (!going_forward - || !btree_addrs_share_extent(cc, itor->curr.addr, itor->end_addr))) + || !btree_addrs_share_extent( + cc, itor->curr.addr, itor->end_addr))) { cache_prefetch(cc, extent_addr, itor->page_type); } @@ -3103,8 +3103,8 @@ static void btree_iterator_next_leaf(btree_iterator *itor) { uint64 last_addr = itor->curr.addr; - uint64 next_addr = - btree_hdr_next_addr(itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type); + uint64 next_addr = btree_hdr_next_addr( + itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type); btree_iterator_release_curr(itor); btree_iterator_get_curr_addr(itor, next_addr); itor->idx = 0; @@ -3172,12 +3172,12 @@ btree_iterator_prev_leaf(btree_iterator *itor) * old curr node and the new one. In this case, we can just walk * forward until we find the leaf whose successor is our old leaf. 
*/ - while (btree_hdr_next_addr( - cfg, itor->curr.hdr, itor->curr.addr, itor->page_type) - != last_addr) + while ( + btree_hdr_next_addr(cfg, itor->curr.hdr, itor->curr.addr, itor->page_type) + != last_addr) { - uint64 next_addr = - btree_hdr_next_addr(cfg, itor->curr.hdr, itor->curr.addr, itor->page_type); + uint64 next_addr = btree_hdr_next_addr( + cfg, itor->curr.hdr, itor->curr.addr, itor->page_type); btree_iterator_release_curr(itor); btree_iterator_get_curr_addr(itor, next_addr); } @@ -3193,7 +3193,8 @@ btree_iterator_prev_leaf(btree_iterator *itor) itor->curr_min_idx = find_key_in_node( itor, itor->curr.hdr, itor->min_key, itor->min_key_comparison, NULL); } - if (btree_hdr_prev_addr(cfg, itor->curr.hdr, itor->curr.addr, itor->page_type) + if (btree_hdr_prev_addr( + cfg, itor->curr.hdr, itor->curr.addr, itor->page_type) == 0 && itor->curr_min_idx == -1) { @@ -3868,9 +3869,7 @@ btree_iterator_deinit(btree_iterator *itor) // Branch nodes use the header union for meta_page_addr. static inline void -btree_pack_node_init_hdr(const btree_config *cfg, - btree_hdr *hdr, - uint8 height) +btree_pack_node_init_hdr(const btree_config *cfg, btree_hdr *hdr, uint8 height) { btree_init_hdr(cfg, hdr); hdr->height = height; @@ -3920,9 +3919,7 @@ btree_pack_create_next_node(btree_pack_req *req, uint64 height, key pivot); * Add the specified node to its parent. Creates a parent if necessary. 
*/ static inline void -btree_pack_link_node(btree_pack_req *req, - uint64 height, - uint64 offset) +btree_pack_link_node(btree_pack_req *req, uint64 height, uint64 offset) { btree_node *edge = &req->edge[height][offset]; btree_pivot_stats *edge_stats = &req->edge_stats[height][offset]; @@ -3997,8 +3994,8 @@ btree_pack_create_next_node(btree_pack_req *req, uint64 height, key pivot) } if (0 < req->num_edges[height]) { - btree_node *old_node = btree_pack_get_current_node(req, height); - new_node.hdr->prev_addr = old_node->hdr->prev_addr; + btree_node *old_node = btree_pack_get_current_node(req, height); + new_node.hdr->prev_addr = old_node->hdr->prev_addr; if (!btree_addrs_share_extent(req->cc, old_node->addr, new_node.addr)) { debug_assert(btree_page_is_last_in_extent(req->cfg, old_node->addr)); debug_assert(btree_page_is_first_in_extent(req->cfg, new_node.addr)); @@ -4382,7 +4379,8 @@ btree_print_index_node(platform_log_handle *log_handle, btree_hdr_next_addr(cfg, hdr, addr, type)); if (type == PAGE_TYPE_BRANCH) { platform_log(log_handle, "** flags: %u \n", hdr->flags); - platform_log(log_handle, "** meta_page_addr: %lu \n", hdr->meta_page_addr); + platform_log( + log_handle, "** meta_page_addr: %lu \n", hdr->meta_page_addr); platform_log(log_handle, "** prev_extent_addr: %lu \n", btree_hdr_prev_extent_addr(cfg, hdr, addr, type)); @@ -4443,7 +4441,8 @@ btree_print_leaf_node(platform_log_handle *log_handle, btree_hdr_next_addr(cfg, hdr, addr, type)); if (type == PAGE_TYPE_BRANCH) { platform_log(log_handle, "** flags: %u \n", hdr->flags); - platform_log(log_handle, "** meta_page_addr: %lu \n", hdr->meta_page_addr); + platform_log( + log_handle, "** meta_page_addr: %lu \n", hdr->meta_page_addr); platform_log(log_handle, "** prev_extent_addr: %lu \n", btree_hdr_prev_extent_addr(cfg, hdr, addr, type)); From 75312335bc42a372ccf1547a0a7fcc7947d5e9bb Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Fri, 26 Jun 2026 13:22:21 -0700 Subject: [PATCH 10/15] eliminate num_tuples 
from core_range_iterator_init Signed-off-by: Rob Johnson --- src/core.c | 42 +++++++-------------------- src/core.h | 4 +-- src/splinterdb.c | 3 +- tests/functional/test_functionality.c | 6 ++-- 4 files changed, 14 insertions(+), 41 deletions(-) diff --git a/src/core.c b/src/core.c index f1cf0ab1..b30c01cb 100644 --- a/src/core.c +++ b/src/core.c @@ -46,14 +46,6 @@ static const int64 latency_histo_buckets[LATENCYHISTO_SIZE] = { _Static_assert(CORE_NUM_MEMTABLES <= MAX_MEMTABLES, "CORE_NUM_MEMTABLES <= MAX_MEMTABLES"); -/* - * For a "small" range query, you don't want to prefetch pages. - * This is the minimal # of items requested before we turn ON prefetching. - * (Empirically established through past experiments, for small key-value - * pairs. So, _may_ be less efficient in general cases. Needs a revisit.) - */ -#define CORE_PREFETCH_MIN (16384) - /* Some randomly chosen Splinter super-block checksum seed. */ #define CORE_SUPER_CSUM_SEED (42) @@ -1005,8 +997,7 @@ core_range_iterator_init(core_handle *spl, comparison max_key_comparison, key max_key, comparison start_key_comparison, - key start_key, - uint64 num_tuples) + key start_key) { platform_status rc; @@ -1021,7 +1012,6 @@ core_range_iterator_init(core_handle *spl, range_itor->spl = spl; range_itor->super.ops = &core_range_iterator_ops; range_itor->num_branches = 0; - range_itor->num_tuples = num_tuples; range_itor->merge_itor = NULL; range_itor->can_prev = TRUE; range_itor->can_next = TRUE; @@ -1187,13 +1177,12 @@ core_range_iterator_init(core_handle *spl, return STATUS_NO_MEMORY; } - // Deep extent-prefetch for the scan: count the branches eligible to prefetch - // (compacted, and only when the scan is large enough to be worth it), then - // give each a soft share of the prefetch budget. + // Deep extent-prefetch for the scan: count compacted branches and give each + // a soft share of the prefetch budget. 
uint64 n_prefetch_branches = 0; for (uint64 branch_no = 0; branch_no < range_itor->num_branches; branch_no++) { - if (range_itor->compacted[branch_no] && num_tuples > CORE_PREFETCH_MIN) { + if (range_itor->compacted[branch_no]) { n_prefetch_branches++; } } @@ -1209,7 +1198,7 @@ core_range_iterator_init(core_handle *spl, uint64 branch_addr = range_itor->branch[branch_no].addr; page_type page_type = range_itor->branch[branch_no].type; uint32 prefetch_lookahead = 0; - if (range_itor->compacted[branch_no] && num_tuples > CORE_PREFETCH_MIN) { + if (range_itor->compacted[branch_no]) { prefetch_lookahead = deep_lookahead; } rc = core_start_btree_iterator_init_async( @@ -1276,7 +1265,6 @@ core_range_iterator_init(core_handle *spl, key_buffer local_max_buffer; rc = key_buffer_init_from_key( &local_max_buffer, PROCESS_PRIVATE_HEAP_ID, local_max); - uint64 num_tuples = range_itor->num_tuples; core_range_iterator_deinit(range_itor); if (!SUCCESS(rc)) { return rc; @@ -1289,8 +1277,7 @@ core_range_iterator_init(core_handle *spl, max_key_comparison, max_key, greater_than_or_equal, - local_max, - num_tuples); + local_max); key_buffer_deinit(&local_max_buffer); if (!SUCCESS(rc)) { return rc; @@ -1308,7 +1295,6 @@ core_range_iterator_init(core_handle *spl, key_buffer local_min_buffer; rc = key_buffer_init_from_key( &local_min_buffer, PROCESS_PRIVATE_HEAP_ID, local_min); - uint64 num_tuples = range_itor->num_tuples; core_range_iterator_deinit(range_itor); if (!SUCCESS(rc)) { return rc; @@ -1321,8 +1307,7 @@ core_range_iterator_init(core_handle *spl, max_key_comparison, max_key, less_than, - local_min, - num_tuples); + local_min); key_buffer_deinit(&local_min_buffer); if (!SUCCESS(rc)) { return rc; @@ -1356,7 +1341,6 @@ core_range_iterator_next(iterator *itor) if (!SUCCESS(rc)) { return rc; } - range_itor->num_tuples++; range_itor->can_prev = TRUE; range_itor->can_next = iterator_can_next(&range_itor->merge_itor->super); if (!range_itor->can_next) { @@ -1385,7 +1369,6 @@ 
core_range_iterator_next(iterator *itor) // if there is more data to get, rebuild the iterator for next leaf if (core_range_iterator_has_next_leaf(range_itor)) { core_handle *spl = range_itor->spl; - uint64 temp_tuples = range_itor->num_tuples; comparison min_key_comparison = range_itor->min_key_comparison; comparison max_key_comparison = range_itor->max_key_comparison; core_range_iterator_deinit(range_itor); @@ -1396,8 +1379,7 @@ core_range_iterator_next(iterator *itor) max_key_comparison, max_key, greater_than_or_equal, - local_max_key, - temp_tuples); + local_max_key); if (!SUCCESS(rc)) { return rc; } @@ -1420,7 +1402,6 @@ core_range_iterator_prev(iterator *itor) if (!SUCCESS(rc)) { return rc; } - range_itor->num_tuples++; range_itor->can_next = TRUE; range_itor->can_prev = iterator_can_prev(&range_itor->merge_itor->super); if (!range_itor->can_prev) { @@ -1449,7 +1430,6 @@ core_range_iterator_prev(iterator *itor) // if there is more data to get, rebuild the iterator for prev leaf if (core_key_compare(range_itor->spl, local_min_key, min_key) > 0) { core_handle *spl = range_itor->spl; - uint64 temp_tuples = range_itor->num_tuples; comparison min_key_comparison = range_itor->min_key_comparison; comparison max_key_comparison = range_itor->max_key_comparison; core_range_iterator_deinit(range_itor); @@ -1460,8 +1440,7 @@ core_range_iterator_prev(iterator *itor) max_key_comparison, max_key, less_than, - local_min_key, - temp_tuples); + local_min_key); if (!SUCCESS(rc)) { return rc; } @@ -1766,8 +1745,7 @@ core_apply_to_range(core_handle *spl, less_than, POSITIVE_INFINITY_KEY, greater_than_or_equal, - start_key, - num_tuples); + start_key); if (!SUCCESS(rc)) { platform_error_log("core_apply_to_range: range iterator init failed: " "%s\n", diff --git a/src/core.h b/src/core.h index 440a4829..94c10f97 100644 --- a/src/core.h +++ b/src/core.h @@ -127,7 +127,6 @@ struct core_handle { typedef struct core_range_iterator { iterator super; core_handle *spl; - uint64 num_tuples; 
uint64 num_branches; uint64 num_memtable_branches; uint64 memtable_start_gen; @@ -206,8 +205,7 @@ core_range_iterator_init(core_handle *spl, comparison max_key_comparison, key max_key, comparison start_key_comparison, - key start_key, - uint64 num_tuples); + key start_key); void core_range_iterator_deinit(core_range_iterator *range_itor); diff --git a/src/splinterdb.c b/src/splinterdb.c index 86868aa7..b8614668 100644 --- a/src/splinterdb.c +++ b/src/splinterdb.c @@ -881,8 +881,7 @@ splinterdb_iterator_init_with_bounds(splinterdb *kvs, // IN max_key_comparison, max_key, start_key_comparison, - start_key, - UINT64_MAX); + start_key); if (!SUCCESS(rc)) { merge_accumulator_deinit(&it->materialized_message); platform_free(kvs->spl.heap_id, it); diff --git a/tests/functional/test_functionality.c b/tests/functional/test_functionality.c index ab32d4b7..7cac864b 100644 --- a/tests/functional/test_functionality.c +++ b/tests/functional/test_functionality.c @@ -40,8 +40,7 @@ search_for_key_via_iterator(core_handle *spl, key target) less_than, POSITIVE_INFINITY_KEY, greater_than_or_equal, - NEGATIVE_INFINITY_KEY, - UINT64_MAX); + NEGATIVE_INFINITY_KEY); uint64 count = 0; while (iterator_can_curr((iterator *)&iter)) { key curr_key; @@ -248,8 +247,7 @@ verify_range_against_shadow(core_handle *spl, less_than, end_key, greater_than_or_equal, - start_key, - end_index - start_index); + start_key); if (!SUCCESS(status)) { platform_error_log("failed to create range itor: %s\n", platform_status_to_string(status)); From 53669a29aa5b1a4ef65cdbd78dc6681e31073106 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Fri, 26 Jun 2026 17:18:16 -0700 Subject: [PATCH 11/15] mini_meta_cursor cleanup Signed-off-by: Rob Johnson --- src/btree.c | 30 ++--- src/clockcache.c | 269 ++++++++++++++++--------------------------- src/mini_allocator.c | 187 ++++++++++++++++-------------- src/mini_allocator.h | 51 ++++---- 4 files changed, 231 insertions(+), 306 deletions(-) diff --git a/src/btree.c 
b/src/btree.c index d2f5b8e3..6e36a3c5 100644 --- a/src/btree.c +++ b/src/btree.c @@ -2876,10 +2876,13 @@ btree_prefetch_cursor_step(btree_prefetch_cursor *pf, uint64 *extent_addr, uint64 *batch) { - if (pf->going_forward) { - return mini_meta_cursor_next(&pf->meta_cursor, extent_addr, batch); + mini_meta_cursor_status status = + pf->going_forward ? mini_meta_cursor_next(&pf->meta_cursor) + : mini_meta_cursor_prev(&pf->meta_cursor); + if (status == MINI_META_CURSOR_ENTRY) { + mini_meta_cursor_curr(&pf->meta_cursor, extent_addr, batch); } - return mini_meta_cursor_prev(&pf->meta_cursor, extent_addr, batch); + return status; } /* @@ -2922,12 +2925,12 @@ btree_prefetch_cursor_fill(btree_iterator *itor) } /* - * Try to position the (PRIMING) cursor at the iterator's current leaf extent. + * Try to position the (PRIMING) cursor on the iterator's current leaf extent. * Non-blocking: kicks off meta-page IO and leaves the cursor PRIMING when the * page is not resident yet. Reads the current leaf's meta_page_addr every call, - * so it positions correctly even if the iterator advanced while priming. For - * backward scans, consumes the current extent after seeking so fill starts with - * the previous extent. Returns TRUE iff the cursor just became ACTIVE. + * so it positions correctly even if the iterator advanced while priming. Fill + * starts by moving in the scan direction, so the current extent is not + * prefetched. Returns TRUE iff the cursor just became ACTIVE. 
*/ static bool32 btree_prefetch_cursor_pump(btree_iterator *itor) @@ -2942,11 +2945,9 @@ btree_prefetch_cursor_pump(btree_iterator *itor) } mini_meta_cursor_deinit(&pf->meta_cursor); - mini_meta_cursor_init( - &pf->meta_cursor, itor->cc, itor->page_type, meta_page_addr); uint64 cur_extent = btree_extent_base_addr(itor->cc, itor->curr.addr); - mini_meta_cursor_status status = - mini_meta_cursor_seek_extent(&pf->meta_cursor, cur_extent); + mini_meta_cursor_status status = mini_meta_cursor_init( + &pf->meta_cursor, itor->cc, itor->page_type, meta_page_addr, cur_extent); if (status == MINI_META_CURSOR_WOULD_BLOCK) { return FALSE; } @@ -2957,13 +2958,6 @@ btree_prefetch_cursor_pump(btree_iterator *itor) return FALSE; } - if (!pf->going_forward) { - uint64 ignored_addr, ignored_batch; - status = - mini_meta_cursor_prev(&pf->meta_cursor, &ignored_addr, &ignored_batch); - platform_assert(status == MINI_META_CURSOR_ENTRY); - } - pf->state = BTREE_PREFETCH_ACTIVE; pf->at_end = FALSE; pf->prefetched_ahead = 0; diff --git a/src/clockcache.c b/src/clockcache.c index 4473c781..2fe92b3e 100644 --- a/src/clockcache.c +++ b/src/clockcache.c @@ -1804,10 +1804,7 @@ clockcache_get_internal(clockcache *cc, // IN page_handle * clockcache_get(clockcache *cc, uint64 addr, bool32 blocking, page_type type) { - bool32 retry; - // Initialize to NULL so a non-blocking get of a page that is not in cache - // (clockcache_get_internal returns without setting handle) honors the - // documented contract of returning NULL rather than an uninitialized value. 
+ bool32 retry; page_handle *handle = NULL; debug_assert(cc->per_thread[platform_get_tid()].enable_sync_get @@ -2520,25 +2517,59 @@ clockcache_prefetch_callback(void *pfs) platform_free(PROCESS_PRIVATE_HEAP_ID, state); } -/* - *----------------------------------------------------------------------------- - * clockcache_prefetch -- - * - * prefetch asynchronously loads the extent with given base address - *----------------------------------------------------------------------------- - */ -void -clockcache_prefetch(clockcache *cc, uint64 base_addr, page_type type) +static void +clockcache_prefetch_issue(clockcache *cc, + async_io_state **state, + uint64 *state_num_pages, + page_type type) +{ + platform_assert(cc != NULL); + platform_assert(state != NULL); + platform_assert(*state != NULL); + platform_assert(state_num_pages != NULL); + platform_assert(*state_num_pages > 0); + platform_assert(PAGE_TYPE_FIRST <= type && type < NUM_PAGE_TYPES); + + if (cc->cfg->use_stats) { + threadid tid = platform_get_tid(); + cc->stats[tid].page_reads[type] += *state_num_pages; + cc->stats[tid].prefetches_issued[type]++; + } + + io_async_run((*state)->iostate); + *state = NULL; + *state_num_pages = 0; +} + +static void +clockcache_prefetch_discard_empty(async_io_state **state) { - async_io_state *state = NULL; - uint64 state_num_pages = 0; - uint64 pages_per_extent = cc->cfg->pages_per_extent; - threadid tid = platform_get_tid(); + platform_assert(state != NULL); + platform_assert(*state != NULL); - debug_assert(base_addr % clockcache_extent_size(cc) == 0); + io_async_state_deinit((*state)->iostate); + platform_free(PROCESS_PRIVATE_HEAP_ID, *state); + *state = NULL; +} + +static void +clockcache_prefetch_pages(clockcache *cc, + uint64 first_addr, + uint64 num_pages, + page_type type) +{ + async_io_state *state = NULL; + uint64 state_num_pages = 0; + threadid tid = platform_get_tid(); - for (uint64 page_off = 0; page_off < pages_per_extent; page_off++) { - uint64 addr = base_addr + 
clockcache_multiply_by_page_size(cc, page_off); + platform_assert(cc != NULL); + platform_assert(PAGE_TYPE_FIRST <= type && type < NUM_PAGE_TYPES); + platform_assert(num_pages > 0); + platform_assert(num_pages <= cc->cfg->pages_per_extent); + platform_assert(first_addr % clockcache_page_size(cc) == 0); + + for (uint64 page_off = 0; page_off < num_pages; page_off++) { + uint64 addr = first_addr + clockcache_multiply_by_page_size(cc, page_off); uint32 entry_no = clockcache_lookup(cc, addr); get_rc get_read_rc; if (entry_no != CC_UNMAPPED_ENTRY) { @@ -2552,42 +2583,34 @@ clockcache_prefetch(clockcache *cc, uint64 base_addr, page_type type) clockcache_dec_ref(cc, entry_no, tid); // fallthrough case GET_RC_CONFLICT: - // in cache, issue IO req if pages have been queued if (state != NULL) { - platform_assert(state_num_pages > 0); - if (cc->cfg->use_stats) { - threadid tid = platform_get_tid(); - cc->stats[tid].page_reads[type] += state_num_pages; - cc->stats[tid].prefetches_issued[type]++; - } - io_async_run(state->iostate); - state = NULL; - state_num_pages = 0; + clockcache_prefetch_issue(cc, &state, &state_num_pages, type); } clockcache_log(addr, entry_no, - "prefetch (cached): entry %u addr %lu\n", + "prefetch_pages (cached): entry %u addr %lu\n", entry_no, addr); break; case GET_RC_EVICTED: { - // need to prefetch uint32 free_entry_no = clockcache_get_free_page( cc, CC_READ_LOADING_STATUS, type, FALSE, TRUE); clockcache_entry *entry = &cc->entry[free_entry_no]; entry->page.disk_addr = addr; entry->type = type; uint64 lookup_no = clockcache_divide_by_page_size(cc, addr); + if (state == NULL) { - // start a new IO req before publishing the loading entry state = TYPED_MALLOC(PROCESS_PRIVATE_HEAP_ID, state); if (state == NULL) { - platform_error_log("clockcache_prefetch: async_io_state " - "allocation failed for base addr %lu, " - "page addr %lu, type %u\n", - base_addr, + platform_error_log("clockcache_prefetch_pages: " + "async_io_state allocation failed for " + 
"first addr %lu, page addr %lu, " + "num pages %lu, type %u\n", + first_addr, addr, + num_pages, type); clockcache_release_unpublished_entry(entry); return; @@ -2601,11 +2624,13 @@ clockcache_prefetch(clockcache *cc, uint64 base_addr, page_type type) clockcache_prefetch_callback, state); if (!SUCCESS(rc)) { - platform_error_log("clockcache_prefetch: " - "io_async_state_init failed for base addr " - "%lu, page addr %lu, type %u: %s\n", - base_addr, + platform_error_log("clockcache_prefetch_pages: " + "io_async_state_init failed for first " + "addr %lu, page addr %lu, num pages %lu, " + "type %u: %s\n", + first_addr, addr, + num_pages, type, platform_status_to_string(rc)); clockcache_release_unpublished_entry(entry); @@ -2614,41 +2639,38 @@ clockcache_prefetch(clockcache *cc, uint64 base_addr, page_type type) return; } } + if (__sync_bool_compare_and_swap( &cc->lookup[lookup_no], CC_UNMAPPED_ENTRY, free_entry_no)) { platform_status rc = io_async_state_append_page(state->iostate, entry->page.data); if (!SUCCESS(rc)) { - platform_error_log("clockcache_prefetch: " + platform_error_log("clockcache_prefetch_pages: " "io_async_state_append_page failed for " - "base addr %lu, page addr %lu, entry %u, " - "type %u: %s\n", - base_addr, + "first addr %lu, page addr %lu, " + "entry %u, num pages %lu, type %u: %s\n", + first_addr, addr, free_entry_no, + num_pages, type, platform_status_to_string(rc)); } platform_assert_status_ok(rc); state_num_pages++; clockcache_log(addr, - entry_no, - "prefetch (load): entry %u addr %lu\n", - entry_no, + free_entry_no, + "prefetch_pages (load): entry %u addr %lu\n", + free_entry_no, addr); } else { - /* - * someone else is already loading this page, release the free - * entry and retry - */ clockcache_release_unpublished_entry(entry); - if (state_num_pages == 0) { - io_async_state_deinit(state->iostate); - platform_free(PROCESS_PRIVATE_HEAP_ID, state); - state = NULL; + if (state_num_pages > 0) { + clockcache_prefetch_issue(cc, &state, 
&state_num_pages, type); + } else { + clockcache_prefetch_discard_empty(&state); } - page_off--; } break; } @@ -2656,134 +2678,39 @@ clockcache_prefetch(clockcache *cc, uint64 base_addr, page_type type) platform_assert(0); } } - // issue IO req if pages have been queued + if (state != NULL) { - platform_assert(state_num_pages > 0); - if (cc->cfg->use_stats) { - threadid tid = platform_get_tid(); - cc->stats[tid].page_reads[type] += state_num_pages; - cc->stats[tid].prefetches_issued[type]++; - } - io_async_run(state->iostate); - state = NULL; - state_num_pages = 0; + clockcache_prefetch_issue(cc, &state, &state_num_pages, type); } } /* *----------------------------------------------------------------------------- - * clockcache_prefetch_page -- + * clockcache_prefetch -- * - * Like clockcache_prefetch, but loads only the single page at addr instead - * of its whole extent. Used for sparse reads (e.g. a mini_allocator meta - * page) where dragging in the surrounding extent would waste bandwidth. + * Prefetch asynchronously loads the extent with given base address. *----------------------------------------------------------------------------- */ void -clockcache_prefetch_page(clockcache *cc, uint64 addr, page_type type) +clockcache_prefetch(clockcache *cc, uint64 base_addr, page_type type) { - threadid tid = platform_get_tid(); - platform_assert(cc != NULL); - platform_assert(PAGE_TYPE_FIRST <= type && type < NUM_PAGE_TYPES); - platform_assert(addr % clockcache_page_size(cc) == 0); - - while (TRUE) { - uint32 entry_no = clockcache_lookup(cc, addr); - get_rc get_read_rc; - if (entry_no != CC_UNMAPPED_ENTRY) { - get_read_rc = clockcache_try_get_read(cc, entry_no, TRUE); - } else { - get_read_rc = GET_RC_EVICTED; - } - - switch (get_read_rc) { - case GET_RC_SUCCESS: - // already resident: drop the ref we just took and we're done. - clockcache_dec_ref(cc, entry_no, tid); - return; - case GET_RC_CONFLICT: - // someone else is loading or has it locked: nothing to do. 
- return; - case GET_RC_EVICTED: - { - // not in cache: load just this page. - uint32 free_entry_no = clockcache_get_free_page( - cc, CC_READ_LOADING_STATUS, type, FALSE, TRUE); - clockcache_entry *entry = &cc->entry[free_entry_no]; - entry->page.disk_addr = addr; - entry->type = type; - uint64 lookup_no = clockcache_divide_by_page_size(cc, addr); + platform_assert(base_addr % clockcache_extent_size(cc) == 0); - async_io_state *state = - TYPED_MALLOC(PROCESS_PRIVATE_HEAP_ID, state); - if (state == NULL) { - platform_error_log("clockcache_prefetch_page: async_io_state " - "allocation failed for page addr %lu, " - "type %u\n", - addr, - type); - clockcache_release_unpublished_entry(entry); - return; - } - state->cc = cc; - platform_status rc = - io_async_state_init(state->iostate, - cc->io, - io_async_preadv, - addr, - clockcache_prefetch_callback, - state); - if (!SUCCESS(rc)) { - platform_error_log("clockcache_prefetch_page: " - "io_async_state_init failed for page addr " - "%lu, type %u: %s\n", - addr, - type, - platform_status_to_string(rc)); - clockcache_release_unpublished_entry(entry); - platform_free(PROCESS_PRIVATE_HEAP_ID, state); - return; - } + clockcache_prefetch_pages(cc, base_addr, cc->cfg->pages_per_extent, type); +} - if (__sync_bool_compare_and_swap( - &cc->lookup[lookup_no], CC_UNMAPPED_ENTRY, free_entry_no)) - { - rc = - io_async_state_append_page(state->iostate, entry->page.data); - if (!SUCCESS(rc)) { - platform_error_log("clockcache_prefetch_page: " - "io_async_state_append_page failed for " - "page addr %lu, entry %u, type %u: %s\n", - addr, - free_entry_no, - type, - platform_status_to_string(rc)); - } - platform_assert_status_ok(rc); - if (cc->cfg->use_stats) { - cc->stats[tid].page_reads[type]++; - cc->stats[tid].prefetches_issued[type]++; - } - clockcache_log(addr, - free_entry_no, - "prefetch_page (load): entry %u addr %lu\n", - free_entry_no, - addr); - io_async_run(state->iostate); - return; - } else { - // someone else started loading 
this page: release and retry. - clockcache_release_unpublished_entry(entry); - io_async_state_deinit(state->iostate); - platform_free(PROCESS_PRIVATE_HEAP_ID, state); - continue; - } - } - default: - platform_assert(0); - } - } +/* + *----------------------------------------------------------------------------- + * clockcache_prefetch_page -- + * + * Prefetch asynchronously loads the single page at addr. + *----------------------------------------------------------------------------- + */ +void +clockcache_prefetch_page(clockcache *cc, uint64 addr, page_type type) +{ + clockcache_prefetch_pages(cc, addr, 1, type); } /* diff --git a/src/mini_allocator.c b/src/mini_allocator.c index 4984365c..75684e0a 100644 --- a/src/mini_allocator.c +++ b/src/mini_allocator.c @@ -922,16 +922,19 @@ mini_prefetch(cache *cc, page_type type, uint64 meta_head) * mini_meta_cursor -- cursor over a mini_allocator's extent entries. *----------------------------------------------------------------------------- */ -void +mini_meta_cursor_status mini_meta_cursor_init(mini_meta_cursor *cursor, cache *cc, page_type meta_type, - uint64 meta_addr) + uint64 meta_addr, + uint64 target_extent_addr) { platform_assert(cursor != NULL); platform_assert(cc != NULL); platform_assert(PAGE_TYPE_FIRST <= meta_type && meta_type < NUM_PAGE_TYPES); platform_assert(meta_addr != 0); + platform_assert(target_extent_addr != 0); + platform_assert(target_extent_addr % cache_extent_size(cc) == 0); cursor->cc = cc; cursor->meta_type = meta_type; @@ -939,6 +942,36 @@ mini_meta_cursor_init(mini_meta_cursor *cursor, cursor->meta_addr = meta_addr; cursor->entry_idx = 0; cursor->num_entries = 0; + + while (TRUE) { + if (cursor->meta_addr == 0) { + return MINI_META_CURSOR_END; + } + + cursor->meta_page = + cache_get(cursor->cc, cursor->meta_addr, FALSE, cursor->meta_type); + if (cursor->meta_page == NULL) { + cache_prefetch_page(cursor->cc, cursor->meta_addr, cursor->meta_type); + return MINI_META_CURSOR_WOULD_BLOCK; + } + + 
cursor->num_entries = mini_num_entries(cursor->meta_page); + cursor->entry_idx = 0; + while (cursor->entry_idx < cursor->num_entries) { + meta_entry *entry = first_entry(cursor->meta_page) + cursor->entry_idx; + if (meta_entry_extent_addr(cursor->cc, entry) == target_extent_addr) { + return MINI_META_CURSOR_ENTRY; + } + cursor->entry_idx++; + } + + uint64 next_meta_addr = mini_get_next_meta_addr(cursor->meta_page); + cache_unget(cursor->cc, cursor->meta_page); + cursor->meta_page = NULL; + cursor->meta_addr = next_meta_addr; + cursor->entry_idx = 0; + cursor->num_entries = 0; + } } void @@ -950,10 +983,13 @@ mini_meta_cursor_deinit(mini_meta_cursor *cursor) cache_unget(cursor->cc, cursor->meta_page); cursor->meta_page = NULL; } + cursor->meta_addr = 0; + cursor->entry_idx = 0; + cursor->num_entries = 0; } -mini_meta_cursor_status -mini_meta_cursor_next(mini_meta_cursor *cursor, +void +mini_meta_cursor_curr(mini_meta_cursor *cursor, uint64 *extent_addr, uint64 *batch) { @@ -963,112 +999,85 @@ mini_meta_cursor_next(mini_meta_cursor *cursor, platform_assert(batch != NULL); platform_assert(PAGE_TYPE_FIRST <= cursor->meta_type && cursor->meta_type < NUM_PAGE_TYPES); + platform_assert(cursor->meta_page != NULL); + platform_assert(cursor->entry_idx < cursor->num_entries); - while (TRUE) { - if (cursor->meta_page == NULL) { - if (cursor->meta_addr == 0) { - return MINI_META_CURSOR_END; - } - // Non-blocking: if the meta page isn't resident, kick off a - // single-page prefetch and let the caller retry later. 
- cursor->meta_page = - cache_get(cursor->cc, cursor->meta_addr, FALSE, cursor->meta_type); - if (cursor->meta_page == NULL) { - cache_prefetch_page( - cursor->cc, cursor->meta_addr, cursor->meta_type); - return MINI_META_CURSOR_WOULD_BLOCK; - } - cursor->num_entries = mini_num_entries(cursor->meta_page); - cursor->entry_idx = 0; - } - - if (cursor->entry_idx < cursor->num_entries) { - meta_entry *entry = first_entry(cursor->meta_page) + cursor->entry_idx; - *extent_addr = meta_entry_extent_addr(cursor->cc, entry); - *batch = meta_entry_batch(entry); - cursor->entry_idx++; - return MINI_META_CURSOR_ENTRY; - } - - // Exhausted this page; advance to the next one (if any). - uint64 next_meta_addr = mini_get_next_meta_addr(cursor->meta_page); - cache_unget(cursor->cc, cursor->meta_page); - cursor->meta_page = NULL; - cursor->meta_addr = next_meta_addr; - } + meta_entry *entry = first_entry(cursor->meta_page) + cursor->entry_idx; + *extent_addr = meta_entry_extent_addr(cursor->cc, entry); + *batch = meta_entry_batch(entry); } mini_meta_cursor_status -mini_meta_cursor_seek_extent(mini_meta_cursor *cursor, - uint64 target_extent_addr) +mini_meta_cursor_next(mini_meta_cursor *cursor) { platform_assert(cursor != NULL); platform_assert(cursor->cc != NULL); - platform_assert(target_extent_addr != 0); - platform_assert(target_extent_addr % cache_extent_size(cursor->cc) == 0); + platform_assert(PAGE_TYPE_FIRST <= cursor->meta_type + && cursor->meta_type < NUM_PAGE_TYPES); + platform_assert(cursor->meta_page != NULL); + platform_assert(cursor->entry_idx < cursor->num_entries); - uint64 extent_addr; - uint64 batch; - while (TRUE) { - mini_meta_cursor_status status = - mini_meta_cursor_next(cursor, &extent_addr, &batch); - if (status != MINI_META_CURSOR_ENTRY) { - return status; // END or WOULD_BLOCK - } - if (extent_addr == target_extent_addr) { - return MINI_META_CURSOR_ENTRY; - } + if (cursor->entry_idx + 1 < cursor->num_entries) { + cursor->entry_idx++; + return 
MINI_META_CURSOR_ENTRY; } + + uint64 next_meta_addr = mini_get_next_meta_addr(cursor->meta_page); + if (next_meta_addr == 0) { + return MINI_META_CURSOR_END; + } + + page_handle *next_page = + cache_get(cursor->cc, next_meta_addr, FALSE, cursor->meta_type); + if (next_page == NULL) { + cache_prefetch_page(cursor->cc, next_meta_addr, cursor->meta_type); + return MINI_META_CURSOR_WOULD_BLOCK; + } + + cache_unget(cursor->cc, cursor->meta_page); + cursor->meta_page = next_page; + cursor->meta_addr = next_meta_addr; + cursor->num_entries = mini_num_entries(cursor->meta_page); + platform_assert(cursor->num_entries > 0); + cursor->entry_idx = 0; + return MINI_META_CURSOR_ENTRY; } mini_meta_cursor_status -mini_meta_cursor_prev(mini_meta_cursor *cursor, - uint64 *extent_addr, - uint64 *batch) +mini_meta_cursor_prev(mini_meta_cursor *cursor) { platform_assert(cursor != NULL); platform_assert(cursor->cc != NULL); - platform_assert(extent_addr != NULL); - platform_assert(batch != NULL); platform_assert(PAGE_TYPE_FIRST <= cursor->meta_type && cursor->meta_type < NUM_PAGE_TYPES); + platform_assert(cursor->meta_page != NULL); + platform_assert(cursor->entry_idx < cursor->num_entries); - while (TRUE) { - if (cursor->meta_page == NULL) { - return MINI_META_CURSOR_END; - } - - if (cursor->entry_idx > 0) { - cursor->entry_idx--; - meta_entry *entry = first_entry(cursor->meta_page) + cursor->entry_idx; - *extent_addr = meta_entry_extent_addr(cursor->cc, entry); - *batch = meta_entry_batch(entry); - return MINI_META_CURSOR_ENTRY; - } - - // entry_idx == 0: exhausted this page going backward. 
- mini_meta_hdr *hdr = (mini_meta_hdr *)cursor->meta_page->data; - uint64 prev_addr = hdr->prev_meta_addr; - if (prev_addr == 0) { - return MINI_META_CURSOR_END; - } + if (cursor->entry_idx > 0) { + cursor->entry_idx--; + return MINI_META_CURSOR_ENTRY; + } - // Non-blocking: keep the current page alive so prev_meta_addr remains - // accessible on a WOULD_BLOCK retry — do NOT release before the load. - page_handle *prev_page = - cache_get(cursor->cc, prev_addr, FALSE, cursor->meta_type); - if (prev_page == NULL) { - cache_prefetch_page(cursor->cc, prev_addr, cursor->meta_type); - return MINI_META_CURSOR_WOULD_BLOCK; - } + mini_meta_hdr *hdr = (mini_meta_hdr *)cursor->meta_page->data; + uint64 prev_addr = hdr->prev_meta_addr; + if (prev_addr == 0) { + return MINI_META_CURSOR_END; + } - cache_unget(cursor->cc, cursor->meta_page); - cursor->meta_page = prev_page; - cursor->meta_addr = prev_addr; - cursor->num_entries = mini_num_entries(cursor->meta_page); - cursor->entry_idx = cursor->num_entries; - // Loop: entry_idx == num_entries > 0, will decrement and read. 
+ page_handle *prev_page = + cache_get(cursor->cc, prev_addr, FALSE, cursor->meta_type); + if (prev_page == NULL) { + cache_prefetch_page(cursor->cc, prev_addr, cursor->meta_type); + return MINI_META_CURSOR_WOULD_BLOCK; } + + cache_unget(cursor->cc, cursor->meta_page); + cursor->meta_page = prev_page; + cursor->meta_addr = prev_addr; + cursor->num_entries = mini_num_entries(cursor->meta_page); + platform_assert(cursor->num_entries > 0); + cursor->entry_idx = cursor->num_entries - 1; + return MINI_META_CURSOR_ENTRY; } static void diff --git a/src/mini_allocator.h b/src/mini_allocator.h index 4e35a5b8..8a05a32b 100644 --- a/src/mini_allocator.h +++ b/src/mini_allocator.h @@ -131,53 +131,48 @@ typedef struct mini_meta_cursor { page_type meta_type; page_handle *meta_page; // currently held meta page, or NULL uint64 meta_addr; // addr of meta_page, or the next page to load - uint64 entry_idx; // index of the next entry to read on meta_page + uint64 entry_idx; // index of the current entry on meta_page uint64 num_entries; // number of entries on meta_page } mini_meta_cursor; -// Result of a non-blocking cursor step. +// Result of a non-blocking cursor operation. typedef enum mini_meta_cursor_status { - MINI_META_CURSOR_ENTRY, // produced an entry - MINI_META_CURSOR_END, // stream exhausted - MINI_META_CURSOR_WOULD_BLOCK, // next meta page not resident (prefetch + MINI_META_CURSOR_ENTRY, // cursor is positioned on an entry + MINI_META_CURSOR_END, // stream exhausted, or requested entry absent + MINI_META_CURSOR_WOULD_BLOCK, // needed meta page not resident (prefetch // issued) } mini_meta_cursor_status; -void +// Initialize cursor on target_extent_addr. Non-blocking: returns +// MINI_META_CURSOR_WOULD_BLOCK (and issues a prefetch for it) if a needed meta +// page is not yet resident. On MINI_META_CURSOR_ENTRY, curr is valid. 
+mini_meta_cursor_status mini_meta_cursor_init(mini_meta_cursor *cursor, cache *cc, page_type meta_type, - uint64 meta_addr); + uint64 meta_addr, + uint64 target_extent_addr); void mini_meta_cursor_deinit(mini_meta_cursor *cursor); -// Emit the next extent entry (its extent address and originating batch) in -// allocation order. Non-blocking: returns MINI_META_CURSOR_WOULD_BLOCK (and -// issues a prefetch for it) if the next meta page is not yet resident. -mini_meta_cursor_status -mini_meta_cursor_next(mini_meta_cursor *cursor, +// Get the current extent entry. Requires a successful init, next, or prev. +void +mini_meta_cursor_curr(mini_meta_cursor *cursor, uint64 *extent_addr, uint64 *batch); -// Advance the cursor until it emits the entry for target_extent_addr, leaving -// the cursor positioned just after it. Returns MINI_META_CURSOR_ENTRY if found, -// MINI_META_CURSOR_END if the stream ends first, or -// MINI_META_CURSOR_WOULD_BLOCK if a needed meta page is not yet resident. +// Move to the next extent entry in allocation order. Non-blocking: returns +// MINI_META_CURSOR_WOULD_BLOCK (and issues a prefetch for it) if the next meta +// page is not yet resident. END and WOULD_BLOCK leave curr unchanged. mini_meta_cursor_status -mini_meta_cursor_seek_extent(mini_meta_cursor *cursor, - uint64 target_extent_addr); - -// Emit the previous extent entry (reverse allocation order). The cursor must -// have been positioned by mini_meta_cursor_seek_extent() or a prior call to -// mini_meta_cursor_prev() — calling on a freshly-initialized cursor returns -// END. Non-blocking: if the previous meta page isn't resident, issues a -// single-page prefetch and returns MINI_META_CURSOR_WOULD_BLOCK; the current -// page is kept alive so the retry can follow prev_meta_addr without re-reading. +mini_meta_cursor_next(mini_meta_cursor *cursor); + +// Move to the previous extent entry in allocation order. 
Non-blocking: returns +// MINI_META_CURSOR_WOULD_BLOCK (and issues a prefetch for it) if the previous +// meta page is not yet resident. END and WOULD_BLOCK leave curr unchanged. mini_meta_cursor_status -mini_meta_cursor_prev(mini_meta_cursor *cursor, - uint64 *extent_addr, - uint64 *batch); +mini_meta_cursor_prev(mini_meta_cursor *cursor); /* Return total bytes allocated by the mini_allocator, including space used by * the mini_allocator itself.*/ From d9122e0a4048ea89e857c7ba492143901d176869 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Fri, 26 Jun 2026 17:51:41 -0700 Subject: [PATCH 12/15] mini_meta_cursor cleanup Signed-off-by: Rob Johnson --- src/btree.c | 6 ++++-- src/mini_allocator.c | 45 +++++++++++++++++++------------------------- src/mini_allocator.h | 9 +++++---- 3 files changed, 28 insertions(+), 32 deletions(-) diff --git a/src/btree.c b/src/btree.c index 6e36a3c5..a9faf309 100644 --- a/src/btree.c +++ b/src/btree.c @@ -4104,8 +4104,10 @@ btree_pack_post_loop(btree_pack_req *req, key last_key) debug_assert(success); btree_node_lock(cc, cfg, &root); memmove(root.hdr, req->edge[req->height][0].hdr, btree_page_size(cfg)); - root.hdr->prev_addr = 0; - root.hdr->next_addr = 0; + // The root is allocated outside the mini allocator's extent stream. 
+ root.hdr->meta_page_addr = 0; + root.hdr->prev_addr = 0; + root.hdr->next_addr = 0; btree_hdr_set_first_in_level(root.hdr); btree_hdr_set_last_in_level(root.hdr); btree_node_full_unlock(cc, cfg, &root); diff --git a/src/mini_allocator.c b/src/mini_allocator.c index 75684e0a..1099641f 100644 --- a/src/mini_allocator.c +++ b/src/mini_allocator.c @@ -943,35 +943,28 @@ mini_meta_cursor_init(mini_meta_cursor *cursor, cursor->entry_idx = 0; cursor->num_entries = 0; - while (TRUE) { - if (cursor->meta_addr == 0) { - return MINI_META_CURSOR_END; - } - - cursor->meta_page = - cache_get(cursor->cc, cursor->meta_addr, FALSE, cursor->meta_type); - if (cursor->meta_page == NULL) { - cache_prefetch_page(cursor->cc, cursor->meta_addr, cursor->meta_type); - return MINI_META_CURSOR_WOULD_BLOCK; - } + cursor->meta_page = + cache_get(cursor->cc, cursor->meta_addr, FALSE, cursor->meta_type); + if (cursor->meta_page == NULL) { + cache_prefetch_page(cursor->cc, cursor->meta_addr, cursor->meta_type); + return MINI_META_CURSOR_WOULD_BLOCK; + } - cursor->num_entries = mini_num_entries(cursor->meta_page); - cursor->entry_idx = 0; - while (cursor->entry_idx < cursor->num_entries) { - meta_entry *entry = first_entry(cursor->meta_page) + cursor->entry_idx; - if (meta_entry_extent_addr(cursor->cc, entry) == target_extent_addr) { - return MINI_META_CURSOR_ENTRY; - } - cursor->entry_idx++; + cursor->num_entries = mini_num_entries(cursor->meta_page); + cursor->entry_idx = 0; + while (cursor->entry_idx < cursor->num_entries) { + meta_entry *entry = first_entry(cursor->meta_page) + cursor->entry_idx; + if (meta_entry_extent_addr(cursor->cc, entry) == target_extent_addr) { + return MINI_META_CURSOR_ENTRY; } - - uint64 next_meta_addr = mini_get_next_meta_addr(cursor->meta_page); - cache_unget(cursor->cc, cursor->meta_page); - cursor->meta_page = NULL; - cursor->meta_addr = next_meta_addr; - cursor->entry_idx = 0; - cursor->num_entries = 0; + cursor->entry_idx++; } + + platform_assert(FALSE, + 
"target extent %lu not found on meta page %lu", + target_extent_addr, + meta_addr); + return MINI_META_CURSOR_END; } void diff --git a/src/mini_allocator.h b/src/mini_allocator.h index 8a05a32b..1a7995b7 100644 --- a/src/mini_allocator.h +++ b/src/mini_allocator.h @@ -138,14 +138,15 @@ typedef struct mini_meta_cursor { // Result of a non-blocking cursor operation. typedef enum mini_meta_cursor_status { MINI_META_CURSOR_ENTRY, // cursor is positioned on an entry - MINI_META_CURSOR_END, // stream exhausted, or requested entry absent + MINI_META_CURSOR_END, // stream exhausted MINI_META_CURSOR_WOULD_BLOCK, // needed meta page not resident (prefetch // issued) } mini_meta_cursor_status; -// Initialize cursor on target_extent_addr. Non-blocking: returns -// MINI_META_CURSOR_WOULD_BLOCK (and issues a prefetch for it) if a needed meta -// page is not yet resident. On MINI_META_CURSOR_ENTRY, curr is valid. +// Initialize cursor on target_extent_addr, which must be listed on meta_addr. +// Non-blocking: returns MINI_META_CURSOR_WOULD_BLOCK (and issues a prefetch for +// it) if meta_addr is not yet resident. On MINI_META_CURSOR_ENTRY, curr is +// valid. Asserts if target_extent_addr is not listed on meta_addr. 
mini_meta_cursor_status mini_meta_cursor_init(mini_meta_cursor *cursor, cache *cc, From b63421bd2d41498771b5920564609bfe78d3e219 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Fri, 26 Jun 2026 18:01:31 -0700 Subject: [PATCH 13/15] btree sibling traversal helpers cleanup Signed-off-by: Rob Johnson --- src/btree.c | 105 +++++++++++++++++++------------------------- src/btree_private.h | 44 +++++++++++++++++++ 2 files changed, 89 insertions(+), 60 deletions(-) diff --git a/src/btree.c b/src/btree.c index a9faf309..8e227e28 100644 --- a/src/btree.c +++ b/src/btree.c @@ -2661,15 +2661,14 @@ static inline uint64 btree_iterator_curr_live_prev_addr(btree_iterator *itor) { if (!btree_iterator_curr_is_copy(itor)) { - return btree_hdr_prev_addr( - itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type); + return btree_node_prev_addr(itor->cfg, &itor->curr, itor->page_type); } btree_node live_curr; live_curr.addr = itor->curr.addr; btree_node_get(itor->cc, itor->cfg, &live_curr, itor->page_type); - uint64 prev_addr = btree_hdr_prev_addr( - itor->cfg, live_curr.hdr, live_curr.addr, itor->page_type); + uint64 prev_addr = + btree_node_prev_addr(itor->cfg, &live_curr, itor->page_type); btree_node_unget(itor->cc, itor->cfg, &live_curr); return prev_addr; } @@ -2688,22 +2687,20 @@ btree_iterator_end_key_beyond_curr(btree_iterator *itor) uint64 num_entries = btree_num_entries(itor->curr.hdr); if (key_is_positive_infinity(itor->max_key)) { - return btree_hdr_next_addr( - itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type) + return btree_node_next_addr(itor->cfg, &itor->curr, itor->page_type) != 0; } if (num_entries == 0 || itor->height > btree_height(itor->curr.hdr)) { return num_entries == 0 - && btree_hdr_next_addr( - itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type) + && btree_node_next_addr( + itor->cfg, &itor->curr, itor->page_type) != 0; } key last_key = btree_iterator_get_node_key(itor, itor->curr.hdr, num_entries - 1); return 
btree_key_compare(itor->cfg, itor->max_key, last_key) > 0 - && btree_hdr_next_addr( - itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type) + && btree_node_next_addr(itor->cfg, &itor->curr, itor->page_type) != 0; } @@ -3072,10 +3069,10 @@ btree_iterator_prefetch_on_advance(btree_iterator *itor, { uint64 extent_addr = going_forward - ? btree_hdr_next_extent_addr( - itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type) - : btree_hdr_prev_extent_addr( - itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type); + ? btree_node_next_extent_addr( + itor->cfg, &itor->curr, itor->page_type) + : btree_node_prev_extent_addr( + itor->cfg, &itor->curr, itor->page_type); if (extent_addr != 0 && (!going_forward @@ -3097,8 +3094,8 @@ static void btree_iterator_next_leaf(btree_iterator *itor) { uint64 last_addr = itor->curr.addr; - uint64 next_addr = btree_hdr_next_addr( - itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type); + uint64 next_addr = + btree_node_next_addr(itor->cfg, &itor->curr, itor->page_type); btree_iterator_release_curr(itor); btree_iterator_get_curr_addr(itor, next_addr); itor->idx = 0; @@ -3113,10 +3110,9 @@ btree_iterator_next_leaf_async(btree_iterator_async_state *state, uint64 depth) async_begin(state, depth); state->last_addr = state->itor->curr.addr; - state->next_addr = btree_hdr_next_addr(state->itor->cfg, - state->itor->curr.hdr, - state->itor->curr.addr, - state->itor->page_type); + state->next_addr = btree_node_next_addr(state->itor->cfg, + &state->itor->curr, + state->itor->page_type); btree_iterator_release_curr(state->itor); state->itor->curr.addr = state->next_addr; @@ -3166,12 +3162,10 @@ btree_iterator_prev_leaf(btree_iterator *itor) * old curr node and the new one. In this case, we can just walk * forward until we find the leaf whose successor is our old leaf. 
*/ - while ( - btree_hdr_next_addr(cfg, itor->curr.hdr, itor->curr.addr, itor->page_type) - != last_addr) + while (btree_node_next_addr(cfg, &itor->curr, itor->page_type) != last_addr) { - uint64 next_addr = btree_hdr_next_addr( - cfg, itor->curr.hdr, itor->curr.addr, itor->page_type); + uint64 next_addr = + btree_node_next_addr(cfg, &itor->curr, itor->page_type); btree_iterator_release_curr(itor); btree_iterator_get_curr_addr(itor, next_addr); } @@ -3187,9 +3181,7 @@ btree_iterator_prev_leaf(btree_iterator *itor) itor->curr_min_idx = find_key_in_node( itor, itor->curr.hdr, itor->min_key, itor->min_key_comparison, NULL); } - if (btree_hdr_prev_addr( - cfg, itor->curr.hdr, itor->curr.addr, itor->page_type) - == 0 + if (btree_node_prev_addr(cfg, &itor->curr, itor->page_type) == 0 && itor->curr_min_idx == -1) { itor->curr_min_idx = 0; @@ -3217,16 +3209,14 @@ btree_iterator_prev_leaf_async(btree_iterator_async_state *state, uint64 depth) state->live_curr.page = cache_get_async_state_result(&state->cache_get_state); state->live_curr.hdr = (btree_hdr *)state->live_curr.page->data; - state->prev_addr = btree_hdr_prev_addr(state->itor->cfg, - state->live_curr.hdr, - state->live_curr.addr, - state->itor->page_type); + state->prev_addr = btree_node_prev_addr(state->itor->cfg, + &state->live_curr, + state->itor->page_type); btree_node_unget(state->itor->cc, state->itor->cfg, &state->live_curr); } else { - state->prev_addr = btree_hdr_prev_addr(state->itor->cfg, - state->itor->curr.hdr, - state->itor->curr.addr, - state->itor->page_type); + state->prev_addr = btree_node_prev_addr(state->itor->cfg, + &state->itor->curr, + state->itor->page_type); } btree_iterator_release_curr(state->itor); state->itor->curr.addr = state->prev_addr; @@ -3249,16 +3239,14 @@ btree_iterator_prev_leaf_async(btree_iterator_async_state *state, uint64 depth) * old curr node and the new one. In this case, we can just walk * forward until we find the leaf whose successor is our old leaf. 
*/ - while (btree_hdr_next_addr(state->itor->cfg, - state->itor->curr.hdr, - state->itor->curr.addr, - state->itor->page_type) + while (btree_node_next_addr(state->itor->cfg, + &state->itor->curr, + state->itor->page_type) != state->curr_addr) { - state->next_addr = btree_hdr_next_addr(state->itor->cfg, - state->itor->curr.hdr, - state->itor->curr.addr, - state->itor->page_type); + state->next_addr = btree_node_next_addr(state->itor->cfg, + &state->itor->curr, + state->itor->page_type); btree_iterator_release_curr(state->itor); state->itor->curr.addr = state->next_addr; @@ -3296,10 +3284,9 @@ btree_iterator_prev_leaf_async(btree_iterator_async_state *state, uint64 depth) state->itor->min_key_comparison, NULL); } - if (btree_hdr_prev_addr(state->itor->cfg, - state->itor->curr.hdr, - state->itor->curr.addr, - state->itor->page_type) + if (btree_node_prev_addr(state->itor->cfg, + &state->itor->curr, + state->itor->page_type) == 0 && state->itor->curr_min_idx == -1) { @@ -3467,8 +3454,7 @@ find_btree_node_and_get_idx_bounds(btree_iterator *itor, // if min_key is not within the current node but there is no previous node // then set curr_min_idx to 0 if (itor->curr_min_idx == -1 - && btree_hdr_prev_addr( - itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type) + && btree_node_prev_addr(itor->cfg, &itor->curr, itor->page_type) == 0) { itor->curr_min_idx = 0; @@ -3561,10 +3547,9 @@ find_btree_node_and_get_idx_bounds_async(btree_iterator_async_state *state, // if min_key is not within the current node but there is no previous node // then set curr_min_idx to 0 if (state->itor->curr_min_idx == -1 - && btree_hdr_prev_addr(state->itor->cfg, - state->itor->curr.hdr, - state->itor->curr.addr, - state->itor->page_type) + && btree_node_prev_addr(state->itor->cfg, + &state->itor->curr, + state->itor->page_type) == 0) { state->itor->curr_min_idx = 0; @@ -3776,8 +3761,8 @@ btree_iterator_init(cache *cc, btree_prefetch_cursor_start(itor, TRUE); // While the deep cursor is 
priming or disabled, keep the next forward extent // warm when the leaf header names one. - uint64 next_extent_addr = btree_hdr_next_extent_addr( - itor->cfg, itor->curr.hdr, itor->curr.addr, itor->page_type); + uint64 next_extent_addr = + btree_node_next_extent_addr(itor->cfg, &itor->curr, itor->page_type); if (itor->prefetch.state != BTREE_PREFETCH_ACTIVE && btree_iterator_prefetch_enabled(itor) && next_extent_addr != 0 && !btree_addrs_share_extent(cc, itor->curr.addr, itor->end_addr)) @@ -3821,10 +3806,10 @@ btree_iterator_init_async(btree_iterator_async_state *state) btree_prefetch_cursor_start(state->itor, TRUE); // While the deep cursor is priming or disabled, keep the next forward extent // warm when the leaf header names one. - uint64 next_extent_addr = btree_hdr_next_extent_addr(state->itor->cfg, - state->itor->curr.hdr, - state->itor->curr.addr, - state->itor->page_type); + uint64 next_extent_addr = + btree_node_next_extent_addr(state->itor->cfg, + &state->itor->curr, + state->itor->page_type); if (state->itor->prefetch.state != BTREE_PREFETCH_ACTIVE && btree_iterator_prefetch_enabled(state->itor) && next_extent_addr != 0 && !btree_addrs_share_extent( diff --git a/src/btree_private.h b/src/btree_private.h index 6f692eb2..3b40a92d 100644 --- a/src/btree_private.h +++ b/src/btree_private.h @@ -294,6 +294,50 @@ btree_hdr_prev_extent_addr(const btree_config *cfg, return hdr->prev_addr; } +static inline uint64 +btree_node_next_addr(const btree_config *cfg, + const btree_node *node, + page_type type) +{ + platform_assert(node != NULL); + platform_assert(node->hdr != NULL); + + return btree_hdr_next_addr(cfg, node->hdr, node->addr, type); +} + +static inline uint64 +btree_node_prev_addr(const btree_config *cfg, + const btree_node *node, + page_type type) +{ + platform_assert(node != NULL); + platform_assert(node->hdr != NULL); + + return btree_hdr_prev_addr(cfg, node->hdr, node->addr, type); +} + +static inline uint64 +btree_node_next_extent_addr(const 
btree_config *cfg, + const btree_node *node, + page_type type) +{ + platform_assert(node != NULL); + platform_assert(node->hdr != NULL); + + return btree_hdr_next_extent_addr(cfg, node->hdr, node->addr, type); +} + +static inline uint64 +btree_node_prev_extent_addr(const btree_config *cfg, + const btree_node *node, + page_type type) +{ + platform_assert(node != NULL); + platform_assert(node->hdr != NULL); + + return btree_hdr_prev_extent_addr(cfg, node->hdr, node->addr, type); +} + static inline void btree_init_hdr(const btree_config *cfg, btree_hdr *hdr) { From e3b86e3e45cad0a58020ed304f2be42db1a47196 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Fri, 26 Jun 2026 18:57:11 -0700 Subject: [PATCH 14/15] strengthen prefetch invariants Signed-off-by: Rob Johnson --- src/btree.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/btree.c b/src/btree.c index 8e227e28..e7d46977 100644 --- a/src/btree.c +++ b/src/btree.c @@ -2948,12 +2948,7 @@ btree_prefetch_cursor_pump(btree_iterator *itor) if (status == MINI_META_CURSOR_WOULD_BLOCK) { return FALSE; } - if (status != MINI_META_CURSOR_ENTRY) { - // No extent list entry is available; use the header extent links. 
- mini_meta_cursor_deinit(&pf->meta_cursor); - pf->state = BTREE_PREFETCH_DISABLED; - return FALSE; - } + platform_assert(status == MINI_META_CURSOR_ENTRY); pf->state = BTREE_PREFETCH_ACTIVE; pf->at_end = FALSE; From f1852f8e418fe5987e42045ac168b912ab1e7808 Mon Sep 17 00:00:00 2001 From: Rob Johnson Date: Fri, 26 Jun 2026 18:57:20 -0700 Subject: [PATCH 15/15] formatting Signed-off-by: Rob Johnson --- src/btree.c | 64 +++++++++++++++++++++-------------------------------- 1 file changed, 25 insertions(+), 39 deletions(-) diff --git a/src/btree.c b/src/btree.c index e7d46977..6299f92b 100644 --- a/src/btree.c +++ b/src/btree.c @@ -2687,21 +2687,18 @@ btree_iterator_end_key_beyond_curr(btree_iterator *itor) uint64 num_entries = btree_num_entries(itor->curr.hdr); if (key_is_positive_infinity(itor->max_key)) { - return btree_node_next_addr(itor->cfg, &itor->curr, itor->page_type) - != 0; + return btree_node_next_addr(itor->cfg, &itor->curr, itor->page_type) != 0; } if (num_entries == 0 || itor->height > btree_height(itor->curr.hdr)) { return num_entries == 0 - && btree_node_next_addr( - itor->cfg, &itor->curr, itor->page_type) + && btree_node_next_addr(itor->cfg, &itor->curr, itor->page_type) != 0; } key last_key = btree_iterator_get_node_key(itor, itor->curr.hdr, num_entries - 1); return btree_key_compare(itor->cfg, itor->max_key, last_key) > 0 - && btree_node_next_addr(itor->cfg, &itor->curr, itor->page_type) - != 0; + && btree_node_next_addr(itor->cfg, &itor->curr, itor->page_type) != 0; } static void @@ -3062,12 +3059,11 @@ btree_iterator_prefetch_on_advance(btree_iterator *itor, } else if (btree_iterator_prefetch_enabled(itor) && (crossed_extent || restarted)) { - uint64 extent_addr = - going_forward - ? btree_node_next_extent_addr( - itor->cfg, &itor->curr, itor->page_type) - : btree_node_prev_extent_addr( - itor->cfg, &itor->curr, itor->page_type); + uint64 extent_addr = going_forward + ? 
btree_node_next_extent_addr( + itor->cfg, &itor->curr, itor->page_type) + : btree_node_prev_extent_addr( + itor->cfg, &itor->curr, itor->page_type); if (extent_addr != 0 && (!going_forward @@ -3105,9 +3101,8 @@ btree_iterator_next_leaf_async(btree_iterator_async_state *state, uint64 depth) async_begin(state, depth); state->last_addr = state->itor->curr.addr; - state->next_addr = btree_node_next_addr(state->itor->cfg, - &state->itor->curr, - state->itor->page_type); + state->next_addr = btree_node_next_addr( + state->itor->cfg, &state->itor->curr, state->itor->page_type); btree_iterator_release_curr(state->itor); state->itor->curr.addr = state->next_addr; @@ -3204,14 +3199,12 @@ btree_iterator_prev_leaf_async(btree_iterator_async_state *state, uint64 depth) state->live_curr.page = cache_get_async_state_result(&state->cache_get_state); state->live_curr.hdr = (btree_hdr *)state->live_curr.page->data; - state->prev_addr = btree_node_prev_addr(state->itor->cfg, - &state->live_curr, - state->itor->page_type); + state->prev_addr = btree_node_prev_addr( + state->itor->cfg, &state->live_curr, state->itor->page_type); btree_node_unget(state->itor->cc, state->itor->cfg, &state->live_curr); } else { - state->prev_addr = btree_node_prev_addr(state->itor->cfg, - &state->itor->curr, - state->itor->page_type); + state->prev_addr = btree_node_prev_addr( + state->itor->cfg, &state->itor->curr, state->itor->page_type); } btree_iterator_release_curr(state->itor); state->itor->curr.addr = state->prev_addr; @@ -3234,14 +3227,12 @@ btree_iterator_prev_leaf_async(btree_iterator_async_state *state, uint64 depth) * old curr node and the new one. In this case, we can just walk * forward until we find the leaf whose successor is our old leaf. 
*/ - while (btree_node_next_addr(state->itor->cfg, - &state->itor->curr, - state->itor->page_type) + while (btree_node_next_addr( + state->itor->cfg, &state->itor->curr, state->itor->page_type) != state->curr_addr) { - state->next_addr = btree_node_next_addr(state->itor->cfg, - &state->itor->curr, - state->itor->page_type); + state->next_addr = btree_node_next_addr( + state->itor->cfg, &state->itor->curr, state->itor->page_type); btree_iterator_release_curr(state->itor); state->itor->curr.addr = state->next_addr; @@ -3279,9 +3270,8 @@ btree_iterator_prev_leaf_async(btree_iterator_async_state *state, uint64 depth) state->itor->min_key_comparison, NULL); } - if (btree_node_prev_addr(state->itor->cfg, - &state->itor->curr, - state->itor->page_type) + if (btree_node_prev_addr( + state->itor->cfg, &state->itor->curr, state->itor->page_type) == 0 && state->itor->curr_min_idx == -1) { @@ -3449,8 +3439,7 @@ find_btree_node_and_get_idx_bounds(btree_iterator *itor, // if min_key is not within the current node but there is no previous node // then set curr_min_idx to 0 if (itor->curr_min_idx == -1 - && btree_node_prev_addr(itor->cfg, &itor->curr, itor->page_type) - == 0) + && btree_node_prev_addr(itor->cfg, &itor->curr, itor->page_type) == 0) { itor->curr_min_idx = 0; } @@ -3542,9 +3531,8 @@ find_btree_node_and_get_idx_bounds_async(btree_iterator_async_state *state, // if min_key is not within the current node but there is no previous node // then set curr_min_idx to 0 if (state->itor->curr_min_idx == -1 - && btree_node_prev_addr(state->itor->cfg, - &state->itor->curr, - state->itor->page_type) + && btree_node_prev_addr( + state->itor->cfg, &state->itor->curr, state->itor->page_type) == 0) { state->itor->curr_min_idx = 0; @@ -3801,10 +3789,8 @@ btree_iterator_init_async(btree_iterator_async_state *state) btree_prefetch_cursor_start(state->itor, TRUE); // While the deep cursor is priming or disabled, keep the next forward extent // warm when the leaf header names one. 
- uint64 next_extent_addr = - btree_node_next_extent_addr(state->itor->cfg, - &state->itor->curr, - state->itor->page_type); + uint64 next_extent_addr = btree_node_next_extent_addr( + state->itor->cfg, &state->itor->curr, state->itor->page_type); if (state->itor->prefetch.state != BTREE_PREFETCH_ACTIVE && btree_iterator_prefetch_enabled(state->itor) && next_extent_addr != 0 && !btree_addrs_share_extent(