diff --git a/benchmarks/benchmark_utils.hpp b/benchmarks/benchmark_utils.hpp index e0acc0bb9..2f02ec3b0 100644 --- a/benchmarks/benchmark_utils.hpp +++ b/benchmarks/benchmark_utils.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -65,7 +66,9 @@ struct lazy_discard { __device__ void device_dispatch(index_type index, value_type const& value) const { // pick some predicate that is always false, but depends on the runtime value - if (threadIdx.x > 2025 + *reinterpret_cast(&value)) { *(it + index) = value; } + if (threadIdx.x > 2025 + *reinterpret_cast(&value) + static_cast(index)) { + *(it + index) = value; + } } __host__ __device__ void operator()(index_type index, value_type const& value) const { @@ -94,6 +97,21 @@ NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::gaussian, "GAUSSIAN", "distribution::gaussian"); +NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::XXHash_64, "xxhash_64", "cuco::xxhash_64"); +NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::XXHash_32, "xxhash_32", "cuco::xxhash_32"); +NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::MurmurHash3_32, + "murmurhash3_32", + "cuco::murmurhash3_32"); +NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::MurmurHash3_x86_128, + "murmurhash3_x86_128", + "cuco::murmurhash3_x86_128"); +NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::MurmurHash3_x64_128, + "murmurhash3_x64_128", + "cuco::murmurhash3_x64_128"); +NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::identity_hash, + "identity_hash", + "cuco::identity_hash"); + #if defined(CUCO_HAS_128BIT_ATOMICS) NVBENCH_DECLARE_TYPE_STRINGS(__int128_t, "I128", "__int128_t"); #endif diff --git a/benchmarks/bloom_filter/add_bench.cu b/benchmarks/bloom_filter/add_bench.cu index b07c285d4..28e1c4182 100644 --- a/benchmarks/bloom_filter/add_bench.cu +++ b/benchmarks/bloom_filter/add_bench.cu @@ -15,7 +15,6 @@ */ #include "defaults.hpp" -#include "utils.hpp" #include #include @@ -24,143 +23,121 @@ #include -#include #include +#include +#include +#include #include #include #include -using namespace cuco::benchmark; // defaults, dist_from_state, rebind_hasher_t, add_fpr_summary +using namespace cuco::benchmark; // defaults, dist_from_state, rebind_hasher_t using namespace cuco::utility; // key_generator, distribution /** * @brief A benchmark evaluating `cuco::bloom_filter::add_async` performance */ -template +template void bloom_filter_add(nvbench::state& state, - nvbench::type_list, Dist>) + nvbench::type_list, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type>) { - using size_type = std::uint32_t; - using policy_type = cuco::default_filter_policy, - Word, - static_cast(WordsPerBlock)>; - using filter_type = - cuco::bloom_filter, cuda::thread_scope_device, policy_type>; - - constexpr auto filter_block_size = - sizeof(typename filter_type::word_type) * filter_type::words_per_block; - - auto const num_keys = state.get_int64("NumInputs"); - auto const filter_size_mb = state.get_int64("FilterSizeMB"); - auto const pattern_bits = WordsPerBlock; - - try { - [[maybe_unused]] auto const policy = policy_type{static_cast(pattern_bits)}; - } catch (std::exception const& e) { - state.skip(e.what()); // skip invalid configurations + auto constexpr words_per_block = BlockBits / cuda::std::numeric_limits::digits; + auto constexpr pattern_bits_per_word = PatternBits / words_per_block; + + // Check for a valid configuration + if constexpr ((not cuda::std::has_single_bit(static_cast(BlockBits))) or + (words_per_block == 0)) { + state.skip("Invalid filter block size"); + } else if constexpr (HorizontalLayout * VerticalLayout != words_per_block) { + state.skip("Invalid vectorization layout"); + } else if constexpr ((pattern_bits_per_word <= 0) or + (pattern_bits_per_word > cuda::std::numeric_limits::digits) or + (pattern_bits_per_word * words_per_block > 64)) { + state.skip("Invalid pattern bits per word"); + } else { + using size_type = std::uint32_t; + using hasher = cuco::xxhash_64; + auto constexpr contains_vertical_layout = words_per_block; + auto constexpr contains_horizontal_layout = 1; + using policy_type = cuco::parametric_filter_policy; + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, policy_type>; + + auto const num_keys = state.get_int64("NumInputs"); + auto const filter_size_mb = state.get_int64("FilterSizeMB"); + + std::size_t const num_sub_filters = + (filter_size_mb * 1024 * 1024) / + (sizeof(typename filter_type::word_type) * filter_type::words_per_block); + + if (num_sub_filters > policy_type::max_filter_blocks) { + // skip invalid configurations + state.skip("num_sub_filters exceeds max_filter_blocks"); + } + + state.add_element_count(num_keys); + + filter_type filter{static_cast(num_sub_filters)}; + + thrust::device_vector keys(num_keys); + thrust::sequence(thrust::device, keys.begin(), keys.end(), 0); + + state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + timer.start(); + filter.add_async(keys.begin(), keys.end(), {launch.get_stream()}); + timer.stop(); + filter.clear_async({launch.get_stream()}); + }); } - - std::size_t const num_sub_filters = (filter_size_mb * 1024 * 1024) / filter_block_size; - - if (num_sub_filters > std::numeric_limits::max()) { - state.skip("num_sub_filters too large for size_type"); // skip invalid configurations - } - - cuda::counting_iterator keys(0); - - state.add_element_count(num_keys); - - filter_type filter{ - static_cast(num_sub_filters), {}, {static_cast(pattern_bits)}}; - - state.collect_dram_throughput(); - state.collect_l2_hit_rates(); - - add_fpr_summary(state, filter); - - state.exec([&](nvbench::launch& launch) { - filter.add_async(keys, keys + num_keys, {launch.get_stream()}); - }); -} - -/** - * @brief A benchmark evaluating `cuco::bloom_filter::add_async` performance with - * `arrow_filter_policy` - */ -template -void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list) -{ - using size_type = std::uint32_t; - using policy_type = cuco::arrow_filter_policy; - using filter_type = - cuco::bloom_filter, cuda::thread_scope_device, policy_type>; - - auto const num_keys = state.get_int64("NumInputs"); - auto const filter_size_mb = state.get_int64("FilterSizeMB"); - - std::size_t const num_sub_filters = - (filter_size_mb * 1024 * 1024) / - (sizeof(typename filter_type::word_type) * filter_type::words_per_block); - - if (num_sub_filters > policy_type::max_filter_blocks) { - state.skip("bloom filter with arrow policy should have <= 4194304 blocks"); // skip invalid - // configurations - } - - cuda::counting_iterator keys(0); - - state.add_element_count(num_keys); - - filter_type filter{static_cast(num_sub_filters)}; - - state.collect_dram_throughput(); - state.collect_l2_hit_rates(); - - add_fpr_summary(state, filter); - - state.exec([&](nvbench::launch& launch) { - filter.add_async(keys, keys + num_keys, {launch.get_stream()}); - }); } +// Default benchmark: single layout matching `cuco::default_filter_policy` (256-bit block, +// 8-bit fingerprint, fully horizontal add) swept across the standard FilterSizeMB range. NVBENCH_BENCH_TYPES(bloom_filter_add, NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list, - nvbench::type_list)) + nvbench::type_list, ///< Word + nvbench::enum_type_list<256>, ///< BlockBits + nvbench::enum_type_list<8>, ///< PatternBits + nvbench::enum_type_list<8>, ///< HorizontalLayout + nvbench::enum_type_list<1> ///< VerticalLayout + )) .set_name("bloom_filter_add_unique_size") - .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) + .set_type_axes_names( + {"Key", "Word", "BlockBits", "PatternBits", "HorizontalLayout", "VerticalLayout"}) .add_int64_axis("NumInputs", {defaults::BF_N}) .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE); -NVBENCH_BENCH_TYPES(bloom_filter_add, - NVBENCH_TYPE_AXES(nvbench::type_list, - defaults::HASH_RANGE, - nvbench::type_list, - nvbench::enum_type_list, - nvbench::type_list)) - .set_name("bloom_filter_add_unique_hash") - .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) - .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}); - -NVBENCH_BENCH_TYPES(bloom_filter_add, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list<1, 2, 4, 8>, - nvbench::type_list)) - .set_name("bloom_filter_add_unique_block_dim") - .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) - .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}); - -NVBENCH_BENCH_TYPES(arrow_bloom_filter_add, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list)) - .set_name("arrow_bloom_filter_add_unique_size") - .set_type_axes_names({"Key", "Distribution"}) - .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE); +// Exhaustive sweep across block sizes and vectorization layouts. Uncomment for performance +// tuning / paper-style characterization; not run by default because the matrix is large. +// NVBENCH_BENCH_TYPES( +// bloom_filter_add, +// NVBENCH_TYPE_AXES(nvbench::type_list, +// nvbench::type_list, ///< Word +// nvbench::enum_type_list<64, 128, 256, 512, 1024>, ///< BlockBits +// nvbench::enum_type_list<8, 16>, ///< PatternBits +// nvbench::enum_type_list<1, 2, 4, 8, 16>, ///< +// HorizontalLayout nvbench::enum_type_list<1, 2, 4, 8, 16> ///< VerticalLayout +// )) +// .set_name("bloom_filter_add_full_sweep_u64") +// .set_type_axes_names( +// {"Key", "Word", "BlockBits", "PatternBits", "HorizontalLayout", "VerticalLayout"}) +// .add_int64_axis("NumInputs", {defaults::BF_N}) +// .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE); \ No newline at end of file diff --git a/benchmarks/bloom_filter/contains_bench.cu b/benchmarks/bloom_filter/contains_bench.cu index 7a4549524..ba9c59957 100644 --- a/benchmarks/bloom_filter/contains_bench.cu +++ b/benchmarks/bloom_filter/contains_bench.cu @@ -15,7 +15,6 @@ */ #include "defaults.hpp" -#include "utils.hpp" #include #include @@ -24,157 +23,143 @@ #include -#include #include +#include #include +#include +#include #include #include -using namespace cuco::benchmark; // defaults, dist_from_state, rebind_hasher_t, add_fpr_summary +using namespace cuco::benchmark; // defaults, dist_from_state, rebind_hasher_t using namespace cuco::utility; // key_generator, distribution /** * @brief A benchmark evaluating `cuco::bloom_filter::contains_async` performance */ -template -void bloom_filter_contains( - nvbench::state& state, - nvbench::type_list, Dist>) +template +void bloom_filter_contains(nvbench::state& state, + nvbench::type_list, + nvbench::enum_type, + nvbench::enum_type, + nvbench::enum_type>) { - using size_type = std::uint32_t; - using policy_type = cuco::default_filter_policy, - Word, - static_cast(WordsPerBlock)>; - using filter_type = - cuco::bloom_filter, cuda::thread_scope_device, policy_type>; - - constexpr auto filter_block_size = - sizeof(typename filter_type::word_type) * filter_type::words_per_block; - - // if (filter_block_size <= 32) { - // cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); // slightly improves peformance if - // filter block fits into a 32B sector - // } - - auto const num_keys = state.get_int64("NumInputs"); - auto const filter_size_mb = state.get_int64("FilterSizeMB"); - auto const pattern_bits = WordsPerBlock; - - try { - [[maybe_unused]] auto const policy = policy_type{static_cast(pattern_bits)}; - } catch (std::exception const& e) { - state.skip(e.what()); // skip invalid configurations + auto constexpr words_per_block = BlockBits / cuda::std::numeric_limits::digits; + auto constexpr pattern_bits_per_word = PatternBits / words_per_block; + + // Check for a valid configuration + if constexpr ((not cuda::std::has_single_bit(static_cast(BlockBits))) or + (words_per_block == 0)) { + state.skip("Invalid filter block size"); + } else if constexpr (HorizontalLayout * VerticalLayout > words_per_block) { + state.skip("Invalid vectorization layout"); // TODO check if this is correct + } else if constexpr ((pattern_bits_per_word <= 0) or + (pattern_bits_per_word > cuda::std::numeric_limits::digits) or + (pattern_bits_per_word * words_per_block > 64)) { + state.skip("Invalid pattern bits per word"); + } else { + using size_type = std::uint32_t; + using hasher = cuco::xxhash_64; + auto constexpr add_vertical_layout = 1; + auto constexpr add_horizontal_layout = words_per_block; + using policy_type = cuco::parametric_filter_policy; + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, policy_type>; + + auto const num_keys = state.get_int64("NumInputs"); + auto const filter_size_mb = state.get_int64("FilterSizeMB"); + + std::size_t const num_sub_filters = + (filter_size_mb * 1024 * 1024) / + (sizeof(typename filter_type::word_type) * filter_type::words_per_block); + + if (num_sub_filters > policy_type::max_filter_blocks) { + // skip invalid configurations + state.skip("num_sub_filters exceeds max_filter_blocks"); + } + + state.add_element_count(num_keys); + + filter_type filter{static_cast(num_sub_filters)}; + + thrust::counting_iterator key_it(0); + + // insert FPR-optimal number of keys + auto const num_build_keys = (filter_size_mb * 1024 * 1024 * 8) / (2 * PatternBits); + filter.add(key_it, key_it + num_build_keys); + + // FPR summary + thrust::device_vector result(num_keys, false); + filter.contains(key_it + num_build_keys, key_it + num_build_keys + num_keys, result.begin()); + + double const fp = thrust::count(thrust::device, result.begin(), result.end(), true); + + auto& summ_fpr = state.add_summary("FalsePositiveRate"); + summ_fpr.set_string("hint", "FPR"); + summ_fpr.set_string("short_name", "FPR"); + summ_fpr.set_string("description", "False-positive rate of the bloom filter."); + summ_fpr.set_float64("value", fp / static_cast(num_keys)); + + state.collect_dram_throughput(); + state.collect_l2_hit_rates(); + + thrust::device_vector keys(num_keys); + thrust::sequence(thrust::device, keys.begin(), keys.end(), 0); + + state.add_global_memory_reads(num_keys * + ((words_per_block * sizeof(Word)) + sizeof(Key))); + state.add_global_memory_writes(num_keys * sizeof(bool)); + + state.exec([&](nvbench::launch& launch) { + filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()}); + }); } - - std::size_t const num_sub_filters = (filter_size_mb * 1024 * 1024) / filter_block_size; - - if (num_sub_filters > std::numeric_limits::max()) { - state.skip("num_sub_filters too large for size_type"); // skip invalid configurations - } - - cuda::counting_iterator keys(0); - thrust::device_vector result(num_keys, false); - - state.add_element_count(num_keys); - - filter_type filter{ - static_cast(num_sub_filters), {}, {static_cast(pattern_bits)}}; - - state.collect_dram_throughput(); - state.collect_l2_hit_rates(); - - add_fpr_summary(state, filter); - - filter.add(keys, keys + num_keys); - - state.exec([&](nvbench::launch& launch) { - filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()}); - }); -} - -/** - * @brief A benchmark evaluating `cuco::bloom_filter::contains_async` performance with - * `arrow_filter_policy` - */ -template -void arrow_bloom_filter_contains(nvbench::state& state, nvbench::type_list) -{ - // cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); // slightly improves peformance if - // filter block fits into a 32B sector - using size_type = std::uint32_t; - using policy_type = cuco::arrow_filter_policy; - using filter_type = - cuco::bloom_filter, cuda::thread_scope_device, policy_type>; - - auto const num_keys = state.get_int64("NumInputs"); - auto const filter_size_mb = state.get_int64("FilterSizeMB"); - - std::size_t const num_sub_filters = - (filter_size_mb * 1024 * 1024) / - (sizeof(typename filter_type::word_type) * filter_type::words_per_block); - - if (num_sub_filters > policy_type::max_filter_blocks) { - state.skip("bloom filter with arrow policy should have <= 4194304 blocks"); // skip invalid - // configurations - } - - cuda::counting_iterator keys(0); - thrust::device_vector result(num_keys, false); - - state.add_element_count(num_keys); - - filter_type filter{static_cast(num_sub_filters)}; - - state.collect_dram_throughput(); - state.collect_l2_hit_rates(); - - add_fpr_summary(state, filter); - - filter.add(keys, keys + num_keys); - - state.exec([&](nvbench::launch& launch) { - filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()}); - }); } +// Default benchmark: single layout matching `cuco::default_filter_policy` (256-bit block, +// 8-bit fingerprint, fully vertical contains) swept across the standard FilterSizeMB range. NVBENCH_BENCH_TYPES(bloom_filter_contains, NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list, - nvbench::type_list)) + nvbench::type_list, ///< Word + nvbench::enum_type_list<256>, ///< BlockBits + nvbench::enum_type_list<8>, ///< PatternBits + nvbench::enum_type_list<1>, ///< HorizontalLayout + nvbench::enum_type_list<8> ///< VerticalLayout + )) .set_name("bloom_filter_contains_unique_size") - .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) + .set_type_axes_names( + {"Key", "Word", "BlockBits", "PatternBits", "HorizontalLayout", "VerticalLayout"}) .add_int64_axis("NumInputs", {defaults::BF_N}) .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE); -NVBENCH_BENCH_TYPES(bloom_filter_contains, - NVBENCH_TYPE_AXES(nvbench::type_list, - defaults::HASH_RANGE, - nvbench::type_list, - nvbench::enum_type_list, - nvbench::type_list)) - .set_name("bloom_filter_contains_unique_hash") - .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) - .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}); - -NVBENCH_BENCH_TYPES(bloom_filter_contains, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list, - nvbench::type_list, - nvbench::enum_type_list<1, 2, 4, 8>, - nvbench::type_list)) - .set_name("bloom_filter_contains_unique_block_dim") - .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"}) - .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB}); - -NVBENCH_BENCH_TYPES(arrow_bloom_filter_contains, - NVBENCH_TYPE_AXES(nvbench::type_list, - nvbench::type_list)) - .set_name("arrow_bloom_filter_contains_unique_size") - .set_type_axes_names({"Key", "Distribution"}) - .add_int64_axis("NumInputs", {defaults::BF_N}) - .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE); +// Exhaustive sweep across block sizes and vectorization layouts. Uncomment for performance +// tuning / paper-style characterization; not run by default because the matrix is large. +// NVBENCH_BENCH_TYPES( +// bloom_filter_contains, +// NVBENCH_TYPE_AXES(nvbench::type_list, +// nvbench::type_list, ///< Word +// nvbench::enum_type_list<64, 128, 256, 512, 1024>, ///< BlockBits +// nvbench::enum_type_list<8, 16>, ///< PatternBits +// nvbench::enum_type_list<1, 2, 4, 8, 16>, ///< +// HorizontalLayout nvbench::enum_type_list<1, 2, 4, 8, 16> ///< VerticalLayout +// )) +// .set_name("bloom_filter_contains_full_sweep_u64") +// .set_type_axes_names( +// {"Key", "Word", "BlockBits", "PatternBits", "HorizontalLayout", "VerticalLayout"}) +// .add_int64_axis("NumInputs", {defaults::BF_N}) +// .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE); \ No newline at end of file diff --git a/benchmarks/bloom_filter/defaults.hpp b/benchmarks/bloom_filter/defaults.hpp index f1b192aa3..75d0c6ac6 100644 --- a/benchmarks/bloom_filter/defaults.hpp +++ b/benchmarks/bloom_filter/defaults.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,26 +16,17 @@ #pragma once -#include - #include -#include - #include namespace cuco::benchmark::defaults { -using BF_KEY = nvbench::int64_t; -using BF_HASH = cuco::xxhash_64; -using BF_WORD = nvbench::uint32_t; +using BF_KEY = nvbench::int64_t; -static constexpr auto BF_N = 1'000'000'000; -static constexpr auto BF_SIZE_MB = 2'000; -static constexpr auto BF_WORDS_PER_BLOCK = 8; +static constexpr auto BF_N = 1'000'000'000; auto const BF_SIZE_MB_RANGE_CACHE = std::vector{1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048}; -auto const BF_PATTERN_BITS_RANGE = std::vector{1, 2, 4, 6, 8, 16}; } // namespace cuco::benchmark::defaults diff --git a/benchmarks/bloom_filter/utils.hpp b/benchmarks/bloom_filter/utils.hpp index cec7e06c3..9273d8fcc 100644 --- a/benchmarks/bloom_filter/utils.hpp +++ b/benchmarks/bloom_filter/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,21 +27,6 @@ #include -NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::XXHash_64, "xxhash_64", "cuco::xxhash_64"); -NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::XXHash_32, "xxhash_32", "cuco::xxhash_32"); -NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::MurmurHash3_32, - "murmurhash3_32", - "cuco::murmurhash3_32"); -NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::MurmurHash3_x86_128, - "murmurhash3_x86_128", - "cuco::murmurhash3_x86_128"); -NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::MurmurHash3_x64_128, - "murmurhash3_x64_128", - "cuco::murmurhash3_x64_128"); -NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::identity_hash, - "identity_hash", - "cuco::identity_hash"); - namespace cuco::benchmark { template diff --git a/include/cuco/bloom_filter.cuh b/include/cuco/bloom_filter.cuh index 3f2e69c7c..c3344917d 100644 --- a/include/cuco/bloom_filter.cuh +++ b/include/cuco/bloom_filter.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,13 +36,13 @@ namespace cuco { /** - * @brief A GPU-accelerated Blocked Bloom Filter. + * @brief A GPU-accelerated Bloom filter. * * The `bloom_filter` supports two types of operations: * - Host-side "bulk" operations * - Device-side "singular" operations * - * The host-side bulk operations include add(), contains(), etc. These APIs should be used when + * The host-side bulk operations include `add()`, `contains()`, etc. These APIs should be used when * there are a large number of keys to add or lookup. For example, given a range of keys * specified by device-accessible iterators, the bulk `add` function will add all keys into * the filter. @@ -61,8 +61,8 @@ namespace cuco { template , cuda::thread_scope Scope = cuda::thread_scope_device, - class Policy = cuco::default_filter_policy, std::uint32_t, 8>, - class Allocator = cuco::cuda_allocator> + class Policy = cuco::default_filter_policy, + class Allocator = cuco::cuda_allocator> class bloom_filter { public: /** @@ -104,21 +104,17 @@ class bloom_filter { /** * @brief Constructs a statically-sized Bloom filter. * - * @note The total number of bits in the filter is determined by `words_per_block * num_blocks * - * sizeof(word_type) * CHAR_BIT`. - * * @param num_blocks Number of sub-filters or blocks * @param scope The scope in which operations will be performed * @param policy Fingerprint generation policy (see `cuco/bloom_filter_policies.cuh`) * @param alloc Allocator used for allocating device-accessible storage * @param stream CUDA stream used to initialize the filter */ - __host__ explicit constexpr bloom_filter(Extent num_blocks, - cuda_thread_scope scope = {}, - Policy const& policy = {}, - Allocator const& alloc = {}, - cuda::stream_ref stream = cuda::stream_ref{ - cudaStream_t{nullptr}}); + __host__ explicit bloom_filter(Extent num_blocks, + cuda_thread_scope scope = {}, + Policy const& policy = {}, + Allocator const& alloc = {}, + cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}); /** * @brief Erases all information from the filter. @@ -164,8 +160,10 @@ class bloom_filter { * @param stream CUDA stream used for device memory operations and kernel launches */ template - __host__ constexpr void add_async( - InputIt first, InputIt last, cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}); + __host__ constexpr void add_async(InputIt first, + InputIt last, + cuda::stream_ref stream = cuda::stream_ref{ + cudaStream_t{nullptr}}) noexcept; /** * @brief Adds keys in the range `[first, last)` if `pred` of the corresponding `stencil` returns @@ -326,7 +324,7 @@ class bloom_filter { cudaStream_t{nullptr}}) const noexcept; /** - * @brief Merge another bloom filter into this. + * @brief Merge another bloom filter into `*this`. * * @note Modifies `this` in place. * @note This function synchronizes the given stream. For asynchronous execution use @@ -346,7 +344,7 @@ class bloom_filter { cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}); /** - * @brief Asynchronously merge another bloom filter into this. + * @brief Asynchronously merge another bloom filter into `*this`. * * @note Modifies `this` in place. * @@ -365,7 +363,7 @@ class bloom_filter { cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}); /** - * @brief Intersect another bloom filter into this. + * @brief Intersect another bloom filter into `*this`. * * @note Modifies `this` in place. * @note This function synchronizes the given stream. For asynchronous execution use @@ -389,7 +387,7 @@ class bloom_filter { cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}); /** - * @brief Asynchronously intersect another bloom filter into this. + * @brief Asynchronously intersect another bloom filter into `*this`. * * @note Modifies `this` in place. * diff --git a/include/cuco/bloom_filter_policies.cuh b/include/cuco/bloom_filter_policies.cuh index 0d28b166a..f2f1c0e73 100644 --- a/include/cuco/bloom_filter_policies.cuh +++ b/include/cuco/bloom_filter_policies.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,7 @@ #pragma once -#include -#include +#include #include #include @@ -25,110 +24,52 @@ namespace cuco { /** - * @brief A policy that defines how Arrow Block-Split Bloom Filter generates and stores a key's - * fingerprint. + * @brief Sectorized Bloom filter policy with multiplicative-hashing fingerprint generation. * - * @tparam Key The type of the values to generate a fingerprint for. - * @tparam XXHash64 Custom (64 bit) XXHash hasher to generate a key's fingerprint. - * By default, cuco::xxhash_64 hasher will be used. + * Implements the Sectorized Bloom Filter (SBF) variant from "Optimizing Bloom Filters for Modern + * GPU Architectures" (arXiv:2512.15595). * + * Requires a 64-bit hash function: the result is split into upper 32 bits (block selection via + * multiply-shift) and lower 32 bits (pattern generation). + * + * @tparam Hash 64-bit hash functor. + * @tparam Word Underlying word type of a filter block. + * @tparam WordsPerBlock Words per filter block. + * @tparam PatternBits Fingerprint bits per key (paper's k). + * @tparam AddHorizontalLayout CG size for add (paper's Theta). + * @tparam AddVerticalLayout Words per thread per add step (paper's Phi). + * @tparam ContainsHorizontalLayout CG size for contains. + * @tparam ContainsVerticalLayout Words per thread per contains step. */ -template class XXHash64 = cuco::xxhash_64> -using arrow_filter_policy = detail::arrow_filter_policy; +template +using parametric_filter_policy = detail::parametric_filter_policy; /** - * @brief The default policy that defines how a Blocked Bloom Filter generates and stores a key's - * fingerprint. + * @brief Default Bloom filter policy used by `cuco::bloom_filter` when no policy is specified. * - * @note `Word` type must be an atomically updatable integral type. `WordsPerBlock` must - * be a power-of-two. + * Alias for a `parametric_filter_policy` instantiation with paper-recommended layouts on a 256-bit + * block: 8 x `uint32_t` words, 8 fingerprint bits per key, fully horizontal add (Theta=8) and fully + * vertical contains (Phi=8). * - * @tparam Hash Hash function used to generate a key's fingerprint - * @tparam Word Underlying word/segment type of a filter block - * @tparam WordsPerBlock Number of words/segments in each block + * @tparam Key The key type to generate a fingerprint for. + * @tparam XXHash64 64-bit XXHash functor template. Defaults to `cuco::xxhash_64`. */ -template -class default_filter_policy { - using impl_type = cuco::detail::default_filter_policy_impl; - - public: - using hasher = typename impl_type::hasher; ///< Type of the hash function - using hash_argument_type = typename impl_type::hash_argument_type; ///< Hash function input type - using hash_result_type = typename impl_type::hash_result_type; ///< hash function output type - using word_type = - typename impl_type::word_type; ///< Underlying word/segment type of a filter block - - static constexpr std::uint32_t words_per_block = - impl_type::words_per_block; ///< Number of words/segments in each filter block - - public: - /** - * @brief Constructs the `default_filter_policy` object. - * - * @throws Compile-time error if the specified number of words in a filter block is not a - * power-of-two or is larger than 32. If called from host: throws exception; If called from - * device: Traps the kernel. - * - * @throws If the `hash_result_type` is too narrow to generate the requested number of - * `pattern_bits`. If called from host: throws exception; If called from device: Traps the kernel. - * - * @throws If `pattern_bits` is smaller than the number of words in a filter block or larger than - * the total number of bits in a filter block. If called from host: throws exception; If called - * from device: Traps the kernel. - * - * @param pattern_bits Number of bits in a key's fingerprint - * @param hash Hash function used to generate a key's fingerprint - */ - __host__ __device__ constexpr default_filter_policy(std::uint32_t pattern_bits = words_per_block, - Hash hash = {}); - - /** - * @brief Generates the hash value for a given key. - * - * @note This function is meant as a customization point and is only used in the internals of the - * `bloom_filter(_ref)` implementation. - * - * @param key The key to hash - * - * @return The hash value of the key - */ - __device__ constexpr hash_result_type hash(hash_argument_type const& key) const; - - /** - * @brief Determines the filter block a key is added into. - * - * @note This function is meant as a customization point and is only used in the internals of the - * `bloom_filter(_ref)` implementation. - * - * @tparam Extent Size type that is used to determine the number of blocks in the filter - * - * @param hash Hash value of the key - * @param num_blocks Number of block in the filter - * - * @return The block index for the given key's hash value - */ - template - __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const; - - /** - * @brief Determines the fingerprint pattern for a word/segment within the filter block for a - * given key's hash value. - * - * @note This function is meant as a customization point and is only used in the internals of the - * `bloom_filter(_ref)` implementation. - * - * @param hash Hash value of the key - * @param word_index Target word/segment within the filter block - * - * @return The bit pattern for the word/segment in the filter block - */ - __device__ constexpr word_type word_pattern(hash_result_type hash, - std::uint32_t word_index) const; - - private: - impl_type impl_; ///< Policy implementation -}; +template class XXHash64 = cuco::xxhash_64> +using default_filter_policy = + parametric_filter_policy, std::uint32_t, 8, 8, 8, 1, 1, 8>; } // namespace cuco - -#include \ No newline at end of file diff --git a/include/cuco/bloom_filter_ref.cuh b/include/cuco/bloom_filter_ref.cuh index f58910afc..bed199919 100644 --- a/include/cuco/bloom_filter_ref.cuh +++ b/include/cuco/bloom_filter_ref.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -63,10 +63,10 @@ class bloom_filter_ref { * @param scope The scope in which operations will be performed * @param policy Fingerprint generation policy (see `cuco/bloom_filter_policies.cuh`) */ - __host__ __device__ explicit constexpr bloom_filter_ref(filter_block_type* data, - Extent num_blocks, - cuda_thread_scope scope, - Policy const& policy); + __host__ __device__ explicit bloom_filter_ref(filter_block_type* data, + Extent num_blocks, + cuda_thread_scope scope, + Policy const& policy); /** * @brief Constructs the ref object from existing storage. @@ -78,10 +78,10 @@ class bloom_filter_ref { * @param scope The scope in which operations will be performed * @param policy Fingerprint generation policy (see `cuco/bloom_filter_policies.cuh`) */ - __host__ __device__ explicit constexpr bloom_filter_ref(word_type* data, - Extent num_blocks, - cuda_thread_scope scope, - Policy const& policy); + __host__ __device__ explicit bloom_filter_ref(word_type* data, + Extent num_blocks, + cuda_thread_scope scope, + Policy const& policy); /** * @brief Device function that cooperatively erases all information from the filter. @@ -178,8 +178,10 @@ class bloom_filter_ref { * @param stream CUDA stream used for device memory operations and kernel launches */ template - __host__ constexpr void add_async( - InputIt first, InputIt last, cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}}); + __host__ constexpr void add_async(InputIt first, + InputIt last, + cuda::stream_ref stream = cuda::stream_ref{ + cudaStream_t{nullptr}}) noexcept; /** * @brief Adds keys in the range `[first, last)` if `pred` of the corresponding `stencil` returns @@ -265,10 +267,21 @@ class bloom_filter_ref { template [[nodiscard]] __device__ bool contains(CG group, ProbeKey const& key) const; - // TODO - // template - // __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin) - // const; + /** + * @brief Device function that tests if all keys in the range `[first, last)` are present in the + * filter. + * + * @tparam CG Cooperative Group type + * @tparam InputIt Device-accessible random access input key iterator + * @tparam OutputIt Device-accessible output iterator assignable from `bool` + * + * @param group The Cooperative Group this operation is executed with + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + */ + template + __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin) const; /** * @brief Tests all keys in the range `[first, last)` if their fingerprints are present in the diff --git a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh deleted file mode 100644 index 481293335..000000000 --- a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include - -#include -#include - -namespace cuco::detail { - -/** - * @brief A policy that defines how Arrow Block-Split Bloom Filter generates and stores a key's - * fingerprint. - * - * Reference: - * https://github.com/apache/arrow/blob/be1dcdb96b030639c0b56955c4c62f9d6b03f473/cpp/src/parquet/bloom_filter.cc#L219-L230 - * - * Example: - * @code{.cpp} - * template - * void bulk_insert_and_eval_arrow_policy_bloom_filter(device_vector const& positive_keys, - * device_vector const& negative_keys) - * { - * using policy_type = cuco::arrow_filter_policy; - * - * // Warn or throw if the number of filter blocks is greater than maximum used by Arrow policy. - * static_assert(NUM_FILTER_BLOCKS <= policy_type::max_filter_blocks, "NUM_FILTER_BLOCKS must be - * in range: [1, 4194304]"); - * - * // Create a bloom filter with Arrow policy - * cuco::bloom_filter, - * cuda::thread_scope_device, policy_type> filter{NUM_FILTER_BLOCKS}; - * - * // Add positive keys to the bloom filter - * filter.add(positive_keys.begin(), positive_keys.end()); - * - * auto const num_tp = positive_keys.size(); - * auto const num_tn = negative_keys.size(); - * - * // Vectors to store query results. - * thrust::device_vector true_positive_result(num_tp, false); - * thrust::device_vector true_negative_result(num_tn, false); - * - * // Query the bloom filter for the inserted keys. - * filter.contains(positive_keys.begin(), positive_keys.end(), true_positive_result.begin()); - * - * // We should see a true-positive rate of 1. - * float true_positive_rate = float(thrust::count(thrust::device, - * true_positive_result.begin(), true_positive_result.end(), true)) / float(num_tp); - * - * // Query the bloom filter for the non-inserted keys. - * filter.contains(negative_keys.begin(), negative_keys.end(), true_negative_result.begin()); - * - * // We may see a false-positive rate > 0 depending on the number of bits in the - * // filter and the number of hashes used per key. - * float false_positive_rate = float(thrust::count(thrust::device, - * true_negative_result.begin(), true_negative_result.end(), true)) / float(num_tn); - * } - * @endcode - * - * @tparam Key The type of the values to generate a fingerprint for. - * @tparam XXHash64 64-bit XXHash hasher implementation for fingerprint generation. - */ -template class XXHash64> -class arrow_filter_policy { - public: - using hasher = XXHash64; ///< 64-bit XXHash hasher for Arrow bloom filter policy - using word_type = std::uint32_t; ///< uint32_t for Arrow bloom filter policy - using key_type = Key; ///< Hash function input type - using hash_result_type = std::uint64_t; ///< hash function output type - - static constexpr uint32_t bits_set_per_block = 8; ///< hardcoded bits set per Arrow filter block - static constexpr uint32_t words_per_block = 8; ///< hardcoded words per Arrow filter block - - static constexpr std::uint32_t bytes_per_filter_block = - 32; ///< Number of bytes in one Arrow filter block - static constexpr std::uint32_t max_arrow_filter_bytes = - 128 * 1024 * 1024; ///< Max bytes in Arrow bloom filter - static constexpr std::uint32_t max_filter_blocks = - (max_arrow_filter_bytes / - bytes_per_filter_block); ///< Max sub-filter blocks allowed in Arrow bloom filter - - public: - /** - * @brief Constructs the `arrow_filter_policy` object. - * - * @note The number of filter blocks with Arrow policy must be in the - * range of [1, 4194304]. If the bloom filter is constructed with a larger - * number of blocks, only the first 4194304 (128MB) blocks will be used. - * - * @param hash Hash function used to generate a key's fingerprint - */ - __host__ __device__ constexpr arrow_filter_policy(hasher hash = {}) : hash_{hash} {} - - /** - * @brief Generates the hash value for a given key. - * - * @param key The key to hash - * - * @return The hash value of the key - */ - __device__ constexpr hash_result_type hash(key_type const& key) const { return hash_(key); } - - /** - * @brief Determines the filter block a key is added into. - * - * @note The number of filter blocks with Arrow policy must be in the - * range of [1, 4194304]. Passing a larger `num_blocks` will still - * upperbound the number of blocks used to the mentioned range. - * - * @tparam Extent Size type that is used to determine the number of blocks in the filter - * - * @param hash Hash value of the key - * @param num_blocks Number of block in the filter - * - * @return The block index for the given key's hash value - */ - template - __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const - { - constexpr auto hash_bits = cuda::std::numeric_limits::digits; - // TODO: assert if num_blocks > max_filter_blocks - auto const max_blocks = cuda::std::min(num_blocks, max_filter_blocks); - // Make sure we are only contained withing the `max_filter_blocks` blocks - return static_cast(((hash >> hash_bits) * max_blocks) >> hash_bits) % max_blocks; - } - - /** - * @brief Determines the fingerprint pattern for a word/segment within the filter block for a - * given key's hash value. - * - * @param hash Hash value of the key - * @param word_index Target word/segment within the filter block - * - * @return The bit pattern for the word/segment in the filter block - */ - __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const - { - word_type const key = static_cast(hash); - std::uint32_t salt{}; - - // Basically a switch (word_index) { case 0-7 ... } - // First split: 0..3 versus 4..7. - if (word_index < 4) { - // For indices 0..3, further split into 0..1 and 2..3. - if (word_index < 2) { - // word_index is 0 or 1. - salt = (word_index == 0) ? 0x47b6137bU : 0x44974d91U; - } else { - // word_index is 2 or 3. - salt = (word_index == 2) ? 0x8824ad5bU : 0xa2b7289dU; - } - } else { - // For indices 4..7, further split into 4..5 and 6..7. - if (word_index < 6) { - // word_index is 4 or 5. - salt = (word_index == 4) ? 0x705495c7U : 0x2df1424bU; - } else { - // word_index is 6 or 7. - salt = (word_index == 6) ? 0x9efc4947U : 0x5c6bfb31U; - } - } - return word_type{1} << ((key * salt) >> 27); - } - - private: - hasher hash_; -}; - -} // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/bloom_filter/bloom_filter.inl b/include/cuco/detail/bloom_filter/bloom_filter.inl index f3306b2fb..356e77b93 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter.inl +++ b/include/cuco/detail/bloom_filter/bloom_filter.inl @@ -27,15 +27,15 @@ namespace cuco { template -__host__ constexpr bloom_filter::bloom_filter( - Extent num_blocks, - cuda_thread_scope, - Policy const& policy, - Allocator const& alloc, - cuda::stream_ref stream) +__host__ bloom_filter::bloom_filter(Extent num_blocks, + cuda_thread_scope, + Policy const& policy, + Allocator const& alloc, + cuda::stream_ref stream) : allocator_{alloc}, - data_{allocator_.allocate(num_blocks, stream), - detail::custom_deleter{num_blocks, allocator_, stream}}, + data_{allocator_.allocate(static_cast(num_blocks), stream), + detail::custom_deleter{ + static_cast(num_blocks), allocator_, stream}}, ref_{data_.get(), num_blocks, {}, policy} { this->clear_async(stream); @@ -66,7 +66,7 @@ __host__ constexpr void bloom_filter::add template template __host__ constexpr void bloom_filter::add_async( - InputIt first, InputIt last, cuda::stream_ref stream) + InputIt first, InputIt last, cuda::stream_ref stream) noexcept { ref_.add_async(first, last, stream); } diff --git a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh index 8e2b49453..9b543900d 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh +++ b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -23,6 +24,7 @@ #include #include #include +#include #include #include @@ -44,204 +46,6 @@ namespace cuco::detail { -/** - * @brief Device functor for adding a single key to the bloom filter - * - * This functor is used with cuda::static_for to iterate over all words in a filter block - * and set the appropriate bits for a given key's hash value. Each iteration processes - * one word in the block using atomic operations to ensure thread safety. - * - * @tparam HashValue Type of the hash value (typically uint64_t) - * @tparam BlockIndex Type of the block index (typically size_t or uint32_t) - * @tparam Policy Filter policy type that provides word pattern generation - * @tparam WordType Underlying word type of the filter (typically uint64_t) - * @tparam Scope CUDA thread scope for atomic operations - */ -template -struct add_impl_functor { - HashValue hash_value; ///< Hash value of the key being added - BlockIndex block_index; ///< Index of the filter block to modify - Policy policy_; ///< Filter policy for generating bit patterns - WordType* words_; ///< Pointer to the filter's word array - size_t words_per_block; ///< Number of words in each filter block - - /** - * @brief Processes one word in the filter block for key insertion - * - * @tparam I Type of the integral constant passed by cuda::static_for - * @param i Integral constant representing the word index within the block - */ - template - __device__ void operator()(I i) const - { - auto const word = policy_.word_pattern(hash_value, i()); - if (word != 0) { - auto atom_word = - cuda::atomic_ref{*(words_ + (block_index * words_per_block + i()))}; - atom_word.fetch_or(word, cuda::memory_order_relaxed); - } - } -}; - -/** - * @brief Device functor for cooperative group-based batch key insertion - * - * This functor is used with cuda::static_for to process multiple keys in parallel - * within a cooperative group. Each thread in the group processes a different key - * using shuffle operations to share hash values and block indices across threads. - * - * @tparam CG Cooperative group type (e.g., thread_block_tile) - * @tparam HashValue Type of the hash value - * @tparam BlockIndex Type of the block index - * @tparam BloomFilterImpl Type of the bloom filter implementation - */ -template -struct add_group_functor { - CG group; ///< Cooperative group for parallel processing - HashValue hash_value; ///< Hash value of the current thread's key - BlockIndex block_index; ///< Block index of the current thread's key - size_t i; ///< Starting index in the key batch - size_t num_keys; ///< Total number of keys to process - size_t num_threads; ///< Number of threads in the group - BloomFilterImpl* self; ///< Pointer to the bloom filter implementation - - /** - * @brief Processes one thread's key in the cooperative group batch insertion - * - * @tparam J Type of the integral constant passed by cuda::static_for - * @param j Integral constant representing the thread index within the group - */ - template - __device__ void operator()(J j) const - { - if ((j() < num_threads) and (i + j() < num_keys)) { - self->add_impl(group, group.shfl(hash_value, j()), group.shfl(block_index, j())); - } - } -}; - -/** - * @brief Device functor for worker group-based batch key insertion - * - * This functor is used with cuda::static_for to process multiple keys in parallel - * within a worker group (subdivision of a larger cooperative group). Similar to - * add_group_functor but operates on a smaller worker group with offset handling - * for processing different portions of the key batch. - * - * @tparam WorkerGroup Worker group type (subdivision of cooperative group) - * @tparam HashValue Type of the hash value - * @tparam BlockIndex Type of the block index - * @tparam BloomFilterImpl Type of the bloom filter implementation - */ -template -struct add_worker_group_functor { - WorkerGroup worker_group; ///< Worker group (subdivision of cooperative group) - HashValue hash_value; ///< Hash value of the current thread's key - BlockIndex block_index; ///< Block index of the current thread's key - size_t i; ///< Starting index in the key batch - size_t worker_offset; ///< Offset for this worker group within the batch - size_t num_keys; ///< Total number of keys to process - size_t worker_num_threads; ///< Number of threads in the worker group - BloomFilterImpl* self; ///< Pointer to the bloom filter implementation - - /** - * @brief Processes one thread's key in the worker group batch insertion - * - * @tparam J Type of the integral constant passed by cuda::static_for - * @param j Integral constant representing the thread index within the worker group - */ - template - __device__ void operator()(J j) const - { - if ((j() < worker_num_threads) and (i + worker_offset + j() < num_keys)) { - self->add_impl( - worker_group, worker_group.shfl(hash_value, j()), worker_group.shfl(block_index, j())); - } - } -}; - -/** - * @brief Device functor for cooperative group-based single key insertion - * - * This functor is used with cuda::static_for to add a single key to the bloom filter - * using a cooperative group. Each thread in the group processes different words in - * the filter block based on thread rank and stride pattern. Used when the group size - * doesn't match the number of words per block. - * - * @tparam HashValue Type of the hash value - * @tparam BlockIndex Type of the block index - * @tparam WordType Underlying word type of the filter - * @tparam Scope CUDA thread scope for atomic operations - * @tparam Policy Filter policy type that provides word pattern generation - */ -template -struct add_impl_group_functor { - HashValue hash_value; ///< Hash value of the key being added - BlockIndex block_index; ///< Index of the filter block to modify - WordType* words_; ///< Pointer to the filter's word array - size_t words_per_block; ///< Number of words in each filter block - size_t rank; ///< Thread rank within the cooperative group - size_t num_threads; ///< Number of threads in the cooperative group - Policy policy_; ///< Filter policy for generating bit patterns - - /** - * @brief Processes one word in the filter block using cooperative group stride pattern - * - * @tparam I Type of the integral constant passed by cuda::static_for - * @param i Integral constant representing the word index within the block - */ - template - __device__ void operator()(I i) const - { - if (i() >= rank && (i() - rank) % num_threads == 0) { - auto atom_word = - cuda::atomic_ref{*(words_ + (block_index * words_per_block + i()))}; - atom_word.fetch_or(policy_.word_pattern(hash_value, i()), cuda::memory_order_relaxed); - } - } -}; - -/** - * @brief Device functor for checking if a key exists in the bloom filter - * - * This functor is used with cuda::static_for to iterate over all words in a filter block - * and check if the expected bit patterns for a given key's hash value are present. - * If any expected bit is missing, the result is set to false, indicating the key - * is definitely not in the set. - * - * @tparam HashValue Type of the hash value - * @tparam StoredPattern Type of the stored pattern array (typically array of WordType) - * @tparam Policy Filter policy type that provides word pattern generation - */ -template -struct contains_functor { - HashValue hash_value; ///< Hash value of the key being queried - StoredPattern stored_pattern; ///< Array of stored bit patterns from the filter block - Policy policy_; ///< Filter policy for generating expected bit patterns - bool* result; ///< Pointer to result flag (set to false if key not found) - - /** - * @brief Checks one word in the filter block for the expected bit pattern - * - * @tparam I Type of the integral constant passed by cuda::static_for - * @param i Integral constant representing the word index within the block - */ - template - __device__ void operator()(I i) const - { - auto const expected_pattern = policy_.word_pattern(hash_value, i()); - if ((stored_pattern[i()] & expected_pattern) != expected_pattern) { *result = false; } - } -}; - template class bloom_filter_impl { public: @@ -250,26 +54,47 @@ class bloom_filter_impl { using size_type = typename extent_type::value_type; using policy_type = Policy; using word_type = typename policy_type::word_type; + static_assert(sizeof(word_type) == 4 || sizeof(word_type) == 8, + "word_type must be 4 or 8 bytes wide for atomicOr"); + // atomicOr overloads resolve on canonical 32- and 64-bit unsigned integer types. + // Normalize by size so any policy-provided word_type (uint32_t, uint64_t, unsigned long, ...) + // resolves to a matching overload via the reinterpret_cast in atomic_or(). + using atomic_word_type = + cuda::std::conditional_t; + + // Implementation-tuning knobs. Not part of the public API; reached via + // `bloom_filter_impl::tuning::use_*` from internal kernels. Defaults reflect the ablation + // measurements from arXiv:2512.15595; flip in source for tuning experiments. + struct tuning { + static constexpr bool use_invoke_one = +#if defined(CUCO_HAS_CG_INVOKE_ONE) + true; +#else + false; // cg::invoke_one_broadcast requires CTK >= 12.1 +#endif + static constexpr bool use_early_exit = false; + static constexpr bool use_cub_kernels = true; + static constexpr bool use_warp_cooperative_add_kernel = true; + static constexpr bool use_warp_cooperative_contains_kernel = true; + static constexpr bool use_work_stealing_add_kernel = false; + static constexpr bool use_work_stealing_contains_kernel = false; + static constexpr bool use_cuda_atomic_ref = false; + }; static constexpr auto thread_scope = Scope; static constexpr auto words_per_block = policy_type::words_per_block; - __host__ __device__ static constexpr size_t max_vec_bytes() noexcept - { - constexpr auto word_bytes = sizeof(word_type); - constexpr auto block_bytes = word_bytes * words_per_block; - return cuda::std::min(cuda::std::max(word_bytes, 32ul), - block_bytes); // aiming for 2xLDG128 -> 1 sector per thread - } - - struct alignas(max_vec_bytes()) filter_block_type { - private: - word_type data_[words_per_block]; - }; + static constexpr auto add_vertical_layout = policy_type::add_vertical_layout; + static constexpr auto add_horizontal_layout = policy_type::add_horizontal_layout; + static constexpr auto contains_vertical_layout = policy_type::contains_vertical_layout; + static constexpr auto contains_horizontal_layout = policy_type::contains_horizontal_layout; + static constexpr auto add_loop_count = + words_per_block / (add_vertical_layout * add_horizontal_layout); + static constexpr auto contains_loop_count = + words_per_block / (contains_vertical_layout * contains_horizontal_layout); static_assert(cuda::std::has_single_bit(words_per_block) and words_per_block <= 32, "Number of words per block must be a power-of-two and less than or equal to 32"); - static_assert( cuda::std::is_constructible_v, word_type&> && cuda::std::is_invocable_r_v, "Invalid word type"); - __host__ __device__ explicit constexpr bloom_filter_impl(filter_block_type* filter, - Extent num_blocks, - cuda_thread_scope, - Policy policy) noexcept + __host__ __device__ static constexpr size_t alignment() noexcept + { + // Maximum alignment is 32 bytes which is equivalent to one sector + return cuda::std::min( + static_cast(32), + static_cast(cuda::std::max(add_vertical_layout, contains_vertical_layout) * + sizeof(word_type))); + } + + struct filter_block_type { + private: + alignas(alignment()) word_type data_[words_per_block]; + }; + + __host__ __device__ explicit bloom_filter_impl(filter_block_type* filter, + Extent num_blocks, + cuda_thread_scope, + Policy policy) : words_{reinterpret_cast(filter)}, num_blocks_{num_blocks}, policy_{policy} { + NV_IF_TARGET(NV_IS_HOST, (l2_cache_size_ = static_cast(cuco::detail::l2_cache_size());)) } - __host__ __device__ explicit constexpr bloom_filter_impl(word_type* filter, - Extent num_blocks, - cuda_thread_scope, - Policy policy) noexcept + __host__ __device__ explicit bloom_filter_impl(word_type* filter, + Extent num_blocks, + cuda_thread_scope, + Policy policy) : words_{filter}, num_blocks_{num_blocks}, policy_{policy} { + NV_IF_TARGET(NV_IS_HOST, (l2_cache_size_ = static_cast(cuco::detail::l2_cache_size());)) } template __device__ constexpr void clear(CG group) { - for (int i = group.thread_rank(); i < num_blocks_ * words_per_block; i += group.size()) { + // TODO optimize this + for (int i = group.thread_rank(); i < static_cast(num_blocks_) * words_per_block; + i += group.size()) { words_[i] = 0; } } @@ -317,151 +160,218 @@ class bloom_filter_impl { { CUCO_CUDA_TRY(cub::DeviceFor::ForEachN( words_, - num_blocks_ * words_per_block, + static_cast(num_blocks_) * words_per_block, [] __device__(word_type & word) { word = 0; }, stream.get())); } - template - __device__ void add(ProbeKey const& key) + __host__ constexpr void merge(bloom_filter_impl const& other, + cuda::stream_ref stream) { - auto const hash_value = policy_.hash(key); - this->add_impl(hash_value, policy_.block_index(hash_value, num_blocks_)); + this->merge_async(other, stream); +#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) + stream.sync(); +#else + stream.wait(); +#endif } - template - __device__ void add(InputIt first, InputIt last) + __host__ constexpr void merge_async(bloom_filter_impl const& other, + cuda::stream_ref stream) { - auto const num_keys = cuco::detail::distance(first, last); - for (decltype(num_keys) i = 0; i < num_keys; ++i) { - auto const hash_value = policy_.hash(*(first + i)); - this->add_impl(hash_value, policy_.block_index(hash_value, num_blocks_)); - } + CUCO_EXPECTS(this->block_extent() == other.block_extent(), + "mismatching num_blocks in merge_async"); + CUCO_CUDA_TRY(cub::DeviceTransform::Transform( + cuda::std::tuple{this->data(), other.data()}, + this->data(), + this->block_extent() * words_per_block, + [] __device__(word_type a, word_type b) { return a | b; }, + stream.get())); } - template - __device__ void add_impl(HashValue const& hash_value, BlockIndex block_index) + __host__ constexpr void intersect(bloom_filter_impl const& other, + cuda::stream_ref stream) { - add_impl_functor functor{ - hash_value, block_index, policy_, words_, words_per_block}; - cuda::static_for(functor); + this->intersect_async(other, stream); +#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) + stream.sync(); +#else + stream.wait(); +#endif } - template - __device__ void add(CG group, ProbeKey const& key) + __host__ constexpr void intersect_async( + bloom_filter_impl const& other, cuda::stream_ref stream) + { + CUCO_EXPECTS(this->block_extent() == other.block_extent(), + "mismatching num_blocks in intersect_async"); + CUCO_CUDA_TRY(cub::DeviceTransform::Transform( + cuda::std::tuple{this->data(), other.data()}, + this->data(), + this->block_extent() * words_per_block, + [] __device__(word_type a, word_type b) { return a & b; }, + stream.get())); + } + + [[nodiscard]] __host__ __device__ constexpr word_type* data() noexcept { return words_; } + + [[nodiscard]] __host__ __device__ constexpr word_type const* data() const noexcept { - constexpr auto num_threads = tile_size_v; - constexpr auto optimal_num_threads = add_optimal_cg_size(); - constexpr auto worker_num_threads = - (num_threads < optimal_num_threads) ? num_threads : optimal_num_threads; + return words_; + } - // If single thread is optimal, use scalar add - if constexpr (worker_num_threads == 1) { - this->add(key); + [[nodiscard]] __host__ __device__ constexpr extent_type block_extent() const noexcept + { + return num_blocks_; + } + + template + __device__ void add(BuildKey build_key) + { + auto const [upper_hash, lower_hash] = policy_.split_hash(build_key); + auto const block_index = policy_.block_index(upper_hash, num_blocks_); + + if constexpr (add_horizontal_layout == 1) { + add_pattern(block_index, lower_hash); } else { - auto const hash_value = policy_.hash(key); - this->add_impl(hash_value, policy_.block_index(hash_value, num_blocks_)); +#pragma unroll + for (uint32_t thread_index = 0; thread_index < add_horizontal_layout; ++thread_index) { + add_patterns(block_index, lower_hash, thread_index); + } } } - template - __device__ void add(CG group, InputIt first, InputIt last) + template + __device__ void add(CG group, BuildKey build_key) { namespace cg = cooperative_groups; - constexpr auto num_threads = tile_size_v; - constexpr auto optimal_num_threads = add_optimal_cg_size(); - constexpr auto worker_num_threads = - (num_threads < optimal_num_threads) ? num_threads : optimal_num_threads; - - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } + if constexpr (add_horizontal_layout == 1 || tile_size_v != add_horizontal_layout) { + // Tile size doesn't match the layout. Pick one lane to do the scalar (layout-agnostic) + // insert; the rest wait at the implicit sync. +#if defined(CUCO_HAS_CG_INVOKE_ONE) + cg::invoke_one(group, [&] __device__() { this->template add(build_key); }); +#else + if (group.thread_rank() == 0) { this->template add(build_key); } + group.sync(); +#endif + } else { + auto const [upper_hash, lower_hash, block_index] = [&] __device__ { +#if defined(CUCO_HAS_CG_INVOKE_ONE) + if constexpr (tuning::use_invoke_one) { + return cg::invoke_one_broadcast(group, [&] __device__() { + auto const sh = policy_.split_hash(build_key); + return cuda::std::make_tuple( + sh.first, sh.second, policy_.block_index(sh.first, num_blocks_)); + }); + } else +#endif + { + auto const sh = policy_.split_hash(build_key); + return cuda::std::make_tuple( + sh.first, sh.second, policy_.block_index(sh.first, num_blocks_)); + } + }(); - auto const rank = group.thread_rank(); + add_patterns(block_index, lower_hash, group.thread_rank()); + } + } - // If single thread is optimal, use scalar add - if constexpr (worker_num_threads == 1) { - for (auto i = rank; i < num_keys; i += num_threads) { - typename cuda::std::iterator_traits::value_type const& insert_element{ - *(first + i)}; - this->add(insert_element); - } - } else if constexpr (num_threads == worker_num_threads) { // given CG is optimal CG - typename policy_type::hash_result_type hash_value; - size_type block_index; - - auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads); - for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) { - if (i + rank < num_keys) { - typename cuda::std::iterator_traits::value_type const& insert_element{ - *(first + i + rank)}; - hash_value = policy_.hash(insert_element); - block_index = policy_.block_index(hash_value, num_blocks_); - } + template + __device__ void add_coop(CG group, BuildKey build_key) + { + constexpr auto num_threads = tile_size_v; - add_group_functor - functor{group, - hash_value, - block_index, - static_cast(i), - static_cast(num_keys), - static_cast(num_threads), - this}; - cuda::static_for(functor); - } - } else { // subdivide given CG into multiple optimal CGs - typename policy_type::hash_result_type hash_value; - size_type block_index; + auto const [upper_hash, lower_hash] = policy_.split_hash(build_key); + auto const block_index = policy_.block_index(upper_hash, num_blocks_); - auto const worker_group = cg::tiled_partition(group); - auto const worker_offset = worker_num_threads * worker_group.meta_group_rank(); +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + add_patterns( + group.shfl(block_index, i), group.shfl(lower_hash, i), group.thread_rank()); + } + } - auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads); + template + __device__ void add_coop(CG group, InputIt first, Index idx, bool is_valid) + { + constexpr auto num_threads = tile_size_v; - for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) { - if (i + rank < num_keys) { - typename cuda::std::iterator_traits::value_type const& key{*(first + i + rank)}; - hash_value = policy_.hash(key); - block_index = policy_.block_index(hash_value, num_blocks_); - } + uint32_t upper_hash = 0; + uint32_t lower_hash = 0; + size_type block_index = 0; + if (is_valid) { + auto const& key = *(first + idx); + auto const sh = policy_.split_hash(key); + upper_hash = sh.first; + lower_hash = sh.second; + block_index = policy_.block_index(upper_hash, num_blocks_); + } - add_worker_group_functor - functor{worker_group, - hash_value, - block_index, - static_cast(i), - static_cast(worker_offset), - static_cast(num_keys), - static_cast(worker_num_threads), - this}; - cuda::static_for(functor); +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + if (group.shfl(is_valid, i)) { + add_patterns( + group.shfl(block_index, i), group.shfl(lower_hash, i), group.thread_rank()); } } } - template - __device__ void add_impl(CG group, HashValue const& hash_value, BlockIndex block_index) + template + __device__ void add(CG group, InputIt first, InputIt last) { - constexpr auto num_threads = tile_size_v; + auto num_keys = cuco::detail::distance(first, last); + if constexpr (tile_size_v == add_horizontal_layout && add_horizontal_layout > 1) { + auto constexpr num_threads = static_cast(tile_size_v); + for (decltype(num_keys) batch = 0; batch < num_keys; batch += num_threads) { + auto const idx = batch + static_cast(group.thread_rank()); + auto const is_valid = idx < num_keys; + this->template add_coop(group, first, idx, is_valid); + } + } else { + auto const stride = static_cast(tile_size_v); + for (auto i = static_cast(group.thread_rank()); i < num_keys; + i += stride) { + this->add(*(first + i)); + } + } + } - auto const rank = group.thread_rank(); + template + __host__ void add_async(InputIt first, InputIt last, cuda::stream_ref stream) noexcept + { + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } - if constexpr (num_threads == words_per_block) { - auto atom_word = cuda::atomic_ref{ - *(words_ + (block_index * words_per_block + rank))}; - atom_word.fetch_or(policy_.word_pattern(hash_value, rank), cuda::memory_order_relaxed); + auto constexpr block_size = 256; + auto constexpr cg_size = static_cast(add_horizontal_layout); + auto const grid_size = tuning::use_warp_cooperative_add_kernel + ? cuco::detail::int_div_ceil(num_keys, block_size) + : cuco::detail::int_div_ceil(num_keys * cg_size, block_size); + auto const filter_size = static_cast(static_cast(num_blocks_)) * + words_per_block * sizeof(word_type); + + if (2 * filter_size < l2_cache_size_) { + if constexpr (tuning::use_work_stealing_add_kernel) { + detail::bloom_filter_ns::add_work_stealing_n + <<>>(first, num_keys, *this); + } else { + detail::bloom_filter_ns::add_n + <<>>(first, num_keys, *this); + } } else { - add_impl_group_functor functor{ - hash_value, block_index, words_, words_per_block, rank, num_threads, policy_}; - cuda::static_for(functor); + if constexpr (tuning::use_work_stealing_add_kernel) { + detail::bloom_filter_ns::add_work_stealing_n + <<>>(first, num_keys, *this); + } else { + detail::bloom_filter_ns::add_n + <<>>(first, num_keys, *this); + } } } template - __host__ constexpr void add(InputIt first, InputIt last, cuda::stream_ref stream) + __host__ void add(InputIt first, InputIt last, cuda::stream_ref stream) noexcept { this->add_async(first, last, stream); #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) @@ -471,125 +381,165 @@ class bloom_filter_impl { #endif } - template - __host__ constexpr void add_async(InputIt first, InputIt last, cuda::stream_ref stream) + template + __device__ bool contains(ProbeKey probe_key) const { - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } + auto const [upper_hash, lower_hash] = policy_.split_hash(probe_key); + auto const block_index = policy_.block_index(upper_hash, num_blocks_); - if constexpr (words_per_block == 1) { - CUCO_CUDA_TRY(cub::DeviceFor::ForEachCopyN( - first, - num_keys, - [*this] __device__(key_type const key) mutable { this->add(key); }, - stream.get())); + if constexpr (contains_horizontal_layout == 1) { + return compare_pattern<0>(block_index, lower_hash); } else { - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } - - auto constexpr block_size = cuco::detail::default_block_size(); - void const* kernel = reinterpret_cast( - detail::bloom_filter_ns::add); - auto const grid_size = cuco::detail::max_occupancy_grid_size(block_size, kernel); - - detail::bloom_filter_ns::add - <<>>(first, num_keys, *this); + bool result = true; +#pragma unroll + for (uint32_t thread_index = 0; thread_index < contains_horizontal_layout; ++thread_index) { + result = result && compare_patterns<0>(block_index, lower_hash, thread_index); + } + return result; } } - template - __host__ constexpr void add_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream) + template + __device__ bool contains(CG group, ProbeKey probe_key) const { - this->add_if_async(first, last, stencil, pred, stream); -#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) - stream.sync(); + namespace cg = cooperative_groups; + + if constexpr (contains_horizontal_layout == 1 || + tile_size_v != contains_horizontal_layout) { +#if defined(CUCO_HAS_CG_INVOKE_ONE) + return cg::invoke_one_broadcast( + group, [&] __device__() -> bool { return this->contains(probe_key); }); #else - stream.wait(); + // All lanes recompute the same deterministic scalar query. + return this->contains(probe_key); #endif + } else { + auto const [upper_hash, lower_hash, block_index] = [&] __device__ { +#if defined(CUCO_HAS_CG_INVOKE_ONE) + if constexpr (tuning::use_invoke_one) { + return cg::invoke_one_broadcast(group, [&] __device__() { + auto const sh = policy_.split_hash(probe_key); + return cuda::std::make_tuple( + sh.first, sh.second, policy_.block_index(sh.first, num_blocks_)); + }); + } else +#endif + { + auto const sh = policy_.split_hash(probe_key); + return cuda::std::make_tuple( + sh.first, sh.second, policy_.block_index(sh.first, num_blocks_)); + } + }(); + + return group.all(compare_patterns<0>(block_index, lower_hash, group.thread_rank())); + } } - template - __host__ constexpr void add_if_async(InputIt first, - InputIt last, - StencilIt stencil, - Predicate pred, - cuda::stream_ref stream) noexcept + template + __device__ bool contains_coop(CG group, ProbeKey probe_key) const { - auto const num_keys = cuco::detail::distance(first, last); - if (num_keys == 0) { return; } + constexpr auto num_threads = tile_size_v; - auto constexpr cg_size = add_optimal_cg_size(); - auto constexpr block_size = cuco::detail::default_block_size(); - auto const grid_size = - cuco::detail::grid_size(num_keys, cg_size, cuco::detail::default_stride(), block_size); + auto const [upper_hash, lower_hash] = policy_.split_hash(probe_key); + auto const block_index = policy_.block_index(upper_hash, num_blocks_); + bool result_out = false; - detail::bloom_filter_ns::add_if_n - <<>>(first, num_keys, stencil, pred, *this); +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + auto const result = group.all(compare_patterns<0>( + group.shfl(block_index, i), group.shfl(lower_hash, i), group.thread_rank())); + if (i == group.thread_rank()) { result_out = result; } + } + return result_out; } - template - [[nodiscard]] __device__ bool contains(ProbeKey const& key) const + template + __device__ bool contains_coop(CG group, InputIt first, Index idx, bool is_valid) const { - auto const hash_value = policy_.hash(key); - - auto const stored_pattern = this->vec_load_words( - policy_.block_index(hash_value, num_blocks_) * words_per_block); + constexpr auto num_threads = tile_size_v; - bool result = true; - contains_functor functor{ - hash_value, stored_pattern, policy_, &result}; - cuda::static_for(functor); - if (!result) { return false; } + uint32_t upper_hash = 0; + uint32_t lower_hash = 0; + size_type block_index = 0; + if (is_valid) { + auto const& key = *(first + idx); + auto const sh = policy_.split_hash(key); + upper_hash = sh.first; + lower_hash = sh.second; + block_index = policy_.block_index(upper_hash, num_blocks_); + } - return true; + bool result_out = false; +#pragma unroll num_threads + for (int i = 0; i < num_threads; ++i) { + if (group.shfl(is_valid, i)) { + auto const result = group.all(compare_patterns<0>( + group.shfl(block_index, i), group.shfl(lower_hash, i), group.thread_rank())); + if (i == group.thread_rank()) { result_out = result; } + } + } + return result_out; } - template - [[nodiscard]] __device__ bool contains(CG group, ProbeKey const& key) const + template + __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin) const { - constexpr auto num_threads = tile_size_v; - constexpr auto optimal_num_threads = contains_optimal_cg_size(); - constexpr auto words_per_thread = words_per_block / optimal_num_threads; - - // If single thread is optimal, use scalar contains - if constexpr (num_threads == 1 or optimal_num_threads == 1) { - return this->contains(key); + auto num_keys = cuco::detail::distance(first, last); + if constexpr (tile_size_v == contains_horizontal_layout && contains_horizontal_layout > 1) { + auto constexpr num_threads = static_cast(tile_size_v); + for (decltype(num_keys) batch = 0; batch < num_keys; batch += num_threads) { + auto const idx = batch + static_cast(group.thread_rank()); + auto const is_valid = idx < num_keys; + auto const result = this->contains_coop(group, first, idx, is_valid); + if (is_valid) { *(output_begin + idx) = result; } + } } else { - auto const rank = group.thread_rank(); - auto const hash_value = policy_.hash(key); - bool success = true; + auto const stride = static_cast(tile_size_v); + for (auto i = static_cast(group.thread_rank()); i < num_keys; + i += stride) { + *(output_begin + i) = this->contains(*(first + i)); + } + } + } -// Use pragma unroll instead of cuda::static_for to avoid CUDA 12.0 compatibility issues -#pragma unroll - for (size_type i = 0; i < optimal_num_threads; ++i) { - if (i >= rank && (i - rank) % num_threads == 0) { - auto const thread_offset = i * words_per_thread; - auto const stored_pattern = this->vec_load_words( - policy_.block_index(hash_value, num_blocks_) * words_per_block + thread_offset); + template + __host__ void contains_async(InputIt first, + InputIt last, + OutputIt output_begin, + cuda::stream_ref stream) const noexcept + { + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } -#pragma unroll - for (size_type j = 0; j < words_per_thread; ++j) { - auto const expected_pattern = policy_.word_pattern(hash_value, thread_offset + j); - if ((stored_pattern[j] & expected_pattern) != expected_pattern) { success = false; } - } - } + if constexpr (tuning::use_cub_kernels and ((words_per_block / contains_vertical_layout) == 1)) { + cub::DeviceTransform::Transform( + first, + output_begin, + num_keys, + [*this] __device__(auto const& key) { return this->contains(key); }, + stream.get()); + } else { + auto constexpr block_size = 256; + auto constexpr cg_size = static_cast(contains_horizontal_layout); + auto const grid_size = tuning::use_warp_cooperative_contains_kernel + ? cuco::detail::int_div_ceil(num_keys, block_size) + : cuco::detail::int_div_ceil(num_keys * cg_size, block_size); + + if constexpr (tuning::use_work_stealing_contains_kernel) { + detail::bloom_filter_ns::contains_work_stealing_n + <<>>(first, num_keys, output_begin, *this); + } else { + detail::bloom_filter_ns::contains_n + <<>>(first, num_keys, output_begin, *this); } - - return group.all(success); } } - // TODO - // template - // __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin) - // const; - template __host__ void contains(InputIt first, InputIt last, OutputIt output_begin, - cuda::stream_ref stream) const + cuda::stream_ref stream) const noexcept { this->contains_async(first, last, output_begin, stream); #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) @@ -599,25 +549,41 @@ class bloom_filter_impl { #endif } - template - __host__ void contains_async(InputIt first, - InputIt last, - OutputIt output_begin, - cuda::stream_ref stream) const noexcept + template + __host__ void add_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda::stream_ref stream) noexcept { - auto const always_true = cuda::constant_iterator{true}; - this->contains_if_async(first, last, always_true, cuda::std::identity{}, output_begin, stream); + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto constexpr block_size = 256; + auto constexpr cg_size = static_cast(add_horizontal_layout); + auto const grid_size = tuning::use_warp_cooperative_add_kernel + ? cuco::detail::int_div_ceil(num_keys, block_size) + : cuco::detail::int_div_ceil(num_keys * cg_size, block_size); + auto const filter_size = static_cast(static_cast(num_blocks_)) * + words_per_block * sizeof(word_type); + + if (2 * filter_size < l2_cache_size_) { + detail::bloom_filter_ns::add_if_n + <<>>(first, num_keys, stencil, pred, *this); + } else { + detail::bloom_filter_ns::add_if_n + <<>>(first, num_keys, stencil, pred, *this); + } } - template - __host__ void contains_if(InputIt first, - InputIt last, - StencilIt stencil, - Predicate pred, - OutputIt output_begin, - cuda::stream_ref stream) const + template + __host__ void add_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda::stream_ref stream) noexcept { - this->contains_if_async(first, last, stencil, pred, output_begin, stream); + this->add_if_async(first, last, stencil, pred, stream); #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) stream.sync(); #else @@ -636,20 +602,26 @@ class bloom_filter_impl { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } - auto constexpr cg_size = contains_optimal_cg_size(); - auto constexpr block_size = cuco::detail::default_block_size(); - auto const grid_size = - cuco::detail::grid_size(num_keys, cg_size, cuco::detail::default_stride(), block_size); + auto constexpr block_size = 256; + auto constexpr cg_size = static_cast(contains_horizontal_layout); + auto const grid_size = tuning::use_warp_cooperative_contains_kernel + ? cuco::detail::int_div_ceil(num_keys, block_size) + : cuco::detail::int_div_ceil(num_keys * cg_size, block_size); detail::bloom_filter_ns::contains_if_n <<>>( first, num_keys, stencil, pred, output_begin, *this); } - __host__ constexpr void merge(bloom_filter_impl const& other, - cuda::stream_ref stream) + template + __host__ void contains_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda::stream_ref stream) const noexcept { - this->merge_async(other, stream); + this->contains_if_async(first, last, stencil, pred, output_begin, stream); #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) stream.sync(); #else @@ -657,86 +629,174 @@ class bloom_filter_impl { #endif } - __host__ constexpr void merge_async(bloom_filter_impl const& other, - cuda::stream_ref stream) - { - CUCO_EXPECTS(this->block_extent() == other.block_extent(), - "mismatching num_blocks in merge_async"); - CUCO_CUDA_TRY(cub::DeviceTransform::Transform( - cuda::std::tuple{this->data(), other.data()}, - this->data(), - this->block_extent() * words_per_block, - [] __device__(word_type a, word_type b) { return a | b; }, - stream.get())); - } + // TODO + // [[nodiscard]] __host__ double occupancy() const; + // [[nodiscard]] __host__ double expected_false_positive_rate(size_t unique_keys) const + // [[nodiscard]] __host__ __device__ static uint32_t optimal_pattern_bits(size_t num_blocks) + // template + // [[nodiscard]] __device__ constexpr auto make_copy(CG group, word_type* const + // memory_to_use, cuda_thread_scope scope = {}) const noexcept; - __host__ constexpr void intersect(bloom_filter_impl const& other, - cuda::stream_ref stream) + // private: + template + __device__ constexpr cuda::std::array vec_load_words(size_type index) const { - this->intersect_async(other, stream); -#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1) - stream.sync(); -#else - stream.wait(); -#endif + // The block storage is aligned to `alignment()`, but a per-lane load at offset `index` is + // only guaranteed to be aligned to `min(NumWords * sizeof(word_type), alignment())`. Hand the + // compiler the alignment that's actually delivered, not the block-level maximum. + constexpr auto load_alignment = + cuda::std::min(NumWords * sizeof(word_type), alignment()); + return *reinterpret_cast*>( + __builtin_assume_aligned(words_ + index, load_alignment)); } - __host__ constexpr void intersect_async( - bloom_filter_impl const& other, cuda::stream_ref stream) + template + __device__ constexpr void add_pattern(uint32_t block_index, uint32_t lower_hash) { - CUCO_EXPECTS(this->block_extent() == other.block_extent(), - "mismatching num_blocks in intersect_async"); - CUCO_CUDA_TRY(cub::DeviceTransform::Transform( - cuda::std::tuple{this->data(), other.data()}, - this->data(), - this->block_extent() * words_per_block, - [] __device__(word_type a, word_type b) { return a & b; }, - stream.get())); - } + static_assert(add_horizontal_layout == 1, "add_pattern() requires add_horizontal_layout == 1"); - [[nodiscard]] __host__ __device__ constexpr word_type* data() noexcept { return words_; } + if constexpr (LoopIndex < add_loop_count) { + auto const pattern = + policy_.template array_pattern(lower_hash); + auto* word_base = words_ + block_index * words_per_block + LoopIndex * add_vertical_layout; - [[nodiscard]] __host__ __device__ constexpr word_type const* data() const noexcept - { - return words_; + for (int i = 0; i < add_vertical_layout; ++i) { + atomic_or(word_base + i, pattern[i]); + } + + // Recurse. + add_pattern(block_index, lower_hash); + } } - [[nodiscard]] __host__ __device__ constexpr extent_type block_extent() const noexcept + template + __device__ constexpr void add_patterns(uint32_t block_index, + uint32_t lower_hash, + uint32_t thread_index) { - return num_blocks_; + static_assert(add_horizontal_layout > 1, "add_patterns() requires add_horizontal_layout > 1"); + + if constexpr (LoopIndex < add_loop_count) { + auto const pattern = + policy_.template array_pattern( + lower_hash, thread_index); + auto* word_base = words_ + block_index * words_per_block + + LoopIndex * add_vertical_layout * add_horizontal_layout + + thread_index * add_vertical_layout; + + for (int i = 0; i < add_vertical_layout; ++i) { + atomic_or(word_base + i, pattern[i]); + } + + // Recurse. + add_patterns(block_index, lower_hash, thread_index); + } } - // TODO - // [[nodiscard]] __host__ double occupancy() const; - // [[nodiscard]] __host__ double expected_false_positive_rate(size_t unique_keys) const - // [[nodiscard]] __host__ __device__ static uint32_t optimal_pattern_bits(size_t num_blocks) - // template - // [[nodiscard]] __device__ constexpr auto make_copy(CG group, word_type* const - // memory_to_use, cuda_thread_scope scope = {}) const noexcept; + template + __device__ constexpr void atomic_or(word_type* word_ptr, word_type pattern) const + { + auto const do_or = [&]() { + if constexpr (tuning::use_cuda_atomic_ref) { + cuda::atomic_ref{*word_ptr}.fetch_or(pattern, + cuda::memory_order_relaxed); + } else { + auto* const p = reinterpret_cast(word_ptr); + auto const v = static_cast(pattern); + if constexpr (thread_scope == cuda::thread_scope_thread) { + *p |= v; + } else if constexpr (thread_scope == cuda::thread_scope_block) { + atomicOr_block(p, v); + } else if constexpr (thread_scope == cuda::thread_scope_device) { + atomicOr(p, v); + } else if constexpr (thread_scope == cuda::thread_scope_system) { + atomicOr_system(p, v); + } else { + static_assert(cuco::dependent_false, + "unsupported cuda::thread_scope for native atomic_or"); + } + } + }; - private: - template - __device__ constexpr cuda::std::array vec_load_words(size_type index) const - { - return *reinterpret_cast*>(__builtin_assume_aligned( - words_ + index, cuda::std::min(sizeof(word_type) * NumWords, max_vec_bytes()))); + if constexpr (ConditionalAtomic) { + if ((*word_ptr & pattern) != pattern) { do_or(); } + } else { + do_or(); + } } - [[nodiscard]] __host__ __device__ static constexpr int32_t add_optimal_cg_size() + /// Compare the stored pattern against the expected pattern for the given hash value. + // Precondition: contains_horizontal_layout == 1 + template + __device__ constexpr bool compare_pattern(uint32_t block_index, uint32_t lower_hash) const { - return words_per_block; // one thread per word so atomic updates can be coalesced + static_assert(contains_horizontal_layout == 1, + "compare_pattern() requires contains_horizontal_layout == 1"); + + if constexpr (LoopIndex < contains_loop_count) { + auto const stored_pattern = this->vec_load_words( + block_index * words_per_block + LoopIndex * contains_vertical_layout); + auto const expected_pattern = + policy_.template array_pattern(lower_hash); + + bool match = true; + for (int i = 0; i < contains_vertical_layout; ++i) { + match &= (stored_pattern[i] & expected_pattern[i]) == expected_pattern[i]; + } + + // Recurse. + // Early exit in this implementation occurs at the granulairy of contains_vertical_layout + // words. + if constexpr (tuning::use_early_exit) { + if (!match) { return false; } + return compare_pattern(block_index, lower_hash); + } else { + return compare_pattern(block_index, lower_hash) && match; + } + } else { + return true; + } } - [[nodiscard]] __host__ __device__ static constexpr int32_t contains_optimal_cg_size() + template + __device__ constexpr bool compare_patterns(uint32_t block_index, + uint32_t lower_hash, + uint32_t thread_index) const { - constexpr auto word_bytes = sizeof(word_type); - constexpr auto block_bytes = word_bytes * words_per_block; - return block_bytes / max_vec_bytes(); // one vector load per thread + static_assert(contains_horizontal_layout > 1, + "compare_patterns() requires HorizontalLayout > 1"); + + if constexpr (LoopIndex < contains_loop_count) { + auto const stored_pattern = this->vec_load_words( + block_index * words_per_block + + LoopIndex * contains_vertical_layout * contains_horizontal_layout + + thread_index * contains_vertical_layout); + auto const expected_pattern = + policy_ + .template array_pattern( + lower_hash, thread_index); + + bool match = true; + for (int i = 0; i < contains_vertical_layout; ++i) { + match &= (stored_pattern[i] & expected_pattern[i]) == expected_pattern[i]; + } + + // Per-thread early exit: short-circuit this thread's recursion if its slice already missed. + if constexpr (tuning::use_early_exit) { + if (!match) { return false; } + return compare_patterns(block_index, lower_hash, thread_index); + } else { + return compare_patterns(block_index, lower_hash, thread_index) && match; + } + } else { + return true; + } } word_type* words_; extent_type num_blocks_; policy_type policy_; + size_t l2_cache_size_ = 0; }; } // namespace cuco::detail diff --git a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl index b251dceee..df95af888 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl +++ b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,14 +24,14 @@ namespace cuco { template -__host__ __device__ constexpr bloom_filter_ref::bloom_filter_ref( +__host__ __device__ bloom_filter_ref::bloom_filter_ref( filter_block_type* data, Extent num_blocks, cuda_thread_scope, Policy const& policy) : impl_{data, num_blocks, {}, policy} { } template -__host__ __device__ constexpr bloom_filter_ref::bloom_filter_ref( +__host__ __device__ bloom_filter_ref::bloom_filter_ref( word_type* data, Extent num_blocks, cuda_thread_scope, Policy const& policy) : impl_{data, num_blocks, {}, policy} { @@ -92,7 +92,7 @@ __host__ constexpr void bloom_filter_ref::add(InputI template template __host__ constexpr void bloom_filter_ref::add_async( - InputIt first, InputIt last, cuda::stream_ref stream) + InputIt first, InputIt last, cuda::stream_ref stream) noexcept { impl_.add_async(first, last, stream); } @@ -129,6 +129,16 @@ template return impl_.contains(group, key); } +template +template +__device__ void bloom_filter_ref::contains(CG group, + InputIt first, + InputIt last, + OutputIt output_begin) const +{ + impl_.contains(group, first, last, output_begin); +} + template template __host__ constexpr void bloom_filter_ref::contains( diff --git a/include/cuco/detail/bloom_filter/default_filter_policy.inl b/include/cuco/detail/bloom_filter/default_filter_policy.inl deleted file mode 100644 index eb8dbf703..000000000 --- a/include/cuco/detail/bloom_filter/default_filter_policy.inl +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace cuco { - -template -__host__ - __device__ constexpr default_filter_policy::default_filter_policy( - uint32_t pattern_bits, Hash hash) - : impl_{pattern_bits, hash} -{ -} - -template -__device__ constexpr typename default_filter_policy::hash_result_type -default_filter_policy::hash( - typename default_filter_policy::hash_argument_type const& key) const -{ - return impl_.hash(key); -} - -template -template -__device__ constexpr auto default_filter_policy::block_index( - typename default_filter_policy::hash_result_type hash, - Extent num_blocks) const -{ - return impl_.block_index(hash, num_blocks); -} - -template -__device__ constexpr typename default_filter_policy::word_type -default_filter_policy::word_pattern( - default_filter_policy::hash_result_type hash, - std::uint32_t word_index) const -{ - return impl_.word_pattern(hash, word_index); -} - -} // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh b/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh deleted file mode 100644 index 14509b9b0..000000000 --- a/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include -#include - -#include -#include - -namespace cuco::detail { - -template -class default_filter_policy_impl { - public: - using hasher = Hash; - using word_type = Word; - using hash_argument_type = typename hasher::argument_type; - using hash_result_type = decltype(std::declval()(std::declval())); - - static constexpr std::uint32_t words_per_block = WordsPerBlock; - - private: - static constexpr std::uint32_t word_bits = cuda::std::numeric_limits::digits; - static constexpr std::uint32_t bit_index_width = cuda::std::bit_width(word_bits - 1); - - public: - __host__ __device__ explicit constexpr default_filter_policy_impl(uint32_t pattern_bits, - Hash hash) - : pattern_bits_{pattern_bits}, - min_bits_per_word_{pattern_bits_ / words_per_block}, - remainder_bits_{pattern_bits_ % words_per_block}, - hash_{hash} - { - NV_DISPATCH_TARGET( - NV_IS_HOST, - ( // This ensures each word in the block has at least one bit set; otherwise we would never - // use some of the words - constexpr uint32_t min_pattern_bits = words_per_block; - - // The maximum number of bits to be set for a key is capped by the total number of bits in - // the filter block - constexpr uint32_t max_pattern_bits = word_bits * words_per_block; - - constexpr uint32_t hash_bits = cuda::std::numeric_limits::digits; - constexpr uint32_t max_pattern_bits_from_hash = hash_bits / bit_index_width; - CUCO_EXPECTS( - pattern_bits <= max_pattern_bits_from_hash, - "`hash_result_type` too narrow to generate the requested number of `pattern_bits`"); - CUCO_EXPECTS(pattern_bits_ >= min_pattern_bits, - "`pattern_bits` must be at least `words_per_block`"); - CUCO_EXPECTS(pattern_bits_ <= max_pattern_bits, - "`pattern_bits` must be less than the total number of bits in a filter " - "block");)) - // TODO find a proper way to perform input checks/assertions on device without destroying the - // context (e.g. __trap()) - } - - __device__ constexpr hash_result_type hash(hash_argument_type const& key) const - { - return hash_(key); - } - - template - __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const - { - return hash % num_blocks; - } - - __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const - { - word_type constexpr bit_index_mask = (word_type{1} << bit_index_width) - 1; - - auto const bits_so_far = min_bits_per_word_ * word_index + - (word_index < remainder_bits_ ? word_index : remainder_bits_); - - hash >>= bits_so_far * bit_index_width; - - word_type word = 0; - int32_t const bits_per_word = min_bits_per_word_ + (word_index < remainder_bits_ ? 1 : 0); - - for (int32_t bit = 0; bit < bits_per_word; ++bit) { - word |= word_type{1} << (hash & bit_index_mask); - hash >>= bit_index_width; - } - - return word; - } - - private: - uint32_t pattern_bits_; - uint32_t min_bits_per_word_; - uint32_t remainder_bits_; - hasher hash_; -}; - -} // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/bloom_filter/kernels.cuh b/include/cuco/detail/bloom_filter/kernels.cuh index 6c918fb7d..d4264a2b1 100644 --- a/include/cuco/detail/bloom_filter/kernels.cuh +++ b/include/cuco/detail/bloom_filter/kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,42 +15,325 @@ */ #pragma once +#include #include #include +#include #include #include #include +#include namespace cuco::detail::bloom_filter_ns { CUCO_SUPPRESS_KERNEL_WARNINGS +template +__device__ void add_n_impl(InputIt first, cuco::detail::index_type n, Ref ref) +{ + namespace cg = cooperative_groups; + using key_type = typename cuda::std::iterator_traits::value_type; + + // Only use warp-cooperative kernels when CGSize > 1 + if constexpr (Ref::tuning::use_warp_cooperative_add_kernel && CGSize > 1) { + auto const idx = cuco::detail::global_thread_id(); + auto group = cg::tiled_partition(cg::this_thread_block()); + auto const is_full_tile = (blockIdx.x + 1) * BlockSize <= n; + if (is_full_tile) { + key_type const& key = *(first + idx); + ref.add_coop(group, key); + } else { + auto const is_valid = idx < n; + ref.add_coop(group, first, idx, is_valid); + } + } else { + auto const idx = cuco::detail::global_thread_id() / CGSize; + if constexpr (CGSize == 1) { + if (idx < n) { + key_type const& key = *(first + idx); + ref.add(key); + } + } else { + auto group = cg::tiled_partition(cg::this_thread_block()); + if (idx < n) { + key_type const& key = *(first + idx); + ref.add(group, key); + } + } + } +} + +template +CUCO_KERNEL __launch_bounds__(BlockSize) void add_n(InputIt first, + cuco::detail::index_type n, + Ref ref) +{ + add_n_impl(first, n, ref); +} -template -CUCO_KERNEL __launch_bounds__(BlockSize) void add(InputIt first, - cuco::detail::index_type n, - Ref ref) +template +__device__ void add_work_stealing_n_impl(InputIt first, cuco::detail::index_type n, Ref ref) { - namespace cg = cooperative_groups; +#if __cccl_ptx_isa >= 860 + using key_type = typename cuda::std::iterator_traits::value_type; + + namespace cg = cooperative_groups; + namespace ptx = cuda::ptx; + + // Cluster launch control initialization: + __shared__ uint4 result; + __shared__ uint64_t bar; + int phase = 0; - constexpr auto tile_size = cuco::detail::warp_size(); + auto const block = cg::this_thread_block(); - auto const tile_idx = cuco::detail::global_thread_id() / tile_size; - auto const n_tiles = gridDim.x * BlockSize / tile_size; - auto const items_per_tile = cuco::detail::int_div_ceil(n, n_tiles); + cg::invoke_one(block, [&]() { ptx::mbarrier_init(&bar, 1); }); - auto const tile_start = tile_idx * items_per_tile; - if (tile_start >= n) { return; } - auto const tile_stop = (tile_start + items_per_tile < n) ? tile_start + items_per_tile : n; + int bx = blockIdx.x; - auto const tile = cg::tiled_partition(cg::this_thread_block()); + // Work-stealing loop: + while (true) { + // Protect result from overwrite in the next iteration, + // (also ensure barrier initialization at 1st iteration): + block.sync(); + + cg::invoke_one(block, [&]() { + // Acquire write of result in the async proxy: + ptx::fence_proxy_async_generic_sync_restrict( + ptx::sem_acquire, ptx::space_cluster, ptx::scope_cluster); + + cg::invoke_one(cg::coalesced_threads(), + [&]() { ptx::clusterlaunchcontrol_try_cancel(&result, &bar); }); + ptx::mbarrier_arrive_expect_tx( + ptx::sem_relaxed, ptx::scope_cta, ptx::space_shared, &bar, sizeof(uint4)); + }); + + // Computation: + // Only use warp-cooperative kernels when CGSize > 1 + if constexpr (Ref::tuning::use_warp_cooperative_add_kernel && CGSize > 1) { + cuco::detail::index_type const idx = BlockSize * bx + threadIdx.x; + auto group = cg::tiled_partition(block); + auto const is_full_tile = (bx + 1) * BlockSize <= n; + if (is_full_tile) { + key_type const& key = *(first + idx); + ref.add_coop(group, key); + } else { + auto const is_valid = idx < n; + ref.add_coop(group, first, idx, is_valid); + } + } else { + cuco::detail::index_type const idx = + (static_cast(BlockSize) * bx + threadIdx.x) / CGSize; + if constexpr (CGSize == 1) { + if (idx < n) { + key_type const& key = *(first + idx); + ref.add(key); + } + } else { + auto group = cg::tiled_partition(block); + if (idx < n) { + key_type const& key = *(first + idx); + ref.add(group, key); + } + } + } - ref.add(tile, first + tile_start, first + tile_stop); + // Cancellation request synchronization: + while (!ptx::mbarrier_try_wait_parity(ptx::sem_acquire, ptx::scope_cta, &bar, phase)) {} + phase ^= 1; + + // Cancellation request decoding: + bool success = ptx::clusterlaunchcontrol_query_cancel_is_canceled(result); + if (!success) break; + + bx = ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_x(result); + + // Release read of result to the async proxy: + ptx::fence_proxy_async_generic_sync_restrict( + ptx::sem_release, ptx::space_shared, ptx::scope_cluster); + } +#else + // Cluster launch control PTX intrinsics require PTX ISA >= 8.6 (CTK 12.8+). + // On older toolkits, fall back to the regular kernel body. The runtime path is + // also guarded by `NV_PROVIDES_SM_100`, so this branch is unreachable when SM100 + // is targeted but the toolkit is too old to expose the intrinsics. + add_n_impl(first, n, ref); +#endif } -template +CUCO_KERNEL __launch_bounds__(BlockSize) void add_work_stealing_n(InputIt first, + cuco::detail::index_type n, + Ref ref) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_100, + (add_work_stealing_n_impl(first, n, ref);), + (add_n_impl(first, n, ref);)) +} + +template +__device__ void contains_n_impl(InputIt first, + cuco::detail::index_type n, + OutputIt output_begin, + Ref ref) +{ + namespace cg = cooperative_groups; + using key_type = typename cuda::std::iterator_traits::value_type; + + // Only use warp-cooperative kernels when CGSize > 1 + if constexpr (Ref::tuning::use_warp_cooperative_contains_kernel && CGSize > 1) { + auto const idx = cuco::detail::global_thread_id(); + auto group = cg::tiled_partition(cg::this_thread_block()); + auto const is_full_tile = (blockIdx.x + 1) * BlockSize <= n; + if (is_full_tile) { + key_type const& key = *(first + idx); + *(output_begin + idx) = ref.contains_coop(group, key); + } else { + auto const is_valid = idx < n; + auto const result = ref.contains_coop(group, first, idx, is_valid); + if (is_valid) { *(output_begin + idx) = result; } + } + } else { + auto idx = cuco::detail::global_thread_id() / CGSize; + if constexpr (CGSize == 1) { + if (idx < n) { + key_type const& key = *(first + idx); + *(output_begin + idx) = ref.contains(key); + } + } else { + auto group = cg::tiled_partition(cg::this_thread_block()); + if (idx < n) { + key_type const& key = *(first + idx); + // ref.contains(group, key) already reduces across the group via group.all(...). + auto const found = ref.contains(group, key); + if (group.thread_rank() == 0) { *(output_begin + idx) = found; } + } + } + } +} + +template +CUCO_KERNEL __launch_bounds__(BlockSize) void contains_n(InputIt first, + cuco::detail::index_type n, + OutputIt output_begin, + Ref ref) +{ + contains_n_impl(first, n, output_begin, ref); +} + +template +__device__ void contains_work_stealing_n_impl(InputIt first, + cuco::detail::index_type n, + OutputIt output_begin, + Ref ref) +{ +#if __cccl_ptx_isa >= 860 + using key_type = typename cuda::std::iterator_traits::value_type; + + namespace cg = cooperative_groups; + namespace ptx = cuda::ptx; + + // Cluster launch control initialization: + __shared__ uint4 result; + __shared__ uint64_t bar; + int phase = 0; + + auto const block = cg::this_thread_block(); + + cg::invoke_one(block, [&]() { ptx::mbarrier_init(&bar, 1); }); + + int bx = blockIdx.x; + + // Work-stealing loop: + while (true) { + // Protect result from overwrite in the next iteration, + // (also ensure barrier initialization at 1st iteration): + block.sync(); + + cg::invoke_one(block, [&]() { + // Acquire write of result in the async proxy: + ptx::fence_proxy_async_generic_sync_restrict( + ptx::sem_acquire, ptx::space_cluster, ptx::scope_cluster); + + cg::invoke_one(cg::coalesced_threads(), + [&]() { ptx::clusterlaunchcontrol_try_cancel(&result, &bar); }); + ptx::mbarrier_arrive_expect_tx( + ptx::sem_relaxed, ptx::scope_cta, ptx::space_shared, &bar, sizeof(uint4)); + }); + + // Computation: + // Only use warp-cooperative kernels when CGSize > 1 + if constexpr (Ref::tuning::use_warp_cooperative_contains_kernel && CGSize > 1) { + cuco::detail::index_type const idx = BlockSize * bx + threadIdx.x; + auto group = cg::tiled_partition(block); + auto const is_full_tile = (bx + 1) * BlockSize <= n; + if (is_full_tile) { + key_type const& key = *(first + idx); + *(output_begin + idx) = ref.contains_coop(group, key); + } else { + auto const is_valid = idx < n; + auto const result = ref.contains_coop(group, first, idx, is_valid); + if (is_valid) { *(output_begin + idx) = result; } + } + } else { + cuco::detail::index_type const idx = + (static_cast(BlockSize) * bx + threadIdx.x) / CGSize; + if constexpr (CGSize == 1) { + if (idx < n) { + key_type const& key = *(first + idx); + *(output_begin + idx) = ref.contains(key); + } + } else { + auto group = cg::tiled_partition(block); + if (idx < n) { + key_type const& key = *(first + idx); + // ref.contains(group, key) already reduces across the group via group.all(...). + auto const found = ref.contains(group, key); + if (group.thread_rank() == 0) { *(output_begin + idx) = found; } + } + } + } + + // Cancellation request synchronization: + while (!ptx::mbarrier_try_wait_parity(ptx::sem_acquire, ptx::scope_cta, &bar, phase)) {} + phase ^= 1; + + // Cancellation request decoding: + bool success = ptx::clusterlaunchcontrol_query_cancel_is_canceled(result); + if (!success) break; + + bx = ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_x(result); + + // Release read of result to the async proxy: + ptx::fence_proxy_async_generic_sync_restrict( + ptx::sem_release, ptx::space_shared, ptx::scope_cluster); + } +#else + // Cluster launch control PTX intrinsics require PTX ISA >= 8.6 (CTK 12.8+). + // On older toolkits, fall back to the regular kernel body. The runtime path is + // also guarded by `NV_PROVIDES_SM_100`, so this branch is unreachable when SM100 + // is targeted but the toolkit is too old to expose the intrinsics. + contains_n_impl(first, n, output_begin, ref); +#endif +} + +template +CUCO_KERNEL __launch_bounds__(BlockSize) void contains_work_stealing_n(InputIt first, + cuco::detail::index_type n, + OutputIt output_begin, + Ref ref) +{ + NV_IF_ELSE_TARGET( + NV_PROVIDES_SM_100, + (contains_work_stealing_n_impl(first, n, output_begin, ref);), + (contains_n_impl(first, n, output_begin, ref);)) +} + +template ::value_type; - [[maybe_unused]] auto const tile = - cg::tiled_partition(cg::this_thread_block()); - - while (idx < n) { - if (pred(*(stencil + idx))) { - typename cuda::std::iterator_traits::value_type const& insert_element{ - *(first + idx)}; - ref.add(tile, insert_element); + if constexpr (Ref::tuning::use_warp_cooperative_add_kernel && CGSize > 1) { + auto const idx = cuco::detail::global_thread_id(); + auto group = cg::tiled_partition(cg::this_thread_block()); + auto const in_range = idx < n; + auto const is_valid = in_range && pred(*(stencil + idx)); + ref.template add_coop(group, first, idx, is_valid); + } else { + auto const idx = cuco::detail::global_thread_id() / CGSize; + if (idx < n && pred(*(stencil + idx))) { + key_type const& key = *(first + idx); + if constexpr (CGSize == 1) { + ref.template add(key); + } else { + auto group = cg::tiled_partition(cg::this_thread_block()); + ref.template add(group, key); + } } - idx += loop_stride; } } @@ -88,30 +376,39 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, - OutputIt out, + OutputIt output_begin, Ref ref) { - namespace cg = cooperative_groups; + namespace cg = cooperative_groups; + using key_type = typename cuda::std::iterator_traits::value_type; - auto const loop_stride = cuco::detail::grid_stride() / CGSize; - auto idx = cuco::detail::global_thread_id() / CGSize; - - [[maybe_unused]] auto const tile = - cg::tiled_partition(cg::this_thread_block()); - - if constexpr (CGSize == 1) { - while (idx < n) { - typename cuda::std::iterator_traits::value_type const& key = *(first + idx); - *(out + idx) = pred(*(stencil + idx)) ? ref.contains(key) : false; - idx += loop_stride; - } + if constexpr (Ref::tuning::use_warp_cooperative_contains_kernel && CGSize > 1) { + auto const idx = cuco::detail::global_thread_id(); + auto group = cg::tiled_partition(cg::this_thread_block()); + auto const in_range = idx < n; + auto const is_valid = in_range && pred(*(stencil + idx)); + auto const result = ref.contains_coop(group, first, idx, is_valid); + if (in_range) { *(output_begin + idx) = is_valid ? result : false; } } else { - auto const tile = cg::tiled_partition(cg::this_thread_block()); - while (idx < n) { - typename cuda::std::iterator_traits::value_type const& key = *(first + idx); - auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false; - if (tile.thread_rank() == 0) { *(out + idx) = found; } - idx += loop_stride; + auto const idx = cuco::detail::global_thread_id() / CGSize; + if (idx < n) { + if constexpr (CGSize == 1) { + if (pred(*(stencil + idx))) { + key_type const& key = *(first + idx); + *(output_begin + idx) = ref.contains(key); + } else { + *(output_begin + idx) = false; + } + } else { + auto group = cg::tiled_partition(cg::this_thread_block()); + bool result = false; + if (pred(*(stencil + idx))) { + key_type const& key = *(first + idx); + // ref.contains(group, key) already reduces across the group via group.all(...). + result = ref.contains(group, key); + } + if (group.thread_rank() == 0) { *(output_begin + idx) = result; } + } } } } diff --git a/include/cuco/detail/bloom_filter/parametric_filter_policy.cuh b/include/cuco/detail/bloom_filter/parametric_filter_policy.cuh new file mode 100644 index 000000000..c4e753249 --- /dev/null +++ b/include/cuco/detail/bloom_filter/parametric_filter_policy.cuh @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2025-2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace cuco::detail { + +/** + * @brief Sectorized Bloom filter policy with multiplicative-hashing fingerprint generation. + * + * Implements the Sectorized Bloom Filter (SBF) variant from "Optimizing Bloom Filters for Modern + * GPU Architectures" (arXiv:2512.15595). Distributes `PatternBits` set bits across `WordsPerBlock` + * words using compile-time salt-based multiplicative hashing. The hash result is split into upper + * 32 bits (block selection via multiply-shift) and lower 32 bits (pattern generation), so a 64-bit + * hash function is required by design. + * + * @tparam Hash 64-bit hash functor whose return type satisfies `is_same_v`. + * @tparam Word Underlying word type of a filter block. Must be an atomically updatable integral. + * @tparam WordsPerBlock Words per filter block. Must be a power of two and <= 32. + * @tparam PatternBits Number of fingerprint bits (k in the paper). + * @tparam AddHorizontalLayout CG size used for `add` (paper's Theta). Must be a power of two and + * `AddHorizontalLayout * AddVerticalLayout <= WordsPerBlock`. + * @tparam AddVerticalLayout Contiguous words processed per thread per `add` step (paper's Phi). + * @tparam ContainsHorizontalLayout CG size used for `contains` (paper's Theta). + * @tparam ContainsVerticalLayout Contiguous words processed per thread per `contains` step (paper's + * Phi). + */ +template +class parametric_filter_policy { + public: + using hasher = Hash; ///< 64-bit hash functor type + using word_type = Word; ///< Underlying filter-block word type + using hash_argument_type = typename hasher::argument_type; ///< Hash function input type + using hash_result_type = + decltype(std::declval()(std::declval())); ///< Hash function + ///< output type + + private: + static constexpr uint32_t max_salts = 64; + static constexpr cuda::std::array salts = { + 0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU, 0x705495c7U, 0x2df1424bU, 0x9efc4947U, + 0x5c6bfb31U, 0xb24bcdffU, 0xb6843d6dU, 0x6db04543U, 0x3a12efddU, 0xb0ddd463U, 0x8d22f6e7U, + 0xb82f1e53U, 0x7db9f86bU, 0xc7afe639U, 0xfb135cd7U, 0x693256e1U, 0x9466d871U, 0x23d3d02fU, + 0x6461d049U, 0x66a91621U, 0xbaa3006fU, 0x52fb8d99U, 0x3ea88b4fU, 0xf470cfdU, 0xb1db79a5U, + 0x9809fcd1U, 0xbced4445U, 0x2eb7c737U, 0x2cea6803U, 0x156f1955U, 0x8813c027U, 0xa26819f9U, + 0x4c3b57bdU, 0x7df94487U, 0xb975e769U, 0xb8f20cb5U, 0x5c9e2e77U, 0x5fb1735fU, 0x3a6f759bU, + 0x3c090923U, 0xfced424dU, 0xa187a6a9U, 0x6f070a41U, 0x2c85233bU, 0x7e62258bU, 0x2771ef17U, + 0x13bbf093U, 0x4ff059e5U, 0xe3ce3d0fU, 0xf1b4789fU, 0x9fbb6173U, 0x6a320cf5U, 0x1be2c481U, + 0x7ba8222bU, 0x6fd619b3U, 0x7b1bbf0dU, 0x8b8993adU, 0x448eca95U, 0x82ab09d9U, 0x2ce53909U, + 0x4f548685U}; + static constexpr uint32_t word_bits = cuda::std::numeric_limits::digits; + + public: + static constexpr uint32_t words_per_block = WordsPerBlock; ///< Number of words per filter block + static constexpr uint32_t pattern_bits = PatternBits; ///< Fingerprint bits per key + + static constexpr uint32_t add_horizontal_layout = + AddHorizontalLayout; ///< horizontal vectorization layout for add operation + static constexpr uint32_t add_vertical_layout = + AddVerticalLayout; ///< vertical vectorization layout for add operation + static constexpr uint32_t contains_horizontal_layout = + ContainsHorizontalLayout; ///< horizontal vectorization layout for contains operation + static constexpr uint32_t contains_vertical_layout = + ContainsVerticalLayout; ///< vertical vectorization layout for contains operation + + static constexpr size_t max_filter_blocks = + cuda::std::numeric_limits::max(); ///< Upper bound on the number of filter blocks + /// Lower bound on `pattern_bits`: at least one bit per word so every word contributes. + static constexpr auto min_pattern_bits = words_per_block; + /// Upper bound on `pattern_bits`: the total number of bits in a filter block, capped by the + /// number of available salts. + static constexpr auto max_pattern_bits = cuda::std::min(word_bits * words_per_block, max_salts); + + private: + static constexpr uint32_t bit_index_width = cuda::std::bit_width(word_bits - 1); + // TODO: for non-multiple `(pattern_bits, words_per_block)` configs (e.g. PatternBits=12, + // WordsPerBlock=8), the salt walk in `set_bits` advances `PatternArrayIndex` every + // `max_bits_per_word` salts, packing all bits into the first + // `ceil(pattern_bits / words_per_block)` words and leaving the rest at zero. This wastes block + // capacity and inflates FPR. Distribute floor bits to every word plus one extra bit to the + // first `pattern_bits % words_per_block` words, and update the salt-to-word mapping in + // `set_bits` accordingly. + static constexpr uint32_t max_bits_per_word = + cuco::detail::int_div_ceil(pattern_bits, words_per_block); + + public: + /** + * @brief Constructs a parametric filter policy. + * + * @param hash Hash function used to generate fingerprints. + */ + __host__ __device__ constexpr parametric_filter_policy(Hash hash = {}) : hash_{hash} + { + static_assert(pattern_bits >= min_pattern_bits, + "pattern_bits must be at least words_per_block"); + static_assert(pattern_bits <= max_pattern_bits, + "pattern_bits must be less than the total number of bits in a filter block"); + // Require exact tiling. With `words_per_block` a power of two, this is equivalent to requiring + // both `add_horizontal_layout` and `add_vertical_layout` to be powers of two with product + // <= `words_per_block`. The internal loop count uses integer division on the product; non- + // dividing layouts would leave trailing words uninserted on add while contains still expects + // non-zero patterns there, producing false negatives for every inserted key. + static_assert(words_per_block % (add_horizontal_layout * add_vertical_layout) == 0, + "add_horizontal_layout * add_vertical_layout must evenly divide words_per_block"); + static_assert( + words_per_block % (contains_horizontal_layout * contains_vertical_layout) == 0, + "contains_horizontal_layout * contains_vertical_layout must evenly divide words_per_block"); + // The split_hash() design requires a 64-bit hash split into upper 32 bits (block selection + // via multiply-shift) and lower 32 bits (pattern generation via salt-based multiplicative + // hashing). This is a permanent design requirement, not a temporary limitation. + static_assert(cuda::std::is_same_v, + "parametric_filter_policy requires a 64-bit hash function"); + } + + /** + * @brief Splits the 64-bit hash of a key into its upper and lower 32 bits. + * + * The upper half is used for block selection (via multiply-shift); the lower half drives the + * per-word fingerprint pattern via salt-based multiplicative hashing. + * + * @param key Key to hash. + * + * @return `{upper 32 bits, lower 32 bits}` of the 64-bit hash. + */ + __device__ constexpr cuda::std::pair split_hash(hash_argument_type key) const + { + auto const hash_value = hash_(key); + return {static_cast(hash_value >> 32), static_cast(hash_value)}; + } + + /** + * @brief Determines the filter block a key maps to via fast multiply-shift modulo. + * + * @tparam Extent Size type used to determine the number of blocks in the filter. + * + * @param upper_hash_value Upper 32 bits of the key's hash. + * @param num_blocks Number of blocks in the filter. + * + * @return Block index in `[0, num_blocks)`. + */ + template + __device__ constexpr auto block_index(uint32_t upper_hash_value, Extent num_blocks) const + { + return static_cast((static_cast(upper_hash_value) * + static_cast(num_blocks)) >> + 32); + } + + /** + * @brief Generates the per-word fingerprint pattern for a key when the horizontal layout is 1. + * + * @tparam LoopIndex Outer-loop iteration index when `words_per_block / VerticalLayout > 1`. + * @tparam VerticalLayout Number of contiguous words this call produces. + * + * @param lower_hash_value Lower 32 bits of the key's hash. + * + * @return Array of `VerticalLayout` words. + */ + template + __device__ constexpr auto array_pattern(uint32_t lower_hash_value) const + { + return pattern_impl(lower_hash_value); + } + + /** + * @brief Generates the per-word fingerprint pattern for a key when the horizontal layout is > 1. + * + * @tparam LoopIndex Outer-loop iteration index. + * @tparam HorizontalLayout Cooperative-group size cooperating on a single key. + * @tparam VerticalLayout Number of contiguous words this call produces. + * + * @param lower_hash_value Lower 32 bits of the key's hash. + * @param thread_index Caller's rank within the cooperative group. + * + * @return Array of `VerticalLayout` words owned by the calling thread. + */ + template + __device__ constexpr auto array_pattern(uint32_t lower_hash_value, uint32_t thread_index) const + { + return pattern_impl(lower_hash_value, + thread_index); + } + + private: + hasher hash_; + + /** + * @brief pattern_impl - Computes the bit pattern for a vertical layout of words. + * I use the terminology of a `virtual thread` to refer to an ordering of the vertical layouts, + * namely + * virtual_thread_index = LoopIndex * HorizontalLayout + thread_index, + * where LoopIndex is the index of the outermost loop in the range: + * [0, words_per_block / (HorizontalLayout * VerticalLayout)). + * @param hash + * @return cuda::std::array - The bit pattern for the vertical layout + * defined by the LoopIndex. + */ + + // Precondition: _horizontal_layout == 1 + template + __device__ constexpr auto pattern_impl(uint32_t hash) const + { + using pattern_array_t = cuda::std::array; + + // Sanity check + constexpr uint32_t num_iterations = words_per_block / VerticalLayout; + static_assert(LoopIndex < num_iterations, + "the loop index cannot exceed the number of loop iterations"); + + pattern_array_t pattern_array{0}; + constexpr uint32_t salt_start_index = max_bits_per_word * VerticalLayout * LoopIndex; + constexpr uint32_t salt_end_index = + cuda::std::min(salt_start_index + max_bits_per_word * VerticalLayout, pattern_bits); + constexpr uint32_t pattern_array_start_index = 0; + set_bits(hash, pattern_array); + return pattern_array; + } + + // Precondition: _horizontal_layout > 1 + template + __device__ constexpr auto pattern_impl(uint32_t hash, uint32_t thread_index) const + { + using pattern_array_t = cuda::std::array; + + // Sanity check + constexpr uint32_t num_iterations = words_per_block / (HorizontalLayout * VerticalLayout); + static_assert(LoopIndex < num_iterations, + "the loop index cannot exceed the number of loop iterations"); + + // [lower_bound, upper_bound) defines the range of virtual thread indices for this loop + // iteration. + constexpr uint32_t lower_bound = LoopIndex * HorizontalLayout; + constexpr uint32_t upper_bound = lower_bound + HorizontalLayout; + + // A virtual thread flips max_bits_per_virtual_thread bits in the pattern array, excepting + // potentially some of the last virtual threads (if pattern_bits % words_per_block != 0). + constexpr uint32_t max_bits_per_virtual_thread = max_bits_per_word * VerticalLayout; + + pattern_array_t pattern_array{0}; + if constexpr (num_iterations == 1) { + thread_dispatch( + hash, thread_index, pattern_array); + } else { + uint32_t const virtual_thread_index = LoopIndex * HorizontalLayout + thread_index; + thread_dispatch( + hash, virtual_thread_index, pattern_array); + } + return pattern_array; + } + + // Dispatches a dynamic virtual thread index to a static virtual thread index by building a + // compile-time decision tree over the range [LowerBound, UpperBound) for the virtual thread + // index. This method is only used when _horizontal_layout > 1. + template + __device__ constexpr void thread_dispatch(uint32_t hash, + uint32_t thread_index, + PatternArrayT& pattern_array) const + { + // Sanity check + static_assert(LowerBound < UpperBound); + + if constexpr (LowerBound + 1 == UpperBound) { + // Base case: thread_index == LowerBound + constexpr uint32_t salt_start_index = MaxBitsPerVirtualThread * LowerBound; + constexpr uint32_t salt_end_index = + cuda::std::min(salt_start_index + MaxBitsPerVirtualThread, pattern_bits); + constexpr uint32_t pattern_array_start_index = 0; + set_bits(hash, pattern_array); + } else { + // Recursive case: thread_index > LowerBound + constexpr uint32_t mid = (LowerBound + UpperBound) / 2; + if (thread_index < mid) { + thread_dispatch( + hash, thread_index, pattern_array); + } else { + thread_dispatch( + hash, thread_index, pattern_array); + } + } + } + + // Set bits in the pattern array using salts starting from SaltIndex. + template + __device__ constexpr void set_bits(uint32_t hash, PatternArrayT& pattern_array) const + { + if constexpr (SaltIndex < SaltEndIndex) { + // Select top bit_index_width bits from salted hash to determine the bit index. + uint32_t const bit_index = + (cuda::std::get(salts) * hash) >> (32 - bit_index_width); + + // Set the bit in the pattern array. + cuda::std::get(pattern_array) |= word_type{1} << bit_index; + + // Recurse. + constexpr uint32_t next_salt_index = SaltIndex + 1; + constexpr uint32_t next_pattern_array_index = + PatternArrayIndex + (next_salt_index % max_bits_per_word == 0 ? 1 : 0); + set_bits(hash, pattern_array); + } + } +}; + +} // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/utility/cuda.hpp b/include/cuco/detail/utility/cuda.hpp index ae2613f21..5d6342427 100644 --- a/include/cuco/detail/utility/cuda.hpp +++ b/include/cuco/detail/utility/cuda.hpp @@ -75,5 +75,16 @@ constexpr auto max_occupancy_grid_size(std::int32_t block_size, return max_active_blocks_per_multiprocessor * num_multiprocessors; } +inline auto l2_cache_size() +{ + int device = 0; + CUCO_CUDA_TRY(cudaGetDevice(&device)); + + int l2_cache_size = -1; + CUCO_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_size, cudaDevAttrL2CacheSize, device)); + + return l2_cache_size; +} + } // namespace detail } // namespace cuco diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 51d4f42c1..09c31ee35 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -154,9 +154,11 @@ ConfigureTest(HYPERLOGLOG_TEST # - bloom_filter ---------------------------------------------------------------------------------- ConfigureTest(BLOOM_FILTER_TEST bloom_filter/unique_sequence_test.cu - bloom_filter/arrow_policy_test.cu bloom_filter/variable_cg_test.cu - bloom_filter/merge_intersect_test.cu) + bloom_filter/merge_intersect_test.cu + bloom_filter/arrow_compat_test.cu + bloom_filter/bulk_ref_equivalence_test.cu + bloom_filter/layout_equivalence_test.cu) ################################################################################################### # - roaring_bitmap --------------------------------------------------------------------------------- diff --git a/tests/bloom_filter/arrow_policy_test.cu b/tests/bloom_filter/arrow_compat_test.cu similarity index 82% rename from tests/bloom_filter/arrow_policy_test.cu rename to tests/bloom_filter/arrow_compat_test.cu index 1ca349162..2e820fde2 100644 --- a/tests/bloom_filter/arrow_policy_test.cu +++ b/tests/bloom_filter/arrow_compat_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,18 +14,23 @@ * limitations under the License. */ +// Verifies that a `parametric_filter_policy` instantiation with Apache Arrow's Block-Split Bloom +// Filter parameters (256-bit blocks of 8 x uint32_t, 8 fingerprint bits per key, fully horizontal +// add, fully vertical contains) produces byte-identical bitsets to a precomputed Arrow reference. + #include #include #include #include -#include #include -#include +#include #include +#include +#include namespace { @@ -73,11 +78,11 @@ thrust::device_vector get_arrow_filter_reference_bitset() }; if constexpr (std::is_same_v) { - return reference_bitsets[0]; // int32 + return reference_bitsets[0]; } else if constexpr (std::is_same_v) { - return reference_bitsets[1]; // int64 + return reference_bitsets[1]; } else if constexpr (std::is_same_v) { - return reference_bitsets[2]; // float + return reference_bitsets[2]; } else { throw std::invalid_argument("Reference bitsets available for int32, int64, float only.\n\n"); } @@ -93,11 +98,11 @@ std::pair get_arrow_filter_test_settings() }; if constexpr (std::is_same_v) { - return test_settings[0]; // int32 + return test_settings[0]; } else if constexpr (std::is_same_v) { - return test_settings[1]; // int64 + return test_settings[1]; } else if constexpr (std::is_same_v) { - return test_settings[2]; // float + return test_settings[2]; } else { throw std::invalid_argument("Test settings available for int32, int64, float only.\n\n"); } @@ -119,20 +124,14 @@ void test_filter_bitset(Filter& filter, size_t num_keys) using key_type = typename Filter::key_type; using word_type = typename Filter::word_type; - // Generate keys auto const h_keys = sequence_values(num_keys); thrust::device_vector d_keys(h_keys.begin(), h_keys.end()); - // Insert to the bloom filter filter.add(d_keys.begin(), d_keys.begin() + num_keys); - // Get reference words device_vector auto const reference_words = get_arrow_filter_reference_bitset(); + auto const num_words = filter.block_extent() * filter.words_per_block; - // Number of words in the filter - auto const num_words = filter.block_extent() * filter.words_per_block; - - // Get the bitset thrust::device_vector filter_words(filter.data(), filter.data() + num_words); REQUIRE(cuco::test::equal( @@ -144,17 +143,19 @@ void test_filter_bitset(Filter& filter, size_t num_keys) }))); } -TEMPLATE_TEST_CASE_SIG("bloom_filter arrow filter policy bitset validation", +TEMPLATE_TEST_CASE_SIG("bloom_filter arrow-compatible parametric policy bitset validation", "", (class Key), (int32_t), (int64_t), (float)) { - // Get test settings auto const [sub_filters, num_keys] = get_arrow_filter_test_settings(); - using policy_type = cuco::arrow_filter_policy; + // Apache Arrow Block-Split Bloom Filter parameters: 256-bit blocks (8 x uint32_t), 8 fingerprint + // bits per key, fully horizontal add (Theta=8) and fully vertical contains (Phi=8). + using policy_type = + cuco::parametric_filter_policy, std::uint32_t, 8, 8, 8, 1, 1, 8>; cuco::bloom_filter, cuda::thread_scope_device, policy_type> filter{ sub_filters}; diff --git a/tests/bloom_filter/bulk_ref_equivalence_test.cu b/tests/bloom_filter/bulk_ref_equivalence_test.cu new file mode 100644 index 000000000..9206563f8 --- /dev/null +++ b/tests/bloom_filter/bulk_ref_equivalence_test.cu @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Host-bulk vs device-ref equivalence. The host-bulk APIs route through `add_n` / +// `contains_n` in `kernels.cuh`; the device-ref APIs route through scalar / CG / CG-range +// methods on `bloom_filter_ref` directly. Both paths share `policy_.array_pattern(...)`, +// so given the same input keys both must produce byte-identical filter bitsets (add side) +// and byte-identical query results (contains side). Catches regressions like the +// `add_coop(group, first, idx, is_valid)` iterator-formation UB fix where one path +// silently diverged from the other. + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace cg = cooperative_groups; + +using size_type = int32_t; + +template +__global__ void scalar_add_kernel(Ref ref, Key const* keys, size_type n) +{ + auto const i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i < n) { ref.add(keys[i]); } +} + +template +__global__ void cg_add_kernel(Ref ref, Key const* keys, size_type n) +{ + auto const block = cg::this_thread_block(); + auto const tile = cg::tiled_partition(block); + auto const idx = static_cast((blockIdx.x * blockDim.x + threadIdx.x) / CGSize); + if (idx < n) { ref.add(tile, keys[idx]); } +} + +template +__global__ void cg_range_add_kernel(Ref ref, Key const* first, Key const* last) +{ + auto const block = cg::this_thread_block(); + auto const tile = cg::tiled_partition(block); + ref.add(tile, first, last); +} + +template +__global__ void scalar_contains_kernel(Ref ref, Key const* keys, size_type n, bool* out) +{ + auto const i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i < n) { out[i] = ref.contains(keys[i]); } +} + +template +__global__ void cg_contains_kernel(Ref ref, Key const* keys, size_type n, bool* out) +{ + auto const block = cg::this_thread_block(); + auto const tile = cg::tiled_partition(block); + auto const idx = static_cast((blockIdx.x * blockDim.x + threadIdx.x) / CGSize); + if (idx < n) { + auto const found = ref.contains(tile, keys[idx]); + if (tile.thread_rank() == 0) { out[idx] = found; } + } +} + +template +__global__ void cg_range_contains_kernel(Ref ref, Key const* first, Key const* last, bool* out) +{ + auto const block = cg::this_thread_block(); + auto const tile = cg::tiled_partition(block); + ref.contains(tile, first, last, out); +} + +TEMPLATE_TEST_CASE_SIG( + "bloom_filter: host bulk add equals device ref add", + "", + ((class Key, class Policy), Key, Policy), + (int32_t, cuco::default_filter_policy), + (int32_t, cuco::parametric_filter_policy, uint32_t, 1, 1, 1, 1, 1, 1>), + (uint64_t, + cuco::parametric_filter_policy, uint32_t, 8, 12, 8, 1, 4, 2>), + (float, cuco::parametric_filter_policy, uint64_t, 4, 4, 2, 2, 1, 2>), + (int32_t, cuco::parametric_filter_policy, uint32_t, 8, 8, 2, 2, 1, 8>)) +{ + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, Policy>; + // Prime: forces a partial boundary tile (mix of valid + invalid lanes) on every + // CGSize > 1 path, exercising the cooperative is_valid mask logic. + constexpr size_type num_keys = 397; + + thrust::device_vector keys(num_keys); + thrust::sequence(thrust::device, keys.begin(), keys.end()); + auto const keys_raw = thrust::raw_pointer_cast(keys.data()); + + SECTION("scalar ref.add") + { + auto filter_a = filter_type{1000}; + auto filter_b = filter_type{1000}; + + filter_a.add(keys.begin(), keys.end()); + + auto ref = filter_b.ref(); + constexpr int block_size = 128; + int const grid_size = (num_keys + block_size - 1) / block_size; + scalar_add_kernel<<>>(ref, keys_raw, num_keys); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + auto const total_words = + static_cast(filter_a.block_extent()) * filter_type::words_per_block; + REQUIRE(thrust::equal( + thrust::device, filter_a.data(), filter_a.data() + total_words, filter_b.data())); + } + + SECTION("CG ref.add(group, key)") + { + constexpr int CGSize = Policy::add_horizontal_layout; + auto filter_a = filter_type{1000}; + auto filter_b = filter_type{1000}; + + filter_a.add(keys.begin(), keys.end()); + + auto ref = filter_b.ref(); + constexpr int block_size = 128; + int const grid_size = (num_keys * CGSize + block_size - 1) / block_size; + cg_add_kernel<<>>(ref, keys_raw, num_keys); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + auto const total_words = + static_cast(filter_a.block_extent()) * filter_type::words_per_block; + REQUIRE(thrust::equal( + thrust::device, filter_a.data(), filter_a.data() + total_words, filter_b.data())); + } + + SECTION("CG ref.add(group, first, last)") + { + constexpr int CGSize = Policy::add_horizontal_layout; + auto filter_a = filter_type{1000}; + auto filter_b = filter_type{1000}; + + filter_a.add(keys.begin(), keys.end()); + + auto ref = filter_b.ref(); + // Single tile processes the entire range cooperatively. + cg_range_add_kernel<<<1, CGSize>>>(ref, keys_raw, keys_raw + num_keys); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + auto const total_words = + static_cast(filter_a.block_extent()) * filter_type::words_per_block; + REQUIRE(thrust::equal( + thrust::device, filter_a.data(), filter_a.data() + total_words, filter_b.data())); + } +} + +TEMPLATE_TEST_CASE_SIG( + "bloom_filter: host bulk contains equals device ref contains", + "", + ((class Key, class Policy), Key, Policy), + (int32_t, cuco::default_filter_policy), + (int32_t, cuco::parametric_filter_policy, uint32_t, 1, 1, 1, 1, 1, 1>), + (uint64_t, + cuco::parametric_filter_policy, uint32_t, 8, 12, 8, 1, 4, 2>), + (float, cuco::parametric_filter_policy, uint64_t, 4, 4, 2, 2, 1, 2>), + (int32_t, cuco::parametric_filter_policy, uint32_t, 8, 8, 2, 2, 1, 8>)) +{ + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, Policy>; + // Primes: force partial boundary tiles on both insert and probe ranges across every + // CGSize > 1 path. + constexpr size_type num_keys = 397; + constexpr size_type num_probe = 797; // mix of inserted (first ~half) and non-inserted (rest) + + auto filter = filter_type{1000}; + + thrust::device_vector insert_keys(num_keys); + thrust::sequence(thrust::device, insert_keys.begin(), insert_keys.end()); + filter.add(insert_keys.begin(), insert_keys.end()); + + thrust::device_vector probe_keys(num_probe); + thrust::sequence(thrust::device, probe_keys.begin(), probe_keys.end()); + auto const probe_raw = thrust::raw_pointer_cast(probe_keys.data()); + + thrust::device_vector bulk_result(num_probe); + filter.contains(probe_keys.begin(), probe_keys.end(), bulk_result.begin()); + + SECTION("scalar ref.contains") + { + thrust::device_vector ref_result(num_probe); + auto ref = filter.ref(); + constexpr int block_size = 128; + int const grid_size = (num_probe + block_size - 1) / block_size; + scalar_contains_kernel<<>>( + ref, probe_raw, num_probe, thrust::raw_pointer_cast(ref_result.data())); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + REQUIRE(cuco::test::equal( + bulk_result.begin(), bulk_result.end(), ref_result.begin(), cuda::std::equal_to{})); + } + + SECTION("CG ref.contains(group, key)") + { + constexpr int CGSize = Policy::contains_horizontal_layout; + thrust::device_vector ref_result(num_probe); + auto ref = filter.ref(); + constexpr int block_size = 128; + int const grid_size = (num_probe * CGSize + block_size - 1) / block_size; + cg_contains_kernel<<>>( + ref, probe_raw, num_probe, thrust::raw_pointer_cast(ref_result.data())); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + REQUIRE(cuco::test::equal( + bulk_result.begin(), bulk_result.end(), ref_result.begin(), cuda::std::equal_to{})); + } + + SECTION("device-range CG ref.contains(group, first, last, out)") + { + constexpr int CGSize = Policy::contains_horizontal_layout; + thrust::device_vector ref_result(num_probe); + auto ref = filter.ref(); + // Single tile processes the entire range cooperatively. + cg_range_contains_kernel<<<1, CGSize>>>( + ref, probe_raw, probe_raw + num_probe, thrust::raw_pointer_cast(ref_result.data())); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + REQUIRE(cuco::test::equal( + bulk_result.begin(), bulk_result.end(), ref_result.begin(), cuda::std::equal_to{})); + } +} diff --git a/tests/bloom_filter/layout_equivalence_test.cu b/tests/bloom_filter/layout_equivalence_test.cu new file mode 100644 index 000000000..49b8774f4 --- /dev/null +++ b/tests/bloom_filter/layout_equivalence_test.cu @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Byte-equal bitsets across (AddH, AddV) layout permutations, identical contains results +// across (ContainsH, ContainsV) layout permutations, and equivalence between dynamic vs +// static `cuco::extent` -- all for fixed (Hash, Word, WordsPerBlock, PatternBits, keys). + +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include + +TEMPLATE_TEST_CASE_SIG( + "bloom_filter: bitset is invariant under (AddH, AddV) layout permutations", + "", + ((class AltPolicy), AltPolicy), + (cuco::parametric_filter_policy, uint32_t, 8, 8, 1, 8, 1, 8>), + (cuco::parametric_filter_policy, uint32_t, 8, 8, 2, 4, 1, 8>), + (cuco::parametric_filter_policy, uint32_t, 8, 8, 4, 2, 1, 8>), + (cuco::parametric_filter_policy, uint32_t, 8, 8, 2, 2, 1, 8>), + (cuco::parametric_filter_policy, uint32_t, 8, 8, 4, 1, 1, 8>)) +{ + using Key = int32_t; + using default_policy = cuco::default_filter_policy; + using filter_default_t = + cuco::bloom_filter, cuda::thread_scope_device, default_policy>; + using filter_alt_t = + cuco::bloom_filter, cuda::thread_scope_device, AltPolicy>; + + constexpr int32_t num_blocks = 1'000; + constexpr int32_t num_keys = 400; + + auto filter_default = filter_default_t{num_blocks}; + auto filter_alt = filter_alt_t{num_blocks}; + + thrust::device_vector keys(num_keys); + thrust::sequence(thrust::device, keys.begin(), keys.end()); + + filter_default.add(keys.begin(), keys.end()); + filter_alt.add(keys.begin(), keys.end()); + + auto const total_words = + static_cast(filter_default.block_extent()) * filter_default_t::words_per_block; + REQUIRE(thrust::equal( + thrust::device, filter_default.data(), filter_default.data() + total_words, filter_alt.data())); +} + +TEMPLATE_TEST_CASE_SIG( + "bloom_filter: contains results are invariant under (ContainsH, ContainsV) permutations", + "", + ((class AltPolicy), AltPolicy), + (cuco::parametric_filter_policy, uint32_t, 8, 8, 8, 1, 8, 1>), + (cuco::parametric_filter_policy, uint32_t, 8, 8, 8, 1, 2, 4>), + (cuco::parametric_filter_policy, uint32_t, 8, 8, 8, 1, 4, 2>), + (cuco::parametric_filter_policy, uint32_t, 8, 8, 8, 1, 2, 2>), + (cuco::parametric_filter_policy, uint32_t, 8, 8, 8, 1, 1, 4>)) +{ + using Key = int32_t; + using default_policy = cuco::default_filter_policy; + using filter_default_t = + cuco::bloom_filter, cuda::thread_scope_device, default_policy>; + using filter_alt_t = + cuco::bloom_filter, cuda::thread_scope_device, AltPolicy>; + + constexpr int32_t num_blocks = 1'000; + constexpr int32_t num_keys = 400; + constexpr int32_t num_probe = 800; // mix of inserted and disjoint + + auto filter_default = filter_default_t{num_blocks}; + auto filter_alt = filter_alt_t{num_blocks}; + + thrust::device_vector insert_keys(num_keys); + thrust::sequence(thrust::device, insert_keys.begin(), insert_keys.end()); + filter_default.add(insert_keys.begin(), insert_keys.end()); + filter_alt.add(insert_keys.begin(), insert_keys.end()); + + thrust::device_vector probe_keys(num_probe); + thrust::sequence(thrust::device, probe_keys.begin(), probe_keys.end()); + + thrust::device_vector result_default(num_probe); + thrust::device_vector result_alt(num_probe); + filter_default.contains(probe_keys.begin(), probe_keys.end(), result_default.begin()); + filter_alt.contains(probe_keys.begin(), probe_keys.end(), result_alt.begin()); + + REQUIRE(thrust::equal( + thrust::device, result_default.begin(), result_default.end(), result_alt.begin())); +} + +TEST_CASE("bloom_filter: bitset is invariant under dynamic vs static cuco::extent", "") +{ + using Key = int32_t; + using Policy = cuco::default_filter_policy; + constexpr std::size_t num_blocks = 1'000; + constexpr int32_t num_keys = 400; + + using dynamic_extent_t = cuco::extent; + using static_extent_t = cuco::extent; + using filter_dynamic_t = + cuco::bloom_filter; + using filter_static_t = + cuco::bloom_filter; + + auto filter_dynamic = filter_dynamic_t{num_blocks}; + auto filter_static = filter_static_t{static_extent_t{}}; + + thrust::device_vector keys(num_keys); + thrust::sequence(thrust::device, keys.begin(), keys.end()); + + filter_dynamic.add(keys.begin(), keys.end()); + filter_static.add(keys.begin(), keys.end()); + + auto const total_words = num_blocks * filter_dynamic_t::words_per_block; + REQUIRE(thrust::equal(thrust::device, + filter_dynamic.data(), + filter_dynamic.data() + total_words, + filter_static.data())); +} diff --git a/tests/bloom_filter/merge_intersect_test.cu b/tests/bloom_filter/merge_intersect_test.cu index 5638d58a8..a83c3bcb2 100644 --- a/tests/bloom_filter/merge_intersect_test.cu +++ b/tests/bloom_filter/merge_intersect_test.cu @@ -162,42 +162,16 @@ TEMPLATE_TEST_CASE_SIG( "bloom_filter merge and intersect tests", "", ((class Key, class Policy), Key, Policy), - (int32_t, cuco::default_filter_policy, uint32_t, 1>), - (int32_t, cuco::default_filter_policy, uint32_t, 8>), - (int64_t, cuco::default_filter_policy, uint64_t, 1>), - (int64_t, cuco::default_filter_policy, uint64_t, 8>)) + (int32_t, cuco::default_filter_policy), + (int32_t, cuco::parametric_filter_policy, uint32_t, 1, 1, 1, 1, 1, 1>), + (int32_t, cuco::parametric_filter_policy, uint32_t, 8, 8, 8, 1, 1, 8>), + (int64_t, cuco::parametric_filter_policy, uint64_t, 1, 1, 1, 1, 1, 1>), + (int64_t, cuco::parametric_filter_policy, uint64_t, 8, 8, 8, 1, 1, 8>)) { using filter_type = cuco::bloom_filter, cuda::thread_scope_device, Policy>; constexpr size_type capacity{1000}; - uint32_t pattern_bits = Policy::words_per_block + GENERATE(0, 1); - - // some parameter combinations might be invalid so we skip them - try { - [[maybe_unused]] auto policy = Policy{pattern_bits}; - } catch (std::exception const& e) { - SKIP(e.what()); - } - - auto filter_a = filter_type{capacity, {}, {pattern_bits}}; - auto filter_b = filter_type{capacity, {}, {pattern_bits}}; - auto filter_c = filter_type{static_cast(capacity) * 2, {}, {pattern_bits}}; - - test_merge_intersect(filter_a, filter_b, filter_c, capacity); -} - -TEMPLATE_TEST_CASE_SIG("bloom_filter merge and intersect arrow tests", - "", - ((class Key, class Policy), Key, Policy), - (int32_t, cuco::arrow_filter_policy), - (int64_t, cuco::arrow_filter_policy), - (float, cuco::arrow_filter_policy)) -{ - using filter_type = - cuco::bloom_filter, cuda::thread_scope_device, Policy>; - constexpr size_type capacity{1000}; // Must match capacity used in helper logic - auto filter_a = filter_type{capacity}; auto filter_b = filter_type{capacity}; auto filter_c = filter_type{static_cast(capacity) * 2}; diff --git a/tests/bloom_filter/unique_sequence_test.cu b/tests/bloom_filter/unique_sequence_test.cu index 4cf24563c..0c212f755 100644 --- a/tests/bloom_filter/unique_sequence_test.cu +++ b/tests/bloom_filter/unique_sequence_test.cu @@ -86,38 +86,15 @@ void test_unique_sequence(Filter& filter, size_type num_keys) } TEMPLATE_TEST_CASE_SIG( - "bloom_filter default policy tests", + "bloom_filter parametric policy tests", "", ((class Key, class Policy), Key, Policy), - (int32_t, cuco::default_filter_policy, uint32_t, 1>), - (int32_t, cuco::default_filter_policy, uint32_t, 8>), - (int32_t, cuco::default_filter_policy, uint64_t, 1>), - (int32_t, cuco::default_filter_policy, uint64_t, 8>)) -{ - using filter_type = - cuco::bloom_filter, cuda::thread_scope_device, Policy>; - constexpr size_type num_keys{400}; - - uint32_t pattern_bits = Policy::words_per_block + GENERATE(0, 1, 2, 3, 4); - - // some parameter combinations might be invalid so we skip them - try { - [[maybe_unused]] auto policy = Policy{pattern_bits}; - } catch (std::exception const& e) { - SKIP(e.what()); - } - - auto filter = filter_type{1000, {}, {pattern_bits}}; - - test_unique_sequence(filter, num_keys); -} - -TEMPLATE_TEST_CASE_SIG("bloom_filter arrow policy tests", - "", - ((class Key, class Policy), Key, Policy), - (int32_t, cuco::arrow_filter_policy), - (uint64_t, cuco::arrow_filter_policy), - (float, cuco::arrow_filter_policy)) + (int32_t, cuco::default_filter_policy), + (int32_t, cuco::parametric_filter_policy, uint32_t, 1, 1, 1, 1, 1, 1>), + (uint64_t, + cuco::parametric_filter_policy, uint32_t, 8, 12, 8, 1, 4, 2>), + (float, cuco::parametric_filter_policy, uint64_t, 4, 4, 2, 2, 1, 2>), + (int32_t, cuco::parametric_filter_policy, uint32_t, 8, 8, 2, 2, 1, 8>)) { using filter_type = cuco::bloom_filter, cuda::thread_scope_device, Policy>; diff --git a/tests/bloom_filter/variable_cg_test.cu b/tests/bloom_filter/variable_cg_test.cu index adb6b703f..6a7a21d51 100644 --- a/tests/bloom_filter/variable_cg_test.cu +++ b/tests/bloom_filter/variable_cg_test.cu @@ -14,100 +14,177 @@ * limitations under the License. */ +// Exercises `cuco::bloom_filter_ref` device-side APIs (`ref.add(key)`, `ref.contains(key)`, +// `ref.contains(group, key)`) directly from custom kernels, varying the cooperative group size. +// The bulk host-side APIs route through the warp-cooperative kernel which already had correct +// CG-reduction semantics; the scalar and CG ref methods are separate code paths that need their +// own coverage. + #include #include -#include -#include +#include #include #include -#include #include +#include + #include -#include #include -#include using size_type = int32_t; -template -void test_variable_cg_size(Filter& filter, size_type num_keys) +namespace cg = cooperative_groups; + +template +__global__ void scalar_add_kernel(Ref ref, Key const* keys, size_type n) { - constexpr int32_t block_size = 128; - constexpr int32_t grid_size = 128; + auto const i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i < n) { ref.add(keys[i]); } +} - using Key = typename Filter::key_type; +template +__global__ void scalar_contains_kernel(Ref ref, Key const* keys, size_type n, bool* out) +{ + auto const i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (i < n) { out[i] = ref.contains(keys[i]); } +} - auto ref = filter.ref(); +template +__global__ void cg_contains_consistency_kernel(Ref ref, + Key const* keys, + size_type n, + int* mismatches) +{ + auto const block = cg::this_thread_block(); + auto const tile = cg::tiled_partition(block); + auto const idx = static_cast((blockIdx.x * blockDim.x + threadIdx.x) / CGSize); + if (idx >= n) { return; } + bool const got = ref.contains(tile, keys[idx]); + bool const all_agree = tile.all(got); + bool const any_agree = tile.any(got); + if (tile.thread_rank() == 0 && all_agree != any_agree) { atomicAdd(mismatches, 1); } +} + +template +__global__ void cooperative_clear_kernel(Ref ref) +{ + auto const block = cg::this_thread_block(); + auto const tile = cg::tiled_partition(block); + ref.clear(tile); +} + +TEMPLATE_TEST_CASE_SIG( + "bloom_filter device ref scalar add and contains", + "", + ((class Key, class Policy), Key, Policy), + (int32_t, cuco::default_filter_policy), + (int32_t, cuco::parametric_filter_policy, uint32_t, 1, 1, 1, 1, 1, 1>), + (int32_t, cuco::parametric_filter_policy, uint32_t, 8, 8, 4, 2, 4, 2>)) +{ + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, Policy>; + constexpr size_type num_keys{400}; + + auto filter = filter_type{1000}; - // Generate keys thrust::device_vector keys(num_keys); thrust::sequence(thrust::device, keys.begin(), keys.end()); - thrust::device_vector contained(num_keys, false); - auto const always_true = cuda::constant_iterator{true}; - - SECTION("Check if fallback kernels work for varying combinations of CG sizes.") - { - cuco::detail::bloom_filter_ns::add_if_n - <<>>(keys.begin(), num_keys, always_true, cuda::std::identity{}, ref); - cuco::detail::bloom_filter_ns::contains_if_n - <<>>( - keys.begin(), num_keys, always_true, cuda::std::identity{}, contained.begin(), ref); - REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{})); - } - - filter.clear(); - thrust::fill(contained.begin(), contained.end(), false); // reset output vector - - SECTION("Check if adaptive add kernel works with fallback contains kernel.") - { - cuco::detail::bloom_filter_ns::add - <<>>(keys.begin(), num_keys, ref); - cuco::detail::bloom_filter_ns::contains_if_n - <<>>( - keys.begin(), num_keys, always_true, cuda::std::identity{}, contained.begin(), ref); - REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{})); - } - - // TODO adaptive vs. adaptive and fallback add vs. adaptive contains (requires #673) + auto ref = filter.ref(); + + constexpr int block_size = 128; + int const grid_size = (num_keys + block_size - 1) / block_size; + + scalar_add_kernel<<>>( + ref, thrust::raw_pointer_cast(keys.data()), num_keys); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + scalar_contains_kernel<<>>(ref, + thrust::raw_pointer_cast(keys.data()), + num_keys, + thrust::raw_pointer_cast(contained.data())); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{})); } TEMPLATE_TEST_CASE_SIG( - "bloom_filter variable CG size tests", + "bloom_filter device ref CG contains is reduced across the group", "", - ((int32_t AddCGSize, int32_t ContainsCGSize, class Key, class Policy), - AddCGSize, - ContainsCGSize, - Key, - Policy), - (1, 4, int32_t, cuco::default_filter_policy, uint32_t, 1>), - (1, 4, int32_t, cuco::default_filter_policy, uint32_t, 8>), - (1, 4, int32_t, cuco::default_filter_policy, uint64_t, 1>), - (1, 4, int32_t, cuco::default_filter_policy, uint64_t, 8>), - (4, 1, int32_t, cuco::default_filter_policy, uint32_t, 1>), - (4, 1, int32_t, cuco::default_filter_policy, uint32_t, 8>), - (4, 1, int32_t, cuco::default_filter_policy, uint64_t, 1>), - (4, 1, int32_t, cuco::default_filter_policy, uint64_t, 8>)) + ((int32_t CGSize, class Key, class Policy), CGSize, Key, Policy), + (4, + int32_t, + cuco::parametric_filter_policy, uint32_t, 8, 8, 4, 2, 4, 2>), + (8, + int32_t, + cuco::parametric_filter_policy, uint32_t, 8, 8, 8, 1, 8, 1>)) +{ + using filter_type = + cuco::bloom_filter, cuda::thread_scope_device, Policy>; + // Small filter + many probes drive non-inserted keys into the partial-match regime where + // different lanes in the tile see different per-slice match results. Without a group reduction + // in `ref.contains(group, key)`, each lane returns its slice's partial result and the tile + // disagrees on the answer. + constexpr size_type num_inserted{200}; + constexpr size_type num_probed{2000}; + constexpr size_type num_blocks{16}; + + auto filter = filter_type{num_blocks}; + auto ref = filter.ref(); + + thrust::device_vector insert_keys(num_inserted); + thrust::sequence(thrust::device, insert_keys.begin(), insert_keys.end()); + filter.add(insert_keys.begin(), insert_keys.end()); + + thrust::device_vector probe_keys(num_probed); + thrust::sequence(thrust::device, probe_keys.begin(), probe_keys.end()); + + thrust::device_vector mismatches(1, 0); + + constexpr int block_size = 128; + int const grid_size = (num_probed * CGSize + block_size - 1) / block_size; + + cg_contains_consistency_kernel + <<>>(ref, + thrust::raw_pointer_cast(probe_keys.data()), + num_probed, + thrust::raw_pointer_cast(mismatches.data())); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + REQUIRE(static_cast(mismatches[0]) == 0); +} + +TEMPLATE_TEST_CASE_SIG("bloom_filter device ref cooperative clear", + "", + ((int32_t CGSize, class Key, class Policy), CGSize, Key, Policy), + (1, int32_t, cuco::default_filter_policy), + (4, int32_t, cuco::default_filter_policy), + (8, int32_t, cuco::default_filter_policy), + (32, int32_t, cuco::default_filter_policy)) { using filter_type = cuco::bloom_filter, cuda::thread_scope_device, Policy>; constexpr size_type num_keys{400}; - uint32_t pattern_bits = Policy::words_per_block + GENERATE(0, 1, 2, 3, 4); + auto filter = filter_type{1000}; - // some parameter combinations might be invalid so we skip them - try { - [[maybe_unused]] auto policy = Policy{pattern_bits}; - } catch (std::exception const& e) { - SKIP(e.what()); - } + thrust::device_vector keys(num_keys); + thrust::sequence(thrust::device, keys.begin(), keys.end()); + filter.add(keys.begin(), keys.end()); + + thrust::device_vector contained(num_keys, false); + filter.contains(keys.begin(), keys.end(), contained.begin()); + REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{})); - auto filter = filter_type{1000, {}, {pattern_bits}}; + // Device cooperative clear via a single tile that iterates over all filter words. + cooperative_clear_kernel<<<1, CGSize>>>(filter.ref()); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); - test_variable_cg_size(filter, num_keys); + filter.contains(keys.begin(), keys.end(), contained.begin()); + REQUIRE(cuco::test::none_of(contained.begin(), contained.end(), cuda::std::identity{})); }