diff --git a/benchmarks/benchmark_utils.hpp b/benchmarks/benchmark_utils.hpp
index e0acc0bb9..2f02ec3b0 100644
--- a/benchmarks/benchmark_utils.hpp
+++ b/benchmarks/benchmark_utils.hpp
@@ -18,6 +18,7 @@
 
 #include <cuco/detail/__config>
 #include <cuco/detail/error.hpp>
+#include <cuco/hash_functions.cuh>
 #include <cuco/utility/key_generator.cuh>
 
 #include <nvbench/nvbench.cuh>
@@ -65,7 +66,9 @@ struct lazy_discard {
   __device__ void device_dispatch(index_type index, value_type const& value) const
   {
     // pick some predicate that is always false, but depends on the runtime value
-    if (threadIdx.x > 2025 + *reinterpret_cast<char const*>(&value)) { *(it + index) = value; }
+    if (threadIdx.x > 2025 + *reinterpret_cast<char const*>(&value) + static_cast<int>(index)) {
+      *(it + index) = value;
+    }
   }
   __host__ __device__ void operator()(index_type index, value_type const& value) const
   {
@@ -94,6 +97,21 @@ NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::gaussian,
                              "GAUSSIAN",
                              "distribution::gaussian");
 
+NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::XXHash_64<char>, "xxhash_64", "cuco::xxhash_64");
+NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::XXHash_32<char>, "xxhash_32", "cuco::xxhash_32");
+NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::MurmurHash3_32<char>,
+                             "murmurhash3_32",
+                             "cuco::murmurhash3_32");
+NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::MurmurHash3_x86_128<char>,
+                             "murmurhash3_x86_128",
+                             "cuco::murmurhash3_x86_128");
+NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::MurmurHash3_x64_128<char>,
+                             "murmurhash3_x64_128",
+                             "cuco::murmurhash3_x64_128");
+NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::identity_hash<char>,
+                             "identity_hash",
+                             "cuco::identity_hash");
+
 #if defined(CUCO_HAS_128BIT_ATOMICS)
 NVBENCH_DECLARE_TYPE_STRINGS(__int128_t, "I128", "__int128_t");
 #endif
diff --git a/benchmarks/bloom_filter/add_bench.cu b/benchmarks/bloom_filter/add_bench.cu
index b07c285d4..28e1c4182 100644
--- a/benchmarks/bloom_filter/add_bench.cu
+++ b/benchmarks/bloom_filter/add_bench.cu
@@ -15,7 +15,6 @@
  */
 
 #include "defaults.hpp"
-#include "utils.hpp"
 
 #include <benchmark_defaults.hpp>
 #include <benchmark_utils.hpp>
@@ -24,143 +23,121 @@
 
 #include <nvbench/nvbench.cuh>
 
-#include <cuda/iterator>
 #include <cuda/std/limits>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/sequence.h>
 
 #include <cstdint>
 #include <exception>
 #include <limits>
 
-using namespace cuco::benchmark;  // defaults, dist_from_state, rebind_hasher_t, add_fpr_summary
+using namespace cuco::benchmark;  // defaults, dist_from_state, rebind_hasher_t
 using namespace cuco::utility;    // key_generator, distribution
 
 /**
  * @brief A benchmark evaluating `cuco::bloom_filter::add_async` performance
  */
-template <typename Key, typename Hash, typename Word, nvbench::int32_t WordsPerBlock, typename Dist>
+template <typename Key,
+          typename Word,
+          nvbench::int32_t BlockBits,
+          nvbench::int32_t PatternBits,
+          nvbench::int32_t HorizontalLayout,
+          nvbench::int32_t VerticalLayout>
 void bloom_filter_add(nvbench::state& state,
-                      nvbench::type_list<Key, Hash, Word, nvbench::enum_type<WordsPerBlock>, Dist>)
+                      nvbench::type_list<Key,
+                                         Word,
+                                         nvbench::enum_type<BlockBits>,
+                                         nvbench::enum_type<PatternBits>,
+                                         nvbench::enum_type<HorizontalLayout>,
+                                         nvbench::enum_type<VerticalLayout>>)
 {
-  using size_type   = std::uint32_t;
-  using policy_type = cuco::default_filter_policy<rebind_hasher_t<Hash, Key>,
-                                                  Word,
-                                                  static_cast<std::uint32_t>(WordsPerBlock)>;
-  using filter_type =
-    cuco::bloom_filter<Key, cuco::extent<size_type>, cuda::thread_scope_device, policy_type>;
-
-  constexpr auto filter_block_size =
-    sizeof(typename filter_type::word_type) * filter_type::words_per_block;
-
-  auto const num_keys       = state.get_int64("NumInputs");
-  auto const filter_size_mb = state.get_int64("FilterSizeMB");
-  auto const pattern_bits   = WordsPerBlock;
-
-  try {
-    [[maybe_unused]] auto const policy = policy_type{static_cast<uint32_t>(pattern_bits)};
-  } catch (std::exception const& e) {
-    state.skip(e.what());  // skip invalid configurations
+  auto constexpr words_per_block       = BlockBits / cuda::std::numeric_limits<Word>::digits;
+  auto constexpr pattern_bits_per_word = PatternBits / words_per_block;
+
+  // Check for a valid configuration
+  if constexpr ((not cuda::std::has_single_bit(static_cast<uint32_t>(BlockBits))) or
+                (words_per_block == 0)) {
+    state.skip("Invalid filter block size");
+  } else if constexpr (HorizontalLayout * VerticalLayout != words_per_block) {
+    state.skip("Invalid vectorization layout");
+  } else if constexpr ((pattern_bits_per_word <= 0) or
+                       (pattern_bits_per_word > cuda::std::numeric_limits<Word>::digits) or
+                       (pattern_bits_per_word * words_per_block > 64)) {
+    state.skip("Invalid pattern bits per word");
+  } else {
+    using size_type                           = std::uint32_t;
+    using hasher                              = cuco::xxhash_64<Key>;
+    auto constexpr contains_vertical_layout   = words_per_block;
+    auto constexpr contains_horizontal_layout = 1;
+    using policy_type                         = cuco::parametric_filter_policy<hasher,
+                                                                               Word,
+                                                                               words_per_block,
+                                                                               PatternBits,
+                                                                               HorizontalLayout,
+                                                                               VerticalLayout,
+                                                                               contains_horizontal_layout,
+                                                                               contains_vertical_layout>;
+    using filter_type =
+      cuco::bloom_filter<Key, cuco::extent<size_type>, cuda::thread_scope_device, policy_type>;
+
+    auto const num_keys       = state.get_int64("NumInputs");
+    auto const filter_size_mb = state.get_int64("FilterSizeMB");
+
+    std::size_t const num_sub_filters =
+      (filter_size_mb * 1024 * 1024) /
+      (sizeof(typename filter_type::word_type) * filter_type::words_per_block);
+
+    if (num_sub_filters > policy_type::max_filter_blocks) {
+      // skip invalid configurations
+      state.skip("num_sub_filters exceeds max_filter_blocks");
+    }
+
+    state.add_element_count(num_keys);
+
+    filter_type filter{static_cast<size_type>(num_sub_filters)};
+
+    thrust::device_vector<Key> keys(num_keys);
+    thrust::sequence(thrust::device, keys.begin(), keys.end(), 0);
+
+    state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
+      timer.start();
+      filter.add_async(keys.begin(), keys.end(), {launch.get_stream()});
+      timer.stop();
+      filter.clear_async({launch.get_stream()});
+    });
   }
-
-  std::size_t const num_sub_filters = (filter_size_mb * 1024 * 1024) / filter_block_size;
-
-  if (num_sub_filters > std::numeric_limits<size_type>::max()) {
-    state.skip("num_sub_filters too large for size_type");  // skip invalid configurations
-  }
-
-  cuda::counting_iterator<Key> keys(0);
-
-  state.add_element_count(num_keys);
-
-  filter_type filter{
-    static_cast<size_type>(num_sub_filters), {}, {static_cast<std::uint32_t>(pattern_bits)}};
-
-  state.collect_dram_throughput();
-  state.collect_l2_hit_rates();
-
-  add_fpr_summary(state, filter);
-
-  state.exec([&](nvbench::launch& launch) {
-    filter.add_async(keys, keys + num_keys, {launch.get_stream()});
-  });
-}
-
-/**
- * @brief A benchmark evaluating `cuco::bloom_filter::add_async` performance with
- * `arrow_filter_policy`
- */
-template <typename Key, typename Dist>
-void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Dist>)
-{
-  using size_type   = std::uint32_t;
-  using policy_type = cuco::arrow_filter_policy<Key>;
-  using filter_type =
-    cuco::bloom_filter<Key, cuco::extent<size_type>, cuda::thread_scope_device, policy_type>;
-
-  auto const num_keys       = state.get_int64("NumInputs");
-  auto const filter_size_mb = state.get_int64("FilterSizeMB");
-
-  std::size_t const num_sub_filters =
-    (filter_size_mb * 1024 * 1024) /
-    (sizeof(typename filter_type::word_type) * filter_type::words_per_block);
-
-  if (num_sub_filters > policy_type::max_filter_blocks) {
-    state.skip("bloom filter with arrow policy should have <= 4194304 blocks");  // skip invalid
-                                                                                 // configurations
-  }
-
-  cuda::counting_iterator<Key> keys(0);
-
-  state.add_element_count(num_keys);
-
-  filter_type filter{static_cast<size_type>(num_sub_filters)};
-
-  state.collect_dram_throughput();
-  state.collect_l2_hit_rates();
-
-  add_fpr_summary(state, filter);
-
-  state.exec([&](nvbench::launch& launch) {
-    filter.add_async(keys, keys + num_keys, {launch.get_stream()});
-  });
 }
 
+// Default benchmark: single layout matching `cuco::default_filter_policy` (256-bit block,
+// 8-bit fingerprint, fully horizontal add) swept across the standard FilterSizeMB range.
 NVBENCH_BENCH_TYPES(bloom_filter_add,
                     NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
-                                      nvbench::type_list<defaults::BF_HASH>,
-                                      nvbench::type_list<defaults::BF_WORD>,
-                                      nvbench::enum_type_list<defaults::BF_WORDS_PER_BLOCK>,
-                                      nvbench::type_list<distribution::unique>))
+                                      nvbench::type_list<nvbench::uint32_t>,  ///< Word
+                                      nvbench::enum_type_list<256>,           ///< BlockBits
+                                      nvbench::enum_type_list<8>,             ///< PatternBits
+                                      nvbench::enum_type_list<8>,             ///< HorizontalLayout
+                                      nvbench::enum_type_list<1>              ///< VerticalLayout
+                                      ))
   .set_name("bloom_filter_add_unique_size")
-  .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"})
+  .set_type_axes_names(
+    {"Key", "Word", "BlockBits", "PatternBits", "HorizontalLayout", "VerticalLayout"})
   .add_int64_axis("NumInputs", {defaults::BF_N})
   .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE);
 
-NVBENCH_BENCH_TYPES(bloom_filter_add,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
-                                      defaults::HASH_RANGE,
-                                      nvbench::type_list<defaults::BF_WORD>,
-                                      nvbench::enum_type_list<defaults::BF_WORDS_PER_BLOCK>,
-                                      nvbench::type_list<distribution::unique>))
-  .set_name("bloom_filter_add_unique_hash")
-  .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"})
-  .add_int64_axis("NumInputs", {defaults::BF_N})
-  .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB});
-
-NVBENCH_BENCH_TYPES(bloom_filter_add,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
-                                      nvbench::type_list<defaults::BF_HASH>,
-                                      nvbench::type_list<nvbench::uint32_t, nvbench::uint64_t>,
-                                      nvbench::enum_type_list<1, 2, 4, 8>,
-                                      nvbench::type_list<distribution::unique>))
-  .set_name("bloom_filter_add_unique_block_dim")
-  .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"})
-  .add_int64_axis("NumInputs", {defaults::BF_N})
-  .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB});
-
-NVBENCH_BENCH_TYPES(arrow_bloom_filter_add,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
-                                      nvbench::type_list<distribution::unique>))
-  .set_name("arrow_bloom_filter_add_unique_size")
-  .set_type_axes_names({"Key", "Distribution"})
-  .add_int64_axis("NumInputs", {defaults::BF_N})
-  .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE);
+// Exhaustive sweep across block sizes and vectorization layouts. Uncomment for performance
+// tuning / paper-style characterization; not run by default because the matrix is large.
+// NVBENCH_BENCH_TYPES(
+//   bloom_filter_add,
+//   NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
+//                     nvbench::type_list<nvbench::uint64_t, nvbench::uint32_t>, ///< Word
+//                     nvbench::enum_type_list<64, 128, 256, 512, 1024>,         ///< BlockBits
+//                     nvbench::enum_type_list<8, 16>,                           ///< PatternBits
+//                     nvbench::enum_type_list<1, 2, 4, 8, 16>,                  ///<
+//                     HorizontalLayout nvbench::enum_type_list<1, 2, 4, 8, 16> ///< VerticalLayout
+//                     ))
+//   .set_name("bloom_filter_add_full_sweep_u64")
+//   .set_type_axes_names(
+//     {"Key", "Word", "BlockBits", "PatternBits", "HorizontalLayout", "VerticalLayout"})
+//   .add_int64_axis("NumInputs", {defaults::BF_N})
+//   .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE);
\ No newline at end of file
diff --git a/benchmarks/bloom_filter/contains_bench.cu b/benchmarks/bloom_filter/contains_bench.cu
index 7a4549524..ba9c59957 100644
--- a/benchmarks/bloom_filter/contains_bench.cu
+++ b/benchmarks/bloom_filter/contains_bench.cu
@@ -15,7 +15,6 @@
  */
 
 #include "defaults.hpp"
-#include "utils.hpp"
 
 #include <benchmark_defaults.hpp>
 #include <benchmark_utils.hpp>
@@ -24,157 +23,143 @@
 
 #include <nvbench/nvbench.cuh>
 
-#include <cuda/iterator>
 #include <cuda/std/limits>
+#include <thrust/count.h>
 #include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/sequence.h>
 
 #include <exception>
 #include <limits>
 
-using namespace cuco::benchmark;  // defaults, dist_from_state, rebind_hasher_t, add_fpr_summary
+using namespace cuco::benchmark;  // defaults, dist_from_state, rebind_hasher_t
 using namespace cuco::utility;    // key_generator, distribution
 
 /**
  * @brief A benchmark evaluating `cuco::bloom_filter::contains_async` performance
  */
-template <typename Key, typename Hash, typename Word, nvbench::int32_t WordsPerBlock, typename Dist>
-void bloom_filter_contains(
-  nvbench::state& state,
-  nvbench::type_list<Key, Hash, Word, nvbench::enum_type<WordsPerBlock>, Dist>)
+template <typename Key,
+          typename Word,
+          nvbench::int32_t BlockBits,
+          nvbench::int32_t PatternBits,
+          nvbench::int32_t HorizontalLayout,
+          nvbench::int32_t VerticalLayout>
+void bloom_filter_contains(nvbench::state& state,
+                           nvbench::type_list<Key,
+                                              Word,
+                                              nvbench::enum_type<BlockBits>,
+                                              nvbench::enum_type<PatternBits>,
+                                              nvbench::enum_type<HorizontalLayout>,
+                                              nvbench::enum_type<VerticalLayout>>)
 {
-  using size_type   = std::uint32_t;
-  using policy_type = cuco::default_filter_policy<rebind_hasher_t<Hash, Key>,
-                                                  Word,
-                                                  static_cast<std::uint32_t>(WordsPerBlock)>;
-  using filter_type =
-    cuco::bloom_filter<Key, cuco::extent<size_type>, cuda::thread_scope_device, policy_type>;
-
-  constexpr auto filter_block_size =
-    sizeof(typename filter_type::word_type) * filter_type::words_per_block;
-
-  // if (filter_block_size <= 32) {
-  //   cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); // slightly improves peformance if
-  //   filter block fits into a 32B sector
-  // }
-
-  auto const num_keys       = state.get_int64("NumInputs");
-  auto const filter_size_mb = state.get_int64("FilterSizeMB");
-  auto const pattern_bits   = WordsPerBlock;
-
-  try {
-    [[maybe_unused]] auto const policy = policy_type{static_cast<uint32_t>(pattern_bits)};
-  } catch (std::exception const& e) {
-    state.skip(e.what());  // skip invalid configurations
+  auto constexpr words_per_block       = BlockBits / cuda::std::numeric_limits<Word>::digits;
+  auto constexpr pattern_bits_per_word = PatternBits / words_per_block;
+
+  // Check for a valid configuration
+  if constexpr ((not cuda::std::has_single_bit(static_cast<uint32_t>(BlockBits))) or
+                (words_per_block == 0)) {
+    state.skip("Invalid filter block size");
+  } else if constexpr (HorizontalLayout * VerticalLayout > words_per_block) {
+    state.skip("Invalid vectorization layout");  // TODO check if this is correct
+  } else if constexpr ((pattern_bits_per_word <= 0) or
+                       (pattern_bits_per_word > cuda::std::numeric_limits<Word>::digits) or
+                       (pattern_bits_per_word * words_per_block > 64)) {
+    state.skip("Invalid pattern bits per word");
+  } else {
+    using size_type                      = std::uint32_t;
+    using hasher                         = cuco::xxhash_64<Key>;
+    auto constexpr add_vertical_layout   = 1;
+    auto constexpr add_horizontal_layout = words_per_block;
+    using policy_type                    = cuco::parametric_filter_policy<hasher,
+                                                                          Word,
+                                                                          words_per_block,
+                                                                          PatternBits,
+                                                                          add_horizontal_layout,
+                                                                          add_vertical_layout,
+                                                                          HorizontalLayout,
+                                                                          VerticalLayout>;
+    using filter_type =
+      cuco::bloom_filter<Key, cuco::extent<size_type>, cuda::thread_scope_device, policy_type>;
+
+    auto const num_keys       = state.get_int64("NumInputs");
+    auto const filter_size_mb = state.get_int64("FilterSizeMB");
+
+    std::size_t const num_sub_filters =
+      (filter_size_mb * 1024 * 1024) /
+      (sizeof(typename filter_type::word_type) * filter_type::words_per_block);
+
+    if (num_sub_filters > policy_type::max_filter_blocks) {
+      // skip invalid configurations
+      state.skip("num_sub_filters exceeds max_filter_blocks");
+    }
+
+    state.add_element_count(num_keys);
+
+    filter_type filter{static_cast<size_type>(num_sub_filters)};
+
+    thrust::counting_iterator<Key> key_it(0);
+
+    // insert FPR-optimal number of keys
+    auto const num_build_keys = (filter_size_mb * 1024 * 1024 * 8) / (2 * PatternBits);
+    filter.add(key_it, key_it + num_build_keys);
+
+    // FPR summary
+    thrust::device_vector<bool> result(num_keys, false);
+    filter.contains(key_it + num_build_keys, key_it + num_build_keys + num_keys, result.begin());
+
+    double const fp = thrust::count(thrust::device, result.begin(), result.end(), true);
+
+    auto& summ_fpr = state.add_summary("FalsePositiveRate");
+    summ_fpr.set_string("hint", "FPR");
+    summ_fpr.set_string("short_name", "FPR");
+    summ_fpr.set_string("description", "False-positive rate of the bloom filter.");
+    summ_fpr.set_float64("value", fp / static_cast<double>(num_keys));
+
+    state.collect_dram_throughput();
+    state.collect_l2_hit_rates();
+
+    thrust::device_vector<Key> keys(num_keys);
+    thrust::sequence(thrust::device, keys.begin(), keys.end(), 0);
+
+    state.add_global_memory_reads<char>(num_keys *
+                                        ((words_per_block * sizeof(Word)) + sizeof(Key)));
+    state.add_global_memory_writes<char>(num_keys * sizeof(bool));
+
+    state.exec([&](nvbench::launch& launch) {
+      filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()});
+    });
   }
-
-  std::size_t const num_sub_filters = (filter_size_mb * 1024 * 1024) / filter_block_size;
-
-  if (num_sub_filters > std::numeric_limits<size_type>::max()) {
-    state.skip("num_sub_filters too large for size_type");  // skip invalid configurations
-  }
-
-  cuda::counting_iterator<Key> keys(0);
-  thrust::device_vector<bool> result(num_keys, false);
-
-  state.add_element_count(num_keys);
-
-  filter_type filter{
-    static_cast<size_type>(num_sub_filters), {}, {static_cast<uint32_t>(pattern_bits)}};
-
-  state.collect_dram_throughput();
-  state.collect_l2_hit_rates();
-
-  add_fpr_summary(state, filter);
-
-  filter.add(keys, keys + num_keys);
-
-  state.exec([&](nvbench::launch& launch) {
-    filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()});
-  });
-}
-
-/**
- * @brief A benchmark evaluating `cuco::bloom_filter::contains_async` performance with
- * `arrow_filter_policy`
- */
-template <typename Key, typename Dist>
-void arrow_bloom_filter_contains(nvbench::state& state, nvbench::type_list<Key, Dist>)
-{
-  // cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); // slightly improves peformance if
-  // filter block fits into a 32B sector
-  using size_type   = std::uint32_t;
-  using policy_type = cuco::arrow_filter_policy<Key>;
-  using filter_type =
-    cuco::bloom_filter<Key, cuco::extent<size_type>, cuda::thread_scope_device, policy_type>;
-
-  auto const num_keys       = state.get_int64("NumInputs");
-  auto const filter_size_mb = state.get_int64("FilterSizeMB");
-
-  std::size_t const num_sub_filters =
-    (filter_size_mb * 1024 * 1024) /
-    (sizeof(typename filter_type::word_type) * filter_type::words_per_block);
-
-  if (num_sub_filters > policy_type::max_filter_blocks) {
-    state.skip("bloom filter with arrow policy should have <= 4194304 blocks");  // skip invalid
-                                                                                 // configurations
-  }
-
-  cuda::counting_iterator<Key> keys(0);
-  thrust::device_vector<bool> result(num_keys, false);
-
-  state.add_element_count(num_keys);
-
-  filter_type filter{static_cast<size_type>(num_sub_filters)};
-
-  state.collect_dram_throughput();
-  state.collect_l2_hit_rates();
-
-  add_fpr_summary(state, filter);
-
-  filter.add(keys, keys + num_keys);
-
-  state.exec([&](nvbench::launch& launch) {
-    filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()});
-  });
 }
 
+// Default benchmark: single layout matching `cuco::default_filter_policy` (256-bit block,
+// 8-bit fingerprint, fully vertical contains) swept across the standard FilterSizeMB range.
 NVBENCH_BENCH_TYPES(bloom_filter_contains,
                     NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
-                                      nvbench::type_list<defaults::BF_HASH>,
-                                      nvbench::type_list<defaults::BF_WORD>,
-                                      nvbench::enum_type_list<defaults::BF_WORDS_PER_BLOCK>,
-                                      nvbench::type_list<distribution::unique>))
+                                      nvbench::type_list<nvbench::uint32_t>,  ///< Word
+                                      nvbench::enum_type_list<256>,           ///< BlockBits
+                                      nvbench::enum_type_list<8>,             ///< PatternBits
+                                      nvbench::enum_type_list<1>,             ///< HorizontalLayout
+                                      nvbench::enum_type_list<8>              ///< VerticalLayout
+                                      ))
   .set_name("bloom_filter_contains_unique_size")
-  .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"})
+  .set_type_axes_names(
+    {"Key", "Word", "BlockBits", "PatternBits", "HorizontalLayout", "VerticalLayout"})
   .add_int64_axis("NumInputs", {defaults::BF_N})
   .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE);
 
-NVBENCH_BENCH_TYPES(bloom_filter_contains,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
-                                      defaults::HASH_RANGE,
-                                      nvbench::type_list<defaults::BF_WORD>,
-                                      nvbench::enum_type_list<defaults::BF_WORDS_PER_BLOCK>,
-                                      nvbench::type_list<distribution::unique>))
-  .set_name("bloom_filter_contains_unique_hash")
-  .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"})
-  .add_int64_axis("NumInputs", {defaults::BF_N})
-  .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB});
-
-NVBENCH_BENCH_TYPES(bloom_filter_contains,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
-                                      nvbench::type_list<defaults::BF_HASH>,
-                                      nvbench::type_list<nvbench::uint32_t, nvbench::uint64_t>,
-                                      nvbench::enum_type_list<1, 2, 4, 8>,
-                                      nvbench::type_list<distribution::unique>))
-  .set_name("bloom_filter_contains_unique_block_dim")
-  .set_type_axes_names({"Key", "Hash", "Word", "WordsPerBlock", "Distribution"})
-  .add_int64_axis("NumInputs", {defaults::BF_N})
-  .add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB});
-
-NVBENCH_BENCH_TYPES(arrow_bloom_filter_contains,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
-                                      nvbench::type_list<distribution::unique>))
-  .set_name("arrow_bloom_filter_contains_unique_size")
-  .set_type_axes_names({"Key", "Distribution"})
-  .add_int64_axis("NumInputs", {defaults::BF_N})
-  .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE);
+// Exhaustive sweep across block sizes and vectorization layouts. Uncomment for performance
+// tuning / paper-style characterization; not run by default because the matrix is large.
+// NVBENCH_BENCH_TYPES(
+//   bloom_filter_contains,
+//   NVBENCH_TYPE_AXES(nvbench::type_list<defaults::BF_KEY>,
+//                     nvbench::type_list<nvbench::uint64_t, nvbench::uint32_t>, ///< Word
+//                     nvbench::enum_type_list<64, 128, 256, 512, 1024>,         ///< BlockBits
+//                     nvbench::enum_type_list<8, 16>,                           ///< PatternBits
+//                     nvbench::enum_type_list<1, 2, 4, 8, 16>,                  ///<
+//                     HorizontalLayout nvbench::enum_type_list<1, 2, 4, 8, 16> ///< VerticalLayout
+//                     ))
+//   .set_name("bloom_filter_contains_full_sweep_u64")
+//   .set_type_axes_names(
+//     {"Key", "Word", "BlockBits", "PatternBits", "HorizontalLayout", "VerticalLayout"})
+//   .add_int64_axis("NumInputs", {defaults::BF_N})
+//   .add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE);
\ No newline at end of file
diff --git a/benchmarks/bloom_filter/defaults.hpp b/benchmarks/bloom_filter/defaults.hpp
index f1b192aa3..75d0c6ac6 100644
--- a/benchmarks/bloom_filter/defaults.hpp
+++ b/benchmarks/bloom_filter/defaults.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,26 +16,17 @@
 
 #pragma once
 
-#include <cuco/hash_functions.cuh>
-
 #include <nvbench/nvbench.cuh>
 
-#include <cuda/std/array>
-
 #include <vector>
 
 namespace cuco::benchmark::defaults {
 
-using BF_KEY  = nvbench::int64_t;
-using BF_HASH = cuco::xxhash_64<char>;
-using BF_WORD = nvbench::uint32_t;
+using BF_KEY = nvbench::int64_t;
 
-static constexpr auto BF_N               = 1'000'000'000;
-static constexpr auto BF_SIZE_MB         = 2'000;
-static constexpr auto BF_WORDS_PER_BLOCK = 8;
+static constexpr auto BF_N = 1'000'000'000;
 
 auto const BF_SIZE_MB_RANGE_CACHE =
   std::vector<nvbench::int64_t>{1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048};
-auto const BF_PATTERN_BITS_RANGE = std::vector<nvbench::int64_t>{1, 2, 4, 6, 8, 16};
 
 }  // namespace cuco::benchmark::defaults
diff --git a/benchmarks/bloom_filter/utils.hpp b/benchmarks/bloom_filter/utils.hpp
index cec7e06c3..9273d8fcc 100644
--- a/benchmarks/bloom_filter/utils.hpp
+++ b/benchmarks/bloom_filter/utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,21 +27,6 @@
 
 #include <cstdint>
 
-NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::XXHash_64<char>, "xxhash_64", "cuco::xxhash_64");
-NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::XXHash_32<char>, "xxhash_32", "cuco::xxhash_32");
-NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::MurmurHash3_32<char>,
-                             "murmurhash3_32",
-                             "cuco::murmurhash3_32");
-NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::MurmurHash3_x86_128<char>,
-                             "murmurhash3_x86_128",
-                             "cuco::murmurhash3_x86_128");
-NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::MurmurHash3_x64_128<char>,
-                             "murmurhash3_x64_128",
-                             "cuco::murmurhash3_x64_128");
-NVBENCH_DECLARE_TYPE_STRINGS(cuco::detail::identity_hash<char>,
-                             "identity_hash",
-                             "cuco::identity_hash");
-
 namespace cuco::benchmark {
 
 template <typename FilterType>
diff --git a/include/cuco/bloom_filter.cuh b/include/cuco/bloom_filter.cuh
index 3f2e69c7c..c3344917d 100644
--- a/include/cuco/bloom_filter.cuh
+++ b/include/cuco/bloom_filter.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,13 +36,13 @@
 namespace cuco {
 
 /**
- * @brief A GPU-accelerated Blocked Bloom Filter.
+ * @brief A GPU-accelerated Bloom filter.
  *
  * The `bloom_filter` supports two types of operations:
  * - Host-side "bulk" operations
  * - Device-side "singular" operations
  *
- * The host-side bulk operations include add(), contains(), etc. These APIs should be used when
+ * The host-side bulk operations include `add()`, `contains()`, etc. These APIs should be used when
  * there are a large number of keys to add or lookup. For example, given a range of keys
  * specified by device-accessible iterators, the bulk `add` function will add all keys into
  * the filter.
@@ -61,8 +61,8 @@ namespace cuco {
 template <class Key,
           class Extent             = cuco::extent<std::size_t>,
           cuda::thread_scope Scope = cuda::thread_scope_device,
-          class Policy    = cuco::default_filter_policy<cuco::xxhash_64<Key>, std::uint32_t, 8>,
-          class Allocator = cuco::cuda_allocator<cuda::std::byte>>
+          class Policy             = cuco::default_filter_policy<Key>,
+          class Allocator          = cuco::cuda_allocator<cuda::std::byte>>
 class bloom_filter {
  public:
   /**
@@ -104,21 +104,17 @@ class bloom_filter {
   /**
    * @brief Constructs a statically-sized Bloom filter.
    *
-   * @note The total number of bits in the filter is determined by `words_per_block * num_blocks *
-   * sizeof(word_type) * CHAR_BIT`.
-   *
    * @param num_blocks Number of sub-filters or blocks
    * @param scope The scope in which operations will be performed
    * @param policy Fingerprint generation policy (see `cuco/bloom_filter_policies.cuh`)
    * @param alloc Allocator used for allocating device-accessible storage
    * @param stream CUDA stream used to initialize the filter
    */
-  __host__ explicit constexpr bloom_filter(Extent num_blocks,
-                                           cuda_thread_scope<Scope> scope = {},
-                                           Policy const& policy           = {},
-                                           Allocator const& alloc         = {},
-                                           cuda::stream_ref stream        = cuda::stream_ref{
-                                             cudaStream_t{nullptr}});
+  __host__ explicit bloom_filter(Extent num_blocks,
+                                 cuda_thread_scope<Scope> scope = {},
+                                 Policy const& policy           = {},
+                                 Allocator const& alloc         = {},
+                                 cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
 
   /**
    * @brief Erases all information from the filter.
@@ -164,8 +160,10 @@ class bloom_filter {
    * @param stream CUDA stream used for device memory operations and kernel launches
    */
   template <class InputIt>
-  __host__ constexpr void add_async(
-    InputIt first, InputIt last, cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
+  __host__ constexpr void add_async(InputIt first,
+                                    InputIt last,
+                                    cuda::stream_ref stream = cuda::stream_ref{
+                                      cudaStream_t{nullptr}}) noexcept;
 
   /**
    * @brief Adds keys in the range `[first, last)` if `pred` of the corresponding `stencil` returns
@@ -326,7 +324,7 @@ class bloom_filter {
                                               cudaStream_t{nullptr}}) const noexcept;
 
   /**
-   * @brief Merge another bloom filter into this.
+   * @brief Merge another bloom filter into `*this`.
    *
    * @note Modifies `this` in place.
    * @note This function synchronizes the given stream. For asynchronous execution use
@@ -346,7 +344,7 @@ class bloom_filter {
                                 cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
 
   /**
-   * @brief Asynchronously merge another bloom filter into this.
+   * @brief Asynchronously merge another bloom filter into `*this`.
    *
    * @note Modifies `this` in place.
    *
@@ -365,7 +363,7 @@ class bloom_filter {
     cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
 
   /**
-   * @brief Intersect another bloom filter into this.
+   * @brief Intersect another bloom filter into `*this`.
    *
    * @note Modifies `this` in place.
    * @note This function synchronizes the given stream. For asynchronous execution use
@@ -389,7 +387,7 @@ class bloom_filter {
     cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
 
   /**
-   * @brief Asynchronously intersect another bloom filter into this.
+   * @brief Asynchronously intersect another bloom filter into `*this`.
    *
    * @note Modifies `this` in place.
    *
diff --git a/include/cuco/bloom_filter_policies.cuh b/include/cuco/bloom_filter_policies.cuh
index 0d28b166a..f2f1c0e73 100644
--- a/include/cuco/bloom_filter_policies.cuh
+++ b/include/cuco/bloom_filter_policies.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,8 +16,7 @@
 
 #pragma once
 
-#include <cuco/detail/bloom_filter/arrow_filter_policy.cuh>
-#include <cuco/detail/bloom_filter/default_filter_policy_impl.cuh>
+#include <cuco/detail/bloom_filter/parametric_filter_policy.cuh>
 #include <cuco/hash_functions.cuh>
 
 #include <cstdint>
@@ -25,110 +24,52 @@
 namespace cuco {
 
 /**
- * @brief A policy that defines how Arrow Block-Split Bloom Filter generates and stores a key's
- * fingerprint.
+ * @brief Sectorized Bloom filter policy with multiplicative-hashing fingerprint generation.
  *
- * @tparam Key The type of the values to generate a fingerprint for.
- * @tparam XXHash64 Custom (64 bit) XXHash hasher to generate a key's fingerprint.
- * By default, cuco::xxhash_64 hasher will be used.
+ * Implements the Sectorized Bloom Filter (SBF) variant from "Optimizing Bloom Filters for Modern
+ * GPU Architectures" (arXiv:2512.15595).
  *
+ * Requires a 64-bit hash function: the result is split into upper 32 bits (block selection via
+ * multiply-shift) and lower 32 bits (pattern generation).
+ *
+ * @tparam Hash 64-bit hash functor.
+ * @tparam Word Underlying word type of a filter block.
+ * @tparam WordsPerBlock Words per filter block.
+ * @tparam PatternBits Fingerprint bits per key (paper's k).
+ * @tparam AddHorizontalLayout CG size for add (paper's Theta).
+ * @tparam AddVerticalLayout Words per thread per add step (paper's Phi).
+ * @tparam ContainsHorizontalLayout CG size for contains.
+ * @tparam ContainsVerticalLayout Words per thread per contains step.
  */
-template <typename Key, template <typename> class XXHash64 = cuco::xxhash_64>
-using arrow_filter_policy = detail::arrow_filter_policy<Key, XXHash64>;
+template <class Hash,
+          class Word,
+          std::uint32_t WordsPerBlock,
+          std::uint32_t PatternBits,
+          std::uint32_t AddHorizontalLayout,
+          std::uint32_t AddVerticalLayout,
+          std::uint32_t ContainsHorizontalLayout,
+          std::uint32_t ContainsVerticalLayout>
+using parametric_filter_policy = detail::parametric_filter_policy<Hash,
+                                                                  Word,
+                                                                  WordsPerBlock,
+                                                                  PatternBits,
+                                                                  AddHorizontalLayout,
+                                                                  AddVerticalLayout,
+                                                                  ContainsHorizontalLayout,
+                                                                  ContainsVerticalLayout>;
 
 /**
- * @brief The default policy that defines how a Blocked Bloom Filter generates and stores a key's
- * fingerprint.
+ * @brief Default Bloom filter policy used by `cuco::bloom_filter` when no policy is specified.
  *
- * @note `Word` type must be an atomically updatable integral type. `WordsPerBlock` must
- * be a power-of-two.
+ * Alias for a `parametric_filter_policy` instantiation with paper-recommended layouts on a 256-bit
+ * block: 8 x `uint32_t` words, 8 fingerprint bits per key, fully horizontal add (Theta=8) and fully
+ * vertical contains (Phi=8).
  *
- * @tparam Hash Hash function used to generate a key's fingerprint
- * @tparam Word Underlying word/segment type of a filter block
- * @tparam WordsPerBlock Number of words/segments in each block
+ * @tparam Key The key type to generate a fingerprint for.
+ * @tparam XXHash64 64-bit XXHash functor template. Defaults to `cuco::xxhash_64`.
  */
-template <class Hash, class Word, std::uint32_t WordsPerBlock>
-class default_filter_policy {
-  using impl_type = cuco::detail::default_filter_policy_impl<Hash, Word, WordsPerBlock>;
-
- public:
-  using hasher             = typename impl_type::hasher;              ///< Type of the hash function
-  using hash_argument_type = typename impl_type::hash_argument_type;  ///< Hash function input type
-  using hash_result_type   = typename impl_type::hash_result_type;    ///< hash function output type
-  using word_type =
-    typename impl_type::word_type;  ///< Underlying word/segment type of a filter block
-
-  static constexpr std::uint32_t words_per_block =
-    impl_type::words_per_block;  ///< Number of words/segments in each filter block
-
- public:
-  /**
-   * @brief Constructs the `default_filter_policy` object.
-   *
-   * @throws Compile-time error if the specified number of words in a filter block is not a
-   * power-of-two or is larger than 32. If called from host: throws exception; If called from
-   * device: Traps the kernel.
-   *
-   * @throws If the `hash_result_type` is too narrow to generate the requested number of
-   * `pattern_bits`. If called from host: throws exception; If called from device: Traps the kernel.
-   *
-   * @throws If `pattern_bits` is smaller than the number of words in a filter block or larger than
-   * the total number of bits in a filter block. If called from host: throws exception; If called
-   * from device: Traps the kernel.
-   *
-   * @param pattern_bits Number of bits in a key's fingerprint
-   * @param hash Hash function used to generate a key's fingerprint
-   */
-  __host__ __device__ constexpr default_filter_policy(std::uint32_t pattern_bits = words_per_block,
-                                                      Hash hash                  = {});
-
-  /**
-   * @brief Generates the hash value for a given key.
-   *
-   * @note This function is meant as a customization point and is only used in the internals of the
-   * `bloom_filter(_ref)` implementation.
-   *
-   * @param key The key to hash
-   *
-   * @return The hash value of the key
-   */
-  __device__ constexpr hash_result_type hash(hash_argument_type const& key) const;
-
-  /**
-   * @brief Determines the filter block a key is added into.
-   *
-   * @note This function is meant as a customization point and is only used in the internals of the
-   * `bloom_filter(_ref)` implementation.
-   *
-   * @tparam Extent Size type that is used to determine the number of blocks in the filter
-   *
-   * @param hash Hash value of the key
-   * @param num_blocks Number of block in the filter
-   *
-   * @return The block index for the given key's hash value
-   */
-  template <class Extent>
-  __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const;
-
-  /**
-   * @brief Determines the fingerprint pattern for a word/segment within the filter block for a
-   * given key's hash value.
-   *
-   * @note This function is meant as a customization point and is only used in the internals of the
-   * `bloom_filter(_ref)` implementation.
-   *
-   * @param hash Hash value of the key
-   * @param word_index Target word/segment within the filter block
-   *
-   * @return The bit pattern for the word/segment in the filter block
-   */
-  __device__ constexpr word_type word_pattern(hash_result_type hash,
-                                              std::uint32_t word_index) const;
-
- private:
-  impl_type impl_;  ///< Policy implementation
-};
+template <class Key, template <typename> class XXHash64 = cuco::xxhash_64>
+using default_filter_policy =
+  parametric_filter_policy<XXHash64<Key>, std::uint32_t, 8, 8, 8, 1, 1, 8>;
 
 }  // namespace cuco
-
-#include <cuco/detail/bloom_filter/default_filter_policy.inl>
\ No newline at end of file
diff --git a/include/cuco/bloom_filter_ref.cuh b/include/cuco/bloom_filter_ref.cuh
index f58910afc..bed199919 100644
--- a/include/cuco/bloom_filter_ref.cuh
+++ b/include/cuco/bloom_filter_ref.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -63,10 +63,10 @@ class bloom_filter_ref {
    * @param scope The scope in which operations will be performed
    * @param policy Fingerprint generation policy (see `cuco/bloom_filter_policies.cuh`)
    */
-  __host__ __device__ explicit constexpr bloom_filter_ref(filter_block_type* data,
-                                                          Extent num_blocks,
-                                                          cuda_thread_scope<Scope> scope,
-                                                          Policy const& policy);
+  __host__ __device__ explicit bloom_filter_ref(filter_block_type* data,
+                                                Extent num_blocks,
+                                                cuda_thread_scope<Scope> scope,
+                                                Policy const& policy);
 
   /**
    * @brief Constructs the ref object from existing storage.
@@ -78,10 +78,10 @@ class bloom_filter_ref {
    * @param scope The scope in which operations will be performed
    * @param policy Fingerprint generation policy (see `cuco/bloom_filter_policies.cuh`)
    */
-  __host__ __device__ explicit constexpr bloom_filter_ref(word_type* data,
-                                                          Extent num_blocks,
-                                                          cuda_thread_scope<Scope> scope,
-                                                          Policy const& policy);
+  __host__ __device__ explicit bloom_filter_ref(word_type* data,
+                                                Extent num_blocks,
+                                                cuda_thread_scope<Scope> scope,
+                                                Policy const& policy);
 
   /**
    * @brief Device function that cooperatively erases all information from the filter.
@@ -178,8 +178,10 @@ class bloom_filter_ref {
    * @param stream CUDA stream used for device memory operations and kernel launches
    */
   template <class InputIt>
-  __host__ constexpr void add_async(
-    InputIt first, InputIt last, cuda::stream_ref stream = cuda::stream_ref{cudaStream_t{nullptr}});
+  __host__ constexpr void add_async(InputIt first,
+                                    InputIt last,
+                                    cuda::stream_ref stream = cuda::stream_ref{
+                                      cudaStream_t{nullptr}}) noexcept;
 
   /**
    * @brief Adds keys in the range `[first, last)` if `pred` of the corresponding `stencil` returns
@@ -265,10 +267,21 @@ class bloom_filter_ref {
   template <class CG, class ProbeKey>
   [[nodiscard]] __device__ bool contains(CG group, ProbeKey const& key) const;
 
-  // TODO
-  // template <class CG, class InputIt, class OutputIt>
-  // __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin)
-  // const;
+  /**
+   * @brief Device function that tests if all keys in the range `[first, last)` are present in the
+   * filter.
+   *
+   * @tparam CG Cooperative Group type
+   * @tparam InputIt Device-accessible random access input key iterator
+   * @tparam OutputIt Device-accessible output iterator assignable from `bool`
+   *
+   * @param group The Cooperative Group this operation is executed with
+   * @param first Beginning of the sequence of keys
+   * @param last End of the sequence of keys
+   * @param output_begin Beginning of the sequence of booleans for the presence of each key
+   */
+  template <class CG, class InputIt, class OutputIt>
+  __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin) const;
 
   /**
    * @brief Tests all keys in the range `[first, last)` if their fingerprints are present in the
diff --git a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh
deleted file mode 100644
index 481293335..000000000
--- a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cuco/hash_functions.cuh>
-
-#include <cuda/functional>
-#include <cuda/std/bit>
-#include <cuda/std/limits>
-
-#include <cstdint>
-#include <nv/target>
-
-namespace cuco::detail {
-
-/**
- * @brief A policy that defines how Arrow Block-Split Bloom Filter generates and stores a key's
- * fingerprint.
- *
- * Reference:
- * https://github.com/apache/arrow/blob/be1dcdb96b030639c0b56955c4c62f9d6b03f473/cpp/src/parquet/bloom_filter.cc#L219-L230
- *
- * Example:
- * @code{.cpp}
- * template <typename KeyType, std::uint32_t NUM_FILTER_BLOCKS>
- * void bulk_insert_and_eval_arrow_policy_bloom_filter(device_vector<KeyType> const& positive_keys,
- *                                                 device_vector<KeyType> const& negative_keys)
- * {
- *     using policy_type = cuco::arrow_filter_policy<KeyType, cuco::xxhash_64>;
- *
- *     // Warn or throw if the number of filter blocks is greater than maximum used by Arrow policy.
- *     static_assert(NUM_FILTER_BLOCKS <= policy_type::max_filter_blocks, "NUM_FILTER_BLOCKS must be
- *                                                                         in range: [1, 4194304]");
- *
- *     // Create a bloom filter with Arrow policy
- *     cuco::bloom_filter<KeyType, cuco::extent<size_t>,
- *         cuda::thread_scope_device, policy_type> filter{NUM_FILTER_BLOCKS};
- *
- *     // Add positive keys to the bloom filter
- *     filter.add(positive_keys.begin(), positive_keys.end());
- *
- *     auto const num_tp = positive_keys.size();
- *     auto const num_tn = negative_keys.size();
- *
- *     // Vectors to store query results.
- *     thrust::device_vector<bool> true_positive_result(num_tp, false);
- *     thrust::device_vector<bool> true_negative_result(num_tn, false);
- *
- *     // Query the bloom filter for the inserted keys.
- *     filter.contains(positive_keys.begin(), positive_keys.end(), true_positive_result.begin());
- *
- *     // We should see a true-positive rate of 1.
- *     float true_positive_rate = float(thrust::count(thrust::device,
- *          true_positive_result.begin(), true_positive_result.end(), true)) / float(num_tp);
- *
- *     // Query the bloom filter for the non-inserted keys.
- *     filter.contains(negative_keys.begin(), negative_keys.end(), true_negative_result.begin());
- *
- *     // We may see a false-positive rate > 0 depending on the number of bits in the
- *     // filter and the number of hashes used per key.
- *     float false_positive_rate = float(thrust::count(thrust::device,
- *          true_negative_result.begin(), true_negative_result.end(), true)) / float(num_tn);
- * }
- * @endcode
- *
- * @tparam Key The type of the values to generate a fingerprint for.
- * @tparam XXHash64 64-bit XXHash hasher implementation for fingerprint generation.
- */
-template <class Key, template <typename> class XXHash64>
-class arrow_filter_policy {
- public:
-  using hasher           = XXHash64<Key>;  ///< 64-bit XXHash hasher for Arrow bloom filter policy
-  using word_type        = std::uint32_t;  ///< uint32_t for Arrow bloom filter policy
-  using key_type         = Key;            ///< Hash function input type
-  using hash_result_type = std::uint64_t;  ///< hash function output type
-
-  static constexpr uint32_t bits_set_per_block = 8;  ///< hardcoded bits set per Arrow filter block
-  static constexpr uint32_t words_per_block    = 8;  ///< hardcoded words per Arrow filter block
-
-  static constexpr std::uint32_t bytes_per_filter_block =
-    32;  ///< Number of bytes in one Arrow filter block
-  static constexpr std::uint32_t max_arrow_filter_bytes =
-    128 * 1024 * 1024;  ///< Max bytes in Arrow bloom filter
-  static constexpr std::uint32_t max_filter_blocks =
-    (max_arrow_filter_bytes /
-     bytes_per_filter_block);  ///< Max sub-filter blocks allowed in Arrow bloom filter
-
- public:
-  /**
-   * @brief Constructs the `arrow_filter_policy` object.
-   *
-   * @note The number of filter blocks with Arrow policy must be in the
-   * range of [1, 4194304]. If the bloom filter is constructed with a larger
-   * number of blocks, only the first 4194304 (128MB) blocks will be used.
-   *
-   * @param hash Hash function used to generate a key's fingerprint
-   */
-  __host__ __device__ constexpr arrow_filter_policy(hasher hash = {}) : hash_{hash} {}
-
-  /**
-   * @brief Generates the hash value for a given key.
-   *
-   * @param key The key to hash
-   *
-   * @return The hash value of the key
-   */
-  __device__ constexpr hash_result_type hash(key_type const& key) const { return hash_(key); }
-
-  /**
-   * @brief Determines the filter block a key is added into.
-   *
-   * @note The number of filter blocks with Arrow policy must be in the
-   * range of [1, 4194304]. Passing a larger `num_blocks` will still
-   * upperbound the number of blocks used to the mentioned range.
-   *
-   * @tparam Extent Size type that is used to determine the number of blocks in the filter
-   *
-   * @param hash Hash value of the key
-   * @param num_blocks Number of block in the filter
-   *
-   * @return The block index for the given key's hash value
-   */
-  template <class Extent>
-  __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const
-  {
-    constexpr auto hash_bits = cuda::std::numeric_limits<word_type>::digits;
-    // TODO: assert if num_blocks > max_filter_blocks
-    auto const max_blocks = cuda::std::min<Extent>(num_blocks, max_filter_blocks);
-    // Make sure we are only contained withing the `max_filter_blocks` blocks
-    return static_cast<word_type>(((hash >> hash_bits) * max_blocks) >> hash_bits) % max_blocks;
-  }
-
-  /**
-   * @brief Determines the fingerprint pattern for a word/segment within the filter block for a
-   * given key's hash value.
-   *
-   * @param hash Hash value of the key
-   * @param word_index Target word/segment within the filter block
-   *
-   * @return The bit pattern for the word/segment in the filter block
-   */
-  __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const
-  {
-    word_type const key = static_cast<word_type>(hash);
-    std::uint32_t salt{};
-
-    // Basically a switch (word_index) { case 0-7 ... }
-    // First split: 0..3 versus 4..7.
-    if (word_index < 4) {
-      // For indices 0..3, further split into 0..1 and 2..3.
-      if (word_index < 2) {
-        // word_index is 0 or 1.
-        salt = (word_index == 0) ? 0x47b6137bU : 0x44974d91U;
-      } else {
-        // word_index is 2 or 3.
-        salt = (word_index == 2) ? 0x8824ad5bU : 0xa2b7289dU;
-      }
-    } else {
-      // For indices 4..7, further split into 4..5 and 6..7.
-      if (word_index < 6) {
-        // word_index is 4 or 5.
-        salt = (word_index == 4) ? 0x705495c7U : 0x2df1424bU;
-      } else {
-        // word_index is 6 or 7.
-        salt = (word_index == 6) ? 0x9efc4947U : 0x5c6bfb31U;
-      }
-    }
-    return word_type{1} << ((key * salt) >> 27);
-  }
-
- private:
-  hasher hash_;
-};
-
-}  // namespace cuco::detail
\ No newline at end of file
diff --git a/include/cuco/detail/bloom_filter/bloom_filter.inl b/include/cuco/detail/bloom_filter/bloom_filter.inl
index f3306b2fb..356e77b93 100644
--- a/include/cuco/detail/bloom_filter/bloom_filter.inl
+++ b/include/cuco/detail/bloom_filter/bloom_filter.inl
@@ -27,15 +27,15 @@
 namespace cuco {
 
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy, class Allocator>
-__host__ constexpr bloom_filter<Key, Extent, Scope, Policy, Allocator>::bloom_filter(
-  Extent num_blocks,
-  cuda_thread_scope<Scope>,
-  Policy const& policy,
-  Allocator const& alloc,
-  cuda::stream_ref stream)
+__host__ bloom_filter<Key, Extent, Scope, Policy, Allocator>::bloom_filter(Extent num_blocks,
+                                                                           cuda_thread_scope<Scope>,
+                                                                           Policy const& policy,
+                                                                           Allocator const& alloc,
+                                                                           cuda::stream_ref stream)
   : allocator_{alloc},
-    data_{allocator_.allocate(num_blocks, stream),
-          detail::custom_deleter<std::size_t, allocator_type>{num_blocks, allocator_, stream}},
+    data_{allocator_.allocate(static_cast<size_type>(num_blocks), stream),
+          detail::custom_deleter<std::size_t, allocator_type>{
+            static_cast<size_type>(num_blocks), allocator_, stream}},
     ref_{data_.get(), num_blocks, {}, policy}
 {
   this->clear_async(stream);
@@ -66,7 +66,7 @@ __host__ constexpr void bloom_filter<Key, Extent, Scope, Policy, Allocator>::add
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy, class Allocator>
 template <class InputIt>
 __host__ constexpr void bloom_filter<Key, Extent, Scope, Policy, Allocator>::add_async(
-  InputIt first, InputIt last, cuda::stream_ref stream)
+  InputIt first, InputIt last, cuda::stream_ref stream) noexcept
 {
   ref_.add_async(first, last, stream);
 }
diff --git a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
index 8e2b49453..9b543900d 100644
--- a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
+++ b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cuco/detail/__config>
 #include <cuco/detail/bloom_filter/kernels.cuh>
 #include <cuco/detail/error.hpp>
 #include <cuco/detail/utility/cuda.cuh>
@@ -23,6 +24,7 @@
 #include <cuco/detail/utility/math.cuh>
 #include <cuco/detail/utils.hpp>
 #include <cuco/utility/cuda_thread_scope.cuh>
+#include <cuco/utility/traits.hpp>
 
 #include <cub/device/device_for.cuh>
 #include <cub/device/device_transform.cuh>
@@ -44,204 +46,6 @@
 
 namespace cuco::detail {
 
-/**
- * @brief Device functor for adding a single key to the bloom filter
- *
- * This functor is used with cuda::static_for to iterate over all words in a filter block
- * and set the appropriate bits for a given key's hash value. Each iteration processes
- * one word in the block using atomic operations to ensure thread safety.
- *
- * @tparam HashValue Type of the hash value (typically uint64_t)
- * @tparam BlockIndex Type of the block index (typically size_t or uint32_t)
- * @tparam Policy Filter policy type that provides word pattern generation
- * @tparam WordType Underlying word type of the filter (typically uint64_t)
- * @tparam Scope CUDA thread scope for atomic operations
- */
-template <typename HashValue,
-          typename BlockIndex,
-          typename Policy,
-          typename WordType,
-          cuda::thread_scope Scope>
-struct add_impl_functor {
-  HashValue hash_value;    ///< Hash value of the key being added
-  BlockIndex block_index;  ///< Index of the filter block to modify
-  Policy policy_;          ///< Filter policy for generating bit patterns
-  WordType* words_;        ///< Pointer to the filter's word array
-  size_t words_per_block;  ///< Number of words in each filter block
-
-  /**
-   * @brief Processes one word in the filter block for key insertion
-   *
-   * @tparam I Type of the integral constant passed by cuda::static_for
-   * @param i Integral constant representing the word index within the block
-   */
-  template <typename I>
-  __device__ void operator()(I i) const
-  {
-    auto const word = policy_.word_pattern(hash_value, i());
-    if (word != 0) {
-      auto atom_word =
-        cuda::atomic_ref<WordType, Scope>{*(words_ + (block_index * words_per_block + i()))};
-      atom_word.fetch_or(word, cuda::memory_order_relaxed);
-    }
-  }
-};
-
-/**
- * @brief Device functor for cooperative group-based batch key insertion
- *
- * This functor is used with cuda::static_for to process multiple keys in parallel
- * within a cooperative group. Each thread in the group processes a different key
- * using shuffle operations to share hash values and block indices across threads.
- *
- * @tparam CG Cooperative group type (e.g., thread_block_tile)
- * @tparam HashValue Type of the hash value
- * @tparam BlockIndex Type of the block index
- * @tparam BloomFilterImpl Type of the bloom filter implementation
- */
-template <typename CG, typename HashValue, typename BlockIndex, typename BloomFilterImpl>
-struct add_group_functor {
-  CG group;                ///< Cooperative group for parallel processing
-  HashValue hash_value;    ///< Hash value of the current thread's key
-  BlockIndex block_index;  ///< Block index of the current thread's key
-  size_t i;                ///< Starting index in the key batch
-  size_t num_keys;         ///< Total number of keys to process
-  size_t num_threads;      ///< Number of threads in the group
-  BloomFilterImpl* self;   ///< Pointer to the bloom filter implementation
-
-  /**
-   * @brief Processes one thread's key in the cooperative group batch insertion
-   *
-   * @tparam J Type of the integral constant passed by cuda::static_for
-   * @param j Integral constant representing the thread index within the group
-   */
-  template <typename J>
-  __device__ void operator()(J j) const
-  {
-    if ((j() < num_threads) and (i + j() < num_keys)) {
-      self->add_impl(group, group.shfl(hash_value, j()), group.shfl(block_index, j()));
-    }
-  }
-};
-
-/**
- * @brief Device functor for worker group-based batch key insertion
- *
- * This functor is used with cuda::static_for to process multiple keys in parallel
- * within a worker group (subdivision of a larger cooperative group). Similar to
- * add_group_functor but operates on a smaller worker group with offset handling
- * for processing different portions of the key batch.
- *
- * @tparam WorkerGroup Worker group type (subdivision of cooperative group)
- * @tparam HashValue Type of the hash value
- * @tparam BlockIndex Type of the block index
- * @tparam BloomFilterImpl Type of the bloom filter implementation
- */
-template <typename WorkerGroup, typename HashValue, typename BlockIndex, typename BloomFilterImpl>
-struct add_worker_group_functor {
-  WorkerGroup worker_group;   ///< Worker group (subdivision of cooperative group)
-  HashValue hash_value;       ///< Hash value of the current thread's key
-  BlockIndex block_index;     ///< Block index of the current thread's key
-  size_t i;                   ///< Starting index in the key batch
-  size_t worker_offset;       ///< Offset for this worker group within the batch
-  size_t num_keys;            ///< Total number of keys to process
-  size_t worker_num_threads;  ///< Number of threads in the worker group
-  BloomFilterImpl* self;      ///< Pointer to the bloom filter implementation
-
-  /**
-   * @brief Processes one thread's key in the worker group batch insertion
-   *
-   * @tparam J Type of the integral constant passed by cuda::static_for
-   * @param j Integral constant representing the thread index within the worker group
-   */
-  template <typename J>
-  __device__ void operator()(J j) const
-  {
-    if ((j() < worker_num_threads) and (i + worker_offset + j() < num_keys)) {
-      self->add_impl(
-        worker_group, worker_group.shfl(hash_value, j()), worker_group.shfl(block_index, j()));
-    }
-  }
-};
-
-/**
- * @brief Device functor for cooperative group-based single key insertion
- *
- * This functor is used with cuda::static_for to add a single key to the bloom filter
- * using a cooperative group. Each thread in the group processes different words in
- * the filter block based on thread rank and stride pattern. Used when the group size
- * doesn't match the number of words per block.
- *
- * @tparam HashValue Type of the hash value
- * @tparam BlockIndex Type of the block index
- * @tparam WordType Underlying word type of the filter
- * @tparam Scope CUDA thread scope for atomic operations
- * @tparam Policy Filter policy type that provides word pattern generation
- */
-template <typename HashValue,
-          typename BlockIndex,
-          typename WordType,
-          cuda::thread_scope Scope,
-          typename Policy>
-struct add_impl_group_functor {
-  HashValue hash_value;    ///< Hash value of the key being added
-  BlockIndex block_index;  ///< Index of the filter block to modify
-  WordType* words_;        ///< Pointer to the filter's word array
-  size_t words_per_block;  ///< Number of words in each filter block
-  size_t rank;             ///< Thread rank within the cooperative group
-  size_t num_threads;      ///< Number of threads in the cooperative group
-  Policy policy_;          ///< Filter policy for generating bit patterns
-
-  /**
-   * @brief Processes one word in the filter block using cooperative group stride pattern
-   *
-   * @tparam I Type of the integral constant passed by cuda::static_for
-   * @param i Integral constant representing the word index within the block
-   */
-  template <typename I>
-  __device__ void operator()(I i) const
-  {
-    if (i() >= rank && (i() - rank) % num_threads == 0) {
-      auto atom_word =
-        cuda::atomic_ref<WordType, Scope>{*(words_ + (block_index * words_per_block + i()))};
-      atom_word.fetch_or(policy_.word_pattern(hash_value, i()), cuda::memory_order_relaxed);
-    }
-  }
-};
-
-/**
- * @brief Device functor for checking if a key exists in the bloom filter
- *
- * This functor is used with cuda::static_for to iterate over all words in a filter block
- * and check if the expected bit patterns for a given key's hash value are present.
- * If any expected bit is missing, the result is set to false, indicating the key
- * is definitely not in the set.
- *
- * @tparam HashValue Type of the hash value
- * @tparam StoredPattern Type of the stored pattern array (typically array of WordType)
- * @tparam Policy Filter policy type that provides word pattern generation
- */
-template <typename HashValue, typename StoredPattern, typename Policy>
-struct contains_functor {
-  HashValue hash_value;          ///< Hash value of the key being queried
-  StoredPattern stored_pattern;  ///< Array of stored bit patterns from the filter block
-  Policy policy_;                ///< Filter policy for generating expected bit patterns
-  bool* result;                  ///< Pointer to result flag (set to false if key not found)
-
-  /**
-   * @brief Checks one word in the filter block for the expected bit pattern
-   *
-   * @tparam I Type of the integral constant passed by cuda::static_for
-   * @param i Integral constant representing the word index within the block
-   */
-  template <typename I>
-  __device__ void operator()(I i) const
-  {
-    auto const expected_pattern = policy_.word_pattern(hash_value, i());
-    if ((stored_pattern[i()] & expected_pattern) != expected_pattern) { *result = false; }
-  }
-};
-
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
 class bloom_filter_impl {
  public:
@@ -250,26 +54,47 @@ class bloom_filter_impl {
   using size_type   = typename extent_type::value_type;
   using policy_type = Policy;
   using word_type   = typename policy_type::word_type;
+  static_assert(sizeof(word_type) == 4 || sizeof(word_type) == 8,
+                "word_type must be 4 or 8 bytes wide for atomicOr");
+  // atomicOr overloads resolve on canonical 32- and 64-bit unsigned integer types.
+  // Normalize by size so any policy-provided word_type (uint32_t, uint64_t, unsigned long, ...)
+  // resolves to a matching overload via the reinterpret_cast in atomic_or().
+  using atomic_word_type =
+    cuda::std::conditional_t<sizeof(word_type) == 8, unsigned long long, unsigned int>;
+
+  // Implementation-tuning knobs. Not part of the public API; reached via
+  // `bloom_filter_impl::tuning::use_*` from internal kernels. Defaults reflect the ablation
+  // measurements from arXiv:2512.15595; flip in source for tuning experiments.
+  struct tuning {
+    static constexpr bool use_invoke_one =
+#if defined(CUCO_HAS_CG_INVOKE_ONE)
+      true;
+#else
+      false;  // cg::invoke_one_broadcast requires CTK >= 12.1
+#endif
+    static constexpr bool use_early_exit                       = false;
+    static constexpr bool use_cub_kernels                      = true;
+    static constexpr bool use_warp_cooperative_add_kernel      = true;
+    static constexpr bool use_warp_cooperative_contains_kernel = true;
+    static constexpr bool use_work_stealing_add_kernel         = false;
+    static constexpr bool use_work_stealing_contains_kernel    = false;
+    static constexpr bool use_cuda_atomic_ref                  = false;
+  };
 
   static constexpr auto thread_scope    = Scope;
   static constexpr auto words_per_block = policy_type::words_per_block;
 
-  __host__ __device__ static constexpr size_t max_vec_bytes() noexcept
-  {
-    constexpr auto word_bytes  = sizeof(word_type);
-    constexpr auto block_bytes = word_bytes * words_per_block;
-    return cuda::std::min(cuda::std::max(word_bytes, 32ul),
-                          block_bytes);  // aiming for 2xLDG128 -> 1 sector per thread
-  }
-
-  struct alignas(max_vec_bytes()) filter_block_type {
-   private:
-    word_type data_[words_per_block];
-  };
+  static constexpr auto add_vertical_layout        = policy_type::add_vertical_layout;
+  static constexpr auto add_horizontal_layout      = policy_type::add_horizontal_layout;
+  static constexpr auto contains_vertical_layout   = policy_type::contains_vertical_layout;
+  static constexpr auto contains_horizontal_layout = policy_type::contains_horizontal_layout;
+  static constexpr auto add_loop_count =
+    words_per_block / (add_vertical_layout * add_horizontal_layout);
+  static constexpr auto contains_loop_count =
+    words_per_block / (contains_vertical_layout * contains_horizontal_layout);
 
   static_assert(cuda::std::has_single_bit(words_per_block) and words_per_block <= 32,
                 "Number of words per block must be a power-of-two and less than or equal to 32");
-
   static_assert(
     cuda::std::is_constructible_v<cuda::atomic_ref<word_type, Scope>, word_type&> &&
       cuda::std::is_invocable_r_v<word_type,
@@ -279,26 +104,44 @@ class bloom_filter_impl {
                                   cuda::std::memory_order>,
     "Invalid word type");
 
-  __host__ __device__ explicit constexpr bloom_filter_impl(filter_block_type* filter,
-                                                           Extent num_blocks,
-                                                           cuda_thread_scope<Scope>,
-                                                           Policy policy) noexcept
+  __host__ __device__ static constexpr size_t alignment() noexcept
+  {
+    // Maximum alignment is 32 bytes which is equivalent to one sector
+    return cuda::std::min(
+      static_cast<size_t>(32),
+      static_cast<size_t>(cuda::std::max(add_vertical_layout, contains_vertical_layout) *
+                          sizeof(word_type)));
+  }
+
+  struct filter_block_type {
+   private:
+    alignas(alignment()) word_type data_[words_per_block];
+  };
+
+  __host__ __device__ explicit bloom_filter_impl(filter_block_type* filter,
+                                                 Extent num_blocks,
+                                                 cuda_thread_scope<Scope>,
+                                                 Policy policy)
     : words_{reinterpret_cast<word_type*>(filter)}, num_blocks_{num_blocks}, policy_{policy}
   {
+    NV_IF_TARGET(NV_IS_HOST, (l2_cache_size_ = static_cast<size_t>(cuco::detail::l2_cache_size());))
   }
 
-  __host__ __device__ explicit constexpr bloom_filter_impl(word_type* filter,
-                                                           Extent num_blocks,
-                                                           cuda_thread_scope<Scope>,
-                                                           Policy policy) noexcept
+  __host__ __device__ explicit bloom_filter_impl(word_type* filter,
+                                                 Extent num_blocks,
+                                                 cuda_thread_scope<Scope>,
+                                                 Policy policy)
     : words_{filter}, num_blocks_{num_blocks}, policy_{policy}
   {
+    NV_IF_TARGET(NV_IS_HOST, (l2_cache_size_ = static_cast<size_t>(cuco::detail::l2_cache_size());))
   }
 
   template <class CG>
   __device__ constexpr void clear(CG group)
   {
-    for (int i = group.thread_rank(); i < num_blocks_ * words_per_block; i += group.size()) {
+    // TODO optimize this
+    for (int i = group.thread_rank(); i < static_cast<size_type>(num_blocks_) * words_per_block;
+         i += group.size()) {
       words_[i] = 0;
     }
   }
@@ -317,151 +160,218 @@ class bloom_filter_impl {
   {
     CUCO_CUDA_TRY(cub::DeviceFor::ForEachN(
       words_,
-      num_blocks_ * words_per_block,
+      static_cast<size_type>(num_blocks_) * words_per_block,
       [] __device__(word_type & word) { word = 0; },
       stream.get()));
   }
 
-  template <class ProbeKey>
-  __device__ void add(ProbeKey const& key)
+  __host__ constexpr void merge(bloom_filter_impl<Key, Extent, Scope, Policy> const& other,
+                                cuda::stream_ref stream)
   {
-    auto const hash_value = policy_.hash(key);
-    this->add_impl(hash_value, policy_.block_index(hash_value, num_blocks_));
+    this->merge_async(other, stream);
+#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
+    stream.sync();
+#else
+    stream.wait();
+#endif
   }
 
-  template <class InputIt>
-  __device__ void add(InputIt first, InputIt last)
+  __host__ constexpr void merge_async(bloom_filter_impl<Key, Extent, Scope, Policy> const& other,
+                                      cuda::stream_ref stream)
   {
-    auto const num_keys = cuco::detail::distance(first, last);
-    for (decltype(num_keys) i = 0; i < num_keys; ++i) {
-      auto const hash_value = policy_.hash(*(first + i));
-      this->add_impl(hash_value, policy_.block_index(hash_value, num_blocks_));
-    }
+    CUCO_EXPECTS(this->block_extent() == other.block_extent(),
+                 "mismatching num_blocks in merge_async");
+    CUCO_CUDA_TRY(cub::DeviceTransform::Transform(
+      cuda::std::tuple{this->data(), other.data()},
+      this->data(),
+      this->block_extent() * words_per_block,
+      [] __device__(word_type a, word_type b) { return a | b; },
+      stream.get()));
   }
 
-  template <class HashValue, class BlockIndex>
-  __device__ void add_impl(HashValue const& hash_value, BlockIndex block_index)
+  __host__ constexpr void intersect(bloom_filter_impl<Key, Extent, Scope, Policy> const& other,
+                                    cuda::stream_ref stream)
   {
-    add_impl_functor<HashValue, BlockIndex, policy_type, word_type, thread_scope> functor{
-      hash_value, block_index, policy_, words_, words_per_block};
-    cuda::static_for<words_per_block>(functor);
+    this->intersect_async(other, stream);
+#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
+    stream.sync();
+#else
+    stream.wait();
+#endif
   }
 
-  template <class CG, class ProbeKey>
-  __device__ void add(CG group, ProbeKey const& key)
+  __host__ constexpr void intersect_async(
+    bloom_filter_impl<Key, Extent, Scope, Policy> const& other, cuda::stream_ref stream)
+  {
+    CUCO_EXPECTS(this->block_extent() == other.block_extent(),
+                 "mismatching num_blocks in intersect_async");
+    CUCO_CUDA_TRY(cub::DeviceTransform::Transform(
+      cuda::std::tuple{this->data(), other.data()},
+      this->data(),
+      this->block_extent() * words_per_block,
+      [] __device__(word_type a, word_type b) { return a & b; },
+      stream.get()));
+  }
+
+  [[nodiscard]] __host__ __device__ constexpr word_type* data() noexcept { return words_; }
+
+  [[nodiscard]] __host__ __device__ constexpr word_type const* data() const noexcept
   {
-    constexpr auto num_threads         = tile_size_v<CG>;
-    constexpr auto optimal_num_threads = add_optimal_cg_size();
-    constexpr auto worker_num_threads =
-      (num_threads < optimal_num_threads) ? num_threads : optimal_num_threads;
+    return words_;
+  }
 
-    // If single thread is optimal, use scalar add
-    if constexpr (worker_num_threads == 1) {
-      this->add(key);
+  [[nodiscard]] __host__ __device__ constexpr extent_type block_extent() const noexcept
+  {
+    return num_blocks_;
+  }
+
+  template <bool ConditionalAtomic = true, class BuildKey>
+  __device__ void add(BuildKey build_key)
+  {
+    auto const [upper_hash, lower_hash] = policy_.split_hash(build_key);
+    auto const block_index              = policy_.block_index(upper_hash, num_blocks_);
+
+    if constexpr (add_horizontal_layout == 1) {
+      add_pattern<ConditionalAtomic, 0>(block_index, lower_hash);
     } else {
-      auto const hash_value = policy_.hash(key);
-      this->add_impl(hash_value, policy_.block_index(hash_value, num_blocks_));
+#pragma unroll
+      for (uint32_t thread_index = 0; thread_index < add_horizontal_layout; ++thread_index) {
+        add_patterns<ConditionalAtomic, 0>(block_index, lower_hash, thread_index);
+      }
     }
   }
 
-  template <class CG, class InputIt>
-  __device__ void add(CG group, InputIt first, InputIt last)
+  template <bool ConditionalAtomic = true, class CG, class BuildKey>
+  __device__ void add(CG group, BuildKey build_key)
   {
     namespace cg = cooperative_groups;
 
-    constexpr auto num_threads         = tile_size_v<CG>;
-    constexpr auto optimal_num_threads = add_optimal_cg_size();
-    constexpr auto worker_num_threads =
-      (num_threads < optimal_num_threads) ? num_threads : optimal_num_threads;
-
-    auto const num_keys = cuco::detail::distance(first, last);
-    if (num_keys == 0) { return; }
+    if constexpr (add_horizontal_layout == 1 || tile_size_v<CG> != add_horizontal_layout) {
+      // Tile size doesn't match the layout. Pick one lane to do the scalar (layout-agnostic)
+      // insert; the rest wait at the implicit sync.
+#if defined(CUCO_HAS_CG_INVOKE_ONE)
+      cg::invoke_one(group, [&] __device__() { this->template add<ConditionalAtomic>(build_key); });
+#else
+      if (group.thread_rank() == 0) { this->template add<ConditionalAtomic>(build_key); }
+      group.sync();
+#endif
+    } else {
+      auto const [upper_hash, lower_hash, block_index] = [&] __device__ {
+#if defined(CUCO_HAS_CG_INVOKE_ONE)
+        if constexpr (tuning::use_invoke_one) {
+          return cg::invoke_one_broadcast(group, [&] __device__() {
+            auto const sh = policy_.split_hash(build_key);
+            return cuda::std::make_tuple(
+              sh.first, sh.second, policy_.block_index(sh.first, num_blocks_));
+          });
+        } else
+#endif
+        {
+          auto const sh = policy_.split_hash(build_key);
+          return cuda::std::make_tuple(
+            sh.first, sh.second, policy_.block_index(sh.first, num_blocks_));
+        }
+      }();
 
-    auto const rank = group.thread_rank();
+      add_patterns<ConditionalAtomic, 0>(block_index, lower_hash, group.thread_rank());
+    }
+  }
 
-    // If single thread is optimal, use scalar add
-    if constexpr (worker_num_threads == 1) {
-      for (auto i = rank; i < num_keys; i += num_threads) {
-        typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
-          *(first + i)};
-        this->add(insert_element);
-      }
-    } else if constexpr (num_threads == worker_num_threads) {  // given CG is optimal CG
-      typename policy_type::hash_result_type hash_value;
-      size_type block_index;
-
-      auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads);
-      for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) {
-        if (i + rank < num_keys) {
-          typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
-            *(first + i + rank)};
-          hash_value  = policy_.hash(insert_element);
-          block_index = policy_.block_index(hash_value, num_blocks_);
-        }
+  template <bool ConditionalAtomic = true, class CG, class BuildKey>
+  __device__ void add_coop(CG group, BuildKey build_key)
+  {
+    constexpr auto num_threads = tile_size_v<CG>;
 
-        add_group_functor<CG, decltype(hash_value), decltype(block_index), bloom_filter_impl>
-          functor{group,
-                  hash_value,
-                  block_index,
-                  static_cast<size_t>(i),
-                  static_cast<size_t>(num_keys),
-                  static_cast<size_t>(num_threads),
-                  this};
-        cuda::static_for<num_threads>(functor);
-      }
-    } else {  // subdivide given CG into multiple optimal CGs
-      typename policy_type::hash_result_type hash_value;
-      size_type block_index;
+    auto const [upper_hash, lower_hash] = policy_.split_hash(build_key);
+    auto const block_index              = policy_.block_index(upper_hash, num_blocks_);
 
-      auto const worker_group  = cg::tiled_partition<worker_num_threads, CG>(group);
-      auto const worker_offset = worker_num_threads * worker_group.meta_group_rank();
+#pragma unroll num_threads
+    for (int i = 0; i < num_threads; ++i) {
+      add_patterns<ConditionalAtomic, 0>(
+        group.shfl(block_index, i), group.shfl(lower_hash, i), group.thread_rank());
+    }
+  }
 
-      auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads);
+  template <bool ConditionalAtomic = true, class CG, class InputIt, class Index>
+  __device__ void add_coop(CG group, InputIt first, Index idx, bool is_valid)
+  {
+    constexpr auto num_threads = tile_size_v<CG>;
 
-      for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) {
-        if (i + rank < num_keys) {
-          typename cuda::std::iterator_traits<InputIt>::value_type const& key{*(first + i + rank)};
-          hash_value  = policy_.hash(key);
-          block_index = policy_.block_index(hash_value, num_blocks_);
-        }
+    uint32_t upper_hash   = 0;
+    uint32_t lower_hash   = 0;
+    size_type block_index = 0;
+    if (is_valid) {
+      auto const& key = *(first + idx);
+      auto const sh   = policy_.split_hash(key);
+      upper_hash      = sh.first;
+      lower_hash      = sh.second;
+      block_index     = policy_.block_index(upper_hash, num_blocks_);
+    }
 
-        add_worker_group_functor<decltype(worker_group),
-                                 decltype(hash_value),
-                                 decltype(block_index),
-                                 bloom_filter_impl>
-          functor{worker_group,
-                  hash_value,
-                  block_index,
-                  static_cast<size_t>(i),
-                  static_cast<size_t>(worker_offset),
-                  static_cast<size_t>(num_keys),
-                  static_cast<size_t>(worker_num_threads),
-                  this};
-        cuda::static_for<worker_num_threads>(functor);
+#pragma unroll num_threads
+    for (int i = 0; i < num_threads; ++i) {
+      if (group.shfl(is_valid, i)) {
+        add_patterns<ConditionalAtomic, 0>(
+          group.shfl(block_index, i), group.shfl(lower_hash, i), group.thread_rank());
       }
     }
   }
 
-  template <class CG, class HashValue, class BlockIndex>
-  __device__ void add_impl(CG group, HashValue const& hash_value, BlockIndex block_index)
+  template <class CG, class InputIt>
+  __device__ void add(CG group, InputIt first, InputIt last)
   {
-    constexpr auto num_threads = tile_size_v<CG>;
+    auto num_keys = cuco::detail::distance(first, last);
+    if constexpr (tile_size_v<CG> == add_horizontal_layout && add_horizontal_layout > 1) {
+      auto constexpr num_threads = static_cast<decltype(num_keys)>(tile_size_v<CG>);
+      for (decltype(num_keys) batch = 0; batch < num_keys; batch += num_threads) {
+        auto const idx      = batch + static_cast<decltype(num_keys)>(group.thread_rank());
+        auto const is_valid = idx < num_keys;
+        this->template add_coop<true>(group, first, idx, is_valid);
+      }
+    } else {
+      auto const stride = static_cast<decltype(num_keys)>(tile_size_v<CG>);
+      for (auto i = static_cast<decltype(num_keys)>(group.thread_rank()); i < num_keys;
+           i += stride) {
+        this->add(*(first + i));
+      }
+    }
+  }
 
-    auto const rank = group.thread_rank();
+  template <class InputIt>
+  __host__ void add_async(InputIt first, InputIt last, cuda::stream_ref stream) noexcept
+  {
+    auto const num_keys = cuco::detail::distance(first, last);
+    if (num_keys == 0) { return; }
 
-    if constexpr (num_threads == words_per_block) {
-      auto atom_word = cuda::atomic_ref<word_type, thread_scope>{
-        *(words_ + (block_index * words_per_block + rank))};
-      atom_word.fetch_or(policy_.word_pattern(hash_value, rank), cuda::memory_order_relaxed);
+    auto constexpr block_size = 256;
+    auto constexpr cg_size    = static_cast<int32_t>(add_horizontal_layout);
+    auto const grid_size      = tuning::use_warp_cooperative_add_kernel
+                                  ? cuco::detail::int_div_ceil(num_keys, block_size)
+                                  : cuco::detail::int_div_ceil(num_keys * cg_size, block_size);
+    auto const filter_size    = static_cast<size_t>(static_cast<size_type>(num_blocks_)) *
+                             words_per_block * sizeof(word_type);
+
+    if (2 * filter_size < l2_cache_size_) {
+      if constexpr (tuning::use_work_stealing_add_kernel) {
+        detail::bloom_filter_ns::add_work_stealing_n<false, cg_size, block_size>
+          <<<grid_size, block_size, 0, stream.get()>>>(first, num_keys, *this);
+      } else {
+        detail::bloom_filter_ns::add_n<false, cg_size, block_size>
+          <<<grid_size, block_size, 0, stream.get()>>>(first, num_keys, *this);
+      }
     } else {
-      add_impl_group_functor<HashValue, BlockIndex, word_type, thread_scope, policy_type> functor{
-        hash_value, block_index, words_, words_per_block, rank, num_threads, policy_};
-      cuda::static_for<words_per_block>(functor);
+      if constexpr (tuning::use_work_stealing_add_kernel) {
+        detail::bloom_filter_ns::add_work_stealing_n<true, cg_size, block_size>
+          <<<grid_size, block_size, 0, stream.get()>>>(first, num_keys, *this);
+      } else {
+        detail::bloom_filter_ns::add_n<true, cg_size, block_size>
+          <<<grid_size, block_size, 0, stream.get()>>>(first, num_keys, *this);
+      }
     }
   }
 
   template <class InputIt>
-  __host__ constexpr void add(InputIt first, InputIt last, cuda::stream_ref stream)
+  __host__ void add(InputIt first, InputIt last, cuda::stream_ref stream) noexcept
   {
     this->add_async(first, last, stream);
 #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
@@ -471,125 +381,165 @@ class bloom_filter_impl {
 #endif
   }
 
-  template <class InputIt>
-  __host__ constexpr void add_async(InputIt first, InputIt last, cuda::stream_ref stream)
+  template <class ProbeKey>
+  __device__ bool contains(ProbeKey probe_key) const
   {
-    auto const num_keys = cuco::detail::distance(first, last);
-    if (num_keys == 0) { return; }
+    auto const [upper_hash, lower_hash] = policy_.split_hash(probe_key);
+    auto const block_index              = policy_.block_index(upper_hash, num_blocks_);
 
-    if constexpr (words_per_block == 1) {
-      CUCO_CUDA_TRY(cub::DeviceFor::ForEachCopyN(
-        first,
-        num_keys,
-        [*this] __device__(key_type const key) mutable { this->add(key); },
-        stream.get()));
+    if constexpr (contains_horizontal_layout == 1) {
+      return compare_pattern<0>(block_index, lower_hash);
     } else {
-      auto const num_keys = cuco::detail::distance(first, last);
-      if (num_keys == 0) { return; }
-
-      auto constexpr block_size = cuco::detail::default_block_size();
-      void const* kernel        = reinterpret_cast<void const*>(
-        detail::bloom_filter_ns::add<block_size, InputIt, bloom_filter_impl>);
-      auto const grid_size = cuco::detail::max_occupancy_grid_size(block_size, kernel);
-
-      detail::bloom_filter_ns::add<block_size>
-        <<<grid_size, block_size, 0, stream.get()>>>(first, num_keys, *this);
+      bool result = true;
+#pragma unroll
+      for (uint32_t thread_index = 0; thread_index < contains_horizontal_layout; ++thread_index) {
+        result = result && compare_patterns<0>(block_index, lower_hash, thread_index);
+      }
+      return result;
     }
   }
 
-  template <class InputIt, class StencilIt, class Predicate>
-  __host__ constexpr void add_if(
-    InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream)
+  template <class CG, class ProbeKey>
+  __device__ bool contains(CG group, ProbeKey probe_key) const
   {
-    this->add_if_async(first, last, stencil, pred, stream);
-#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
-    stream.sync();
+    namespace cg = cooperative_groups;
+
+    if constexpr (contains_horizontal_layout == 1 ||
+                  tile_size_v<CG> != contains_horizontal_layout) {
+#if defined(CUCO_HAS_CG_INVOKE_ONE)
+      return cg::invoke_one_broadcast(
+        group, [&] __device__() -> bool { return this->contains(probe_key); });
 #else
-    stream.wait();
+      // All lanes recompute the same deterministic scalar query.
+      return this->contains(probe_key);
 #endif
+    } else {
+      auto const [upper_hash, lower_hash, block_index] = [&] __device__ {
+#if defined(CUCO_HAS_CG_INVOKE_ONE)
+        if constexpr (tuning::use_invoke_one) {
+          return cg::invoke_one_broadcast(group, [&] __device__() {
+            auto const sh = policy_.split_hash(probe_key);
+            return cuda::std::make_tuple(
+              sh.first, sh.second, policy_.block_index(sh.first, num_blocks_));
+          });
+        } else
+#endif
+        {
+          auto const sh = policy_.split_hash(probe_key);
+          return cuda::std::make_tuple(
+            sh.first, sh.second, policy_.block_index(sh.first, num_blocks_));
+        }
+      }();
+
+      return group.all(compare_patterns<0>(block_index, lower_hash, group.thread_rank()));
+    }
   }
 
-  template <class InputIt, class StencilIt, class Predicate>
-  __host__ constexpr void add_if_async(InputIt first,
-                                       InputIt last,
-                                       StencilIt stencil,
-                                       Predicate pred,
-                                       cuda::stream_ref stream) noexcept
+  template <class CG, class ProbeKey>
+  __device__ bool contains_coop(CG group, ProbeKey probe_key) const
   {
-    auto const num_keys = cuco::detail::distance(first, last);
-    if (num_keys == 0) { return; }
+    constexpr auto num_threads = tile_size_v<CG>;
 
-    auto constexpr cg_size    = add_optimal_cg_size();
-    auto constexpr block_size = cuco::detail::default_block_size();
-    auto const grid_size =
-      cuco::detail::grid_size(num_keys, cg_size, cuco::detail::default_stride(), block_size);
+    auto const [upper_hash, lower_hash] = policy_.split_hash(probe_key);
+    auto const block_index              = policy_.block_index(upper_hash, num_blocks_);
+    bool result_out                     = false;
 
-    detail::bloom_filter_ns::add_if_n<cg_size, block_size>
-      <<<grid_size, block_size, 0, stream.get()>>>(first, num_keys, stencil, pred, *this);
+#pragma unroll num_threads
+    for (int i = 0; i < num_threads; ++i) {
+      auto const result = group.all(compare_patterns<0>(
+        group.shfl(block_index, i), group.shfl(lower_hash, i), group.thread_rank()));
+      if (i == group.thread_rank()) { result_out = result; }
+    }
+    return result_out;
   }
 
-  template <class ProbeKey>
-  [[nodiscard]] __device__ bool contains(ProbeKey const& key) const
+  template <class CG, class InputIt, class Index>
+  __device__ bool contains_coop(CG group, InputIt first, Index idx, bool is_valid) const
   {
-    auto const hash_value = policy_.hash(key);
-
-    auto const stored_pattern = this->vec_load_words<words_per_block>(
-      policy_.block_index(hash_value, num_blocks_) * words_per_block);
+    constexpr auto num_threads = tile_size_v<CG>;
 
-    bool result = true;
-    contains_functor<decltype(hash_value), decltype(stored_pattern), policy_type> functor{
-      hash_value, stored_pattern, policy_, &result};
-    cuda::static_for<words_per_block>(functor);
-    if (!result) { return false; }
+    uint32_t upper_hash   = 0;
+    uint32_t lower_hash   = 0;
+    size_type block_index = 0;
+    if (is_valid) {
+      auto const& key = *(first + idx);
+      auto const sh   = policy_.split_hash(key);
+      upper_hash      = sh.first;
+      lower_hash      = sh.second;
+      block_index     = policy_.block_index(upper_hash, num_blocks_);
+    }
 
-    return true;
+    bool result_out = false;
+#pragma unroll num_threads
+    for (int i = 0; i < num_threads; ++i) {
+      if (group.shfl(is_valid, i)) {
+        auto const result = group.all(compare_patterns<0>(
+          group.shfl(block_index, i), group.shfl(lower_hash, i), group.thread_rank()));
+        if (i == group.thread_rank()) { result_out = result; }
+      }
+    }
+    return result_out;
   }
 
-  template <class CG, class ProbeKey>
-  [[nodiscard]] __device__ bool contains(CG group, ProbeKey const& key) const
+  template <class CG, class InputIt, class OutputIt>
+  __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin) const
   {
-    constexpr auto num_threads         = tile_size_v<CG>;
-    constexpr auto optimal_num_threads = contains_optimal_cg_size();
-    constexpr auto words_per_thread    = words_per_block / optimal_num_threads;
-
-    // If single thread is optimal, use scalar contains
-    if constexpr (num_threads == 1 or optimal_num_threads == 1) {
-      return this->contains(key);
+    auto num_keys = cuco::detail::distance(first, last);
+    if constexpr (tile_size_v<CG> == contains_horizontal_layout && contains_horizontal_layout > 1) {
+      auto constexpr num_threads = static_cast<decltype(num_keys)>(tile_size_v<CG>);
+      for (decltype(num_keys) batch = 0; batch < num_keys; batch += num_threads) {
+        auto const idx      = batch + static_cast<decltype(num_keys)>(group.thread_rank());
+        auto const is_valid = idx < num_keys;
+        auto const result   = this->contains_coop(group, first, idx, is_valid);
+        if (is_valid) { *(output_begin + idx) = result; }
+      }
     } else {
-      auto const rank       = group.thread_rank();
-      auto const hash_value = policy_.hash(key);
-      bool success          = true;
+      auto const stride = static_cast<decltype(num_keys)>(tile_size_v<CG>);
+      for (auto i = static_cast<decltype(num_keys)>(group.thread_rank()); i < num_keys;
+           i += stride) {
+        *(output_begin + i) = this->contains(*(first + i));
+      }
+    }
+  }
 
-// Use pragma unroll instead of cuda::static_for to avoid CUDA 12.0 compatibility issues
-#pragma unroll
-      for (size_type i = 0; i < optimal_num_threads; ++i) {
-        if (i >= rank && (i - rank) % num_threads == 0) {
-          auto const thread_offset  = i * words_per_thread;
-          auto const stored_pattern = this->vec_load_words<words_per_thread>(
-            policy_.block_index(hash_value, num_blocks_) * words_per_block + thread_offset);
+  template <class InputIt, class OutputIt>
+  __host__ void contains_async(InputIt first,
+                               InputIt last,
+                               OutputIt output_begin,
+                               cuda::stream_ref stream) const noexcept
+  {
+    auto const num_keys = cuco::detail::distance(first, last);
+    if (num_keys == 0) { return; }
 
-#pragma unroll
-          for (size_type j = 0; j < words_per_thread; ++j) {
-            auto const expected_pattern = policy_.word_pattern(hash_value, thread_offset + j);
-            if ((stored_pattern[j] & expected_pattern) != expected_pattern) { success = false; }
-          }
-        }
+    if constexpr (tuning::use_cub_kernels and ((words_per_block / contains_vertical_layout) == 1)) {
+      cub::DeviceTransform::Transform(
+        first,
+        output_begin,
+        num_keys,
+        [*this] __device__(auto const& key) { return this->contains(key); },
+        stream.get());
+    } else {
+      auto constexpr block_size = 256;
+      auto constexpr cg_size    = static_cast<int32_t>(contains_horizontal_layout);
+      auto const grid_size      = tuning::use_warp_cooperative_contains_kernel
+                                    ? cuco::detail::int_div_ceil(num_keys, block_size)
+                                    : cuco::detail::int_div_ceil(num_keys * cg_size, block_size);
+
+      if constexpr (tuning::use_work_stealing_contains_kernel) {
+        detail::bloom_filter_ns::contains_work_stealing_n<cg_size, block_size>
+          <<<grid_size, block_size, 0, stream.get()>>>(first, num_keys, output_begin, *this);
+      } else {
+        detail::bloom_filter_ns::contains_n<cg_size, block_size>
+          <<<grid_size, block_size, 0, stream.get()>>>(first, num_keys, output_begin, *this);
       }
-
-      return group.all(success);
     }
   }
 
-  // TODO
-  // template <class CG, class InputIt, class OutputIt>
-  // __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin)
-  // const;
-
   template <class InputIt, class OutputIt>
   __host__ void contains(InputIt first,
                          InputIt last,
                          OutputIt output_begin,
-                         cuda::stream_ref stream) const
+                         cuda::stream_ref stream) const noexcept
   {
     this->contains_async(first, last, output_begin, stream);
 #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
@@ -599,25 +549,41 @@ class bloom_filter_impl {
 #endif
   }
 
-  template <class InputIt, class OutputIt>
-  __host__ void contains_async(InputIt first,
-                               InputIt last,
-                               OutputIt output_begin,
-                               cuda::stream_ref stream) const noexcept
+  template <class InputIt, class StencilIt, class Predicate>
+  __host__ void add_if_async(InputIt first,
+                             InputIt last,
+                             StencilIt stencil,
+                             Predicate pred,
+                             cuda::stream_ref stream) noexcept
   {
-    auto const always_true = cuda::constant_iterator<bool>{true};
-    this->contains_if_async(first, last, always_true, cuda::std::identity{}, output_begin, stream);
+    auto const num_keys = cuco::detail::distance(first, last);
+    if (num_keys == 0) { return; }
+
+    auto constexpr block_size = 256;
+    auto constexpr cg_size    = static_cast<int32_t>(add_horizontal_layout);
+    auto const grid_size      = tuning::use_warp_cooperative_add_kernel
+                                  ? cuco::detail::int_div_ceil(num_keys, block_size)
+                                  : cuco::detail::int_div_ceil(num_keys * cg_size, block_size);
+    auto const filter_size    = static_cast<size_t>(static_cast<size_type>(num_blocks_)) *
+                             words_per_block * sizeof(word_type);
+
+    if (2 * filter_size < l2_cache_size_) {
+      detail::bloom_filter_ns::add_if_n<false, cg_size, block_size>
+        <<<grid_size, block_size, 0, stream.get()>>>(first, num_keys, stencil, pred, *this);
+    } else {
+      detail::bloom_filter_ns::add_if_n<true, cg_size, block_size>
+        <<<grid_size, block_size, 0, stream.get()>>>(first, num_keys, stencil, pred, *this);
+    }
   }
 
-  template <class InputIt, class StencilIt, class Predicate, class OutputIt>
-  __host__ void contains_if(InputIt first,
-                            InputIt last,
-                            StencilIt stencil,
-                            Predicate pred,
-                            OutputIt output_begin,
-                            cuda::stream_ref stream) const
+  template <class InputIt, class StencilIt, class Predicate>
+  __host__ void add_if(InputIt first,
+                       InputIt last,
+                       StencilIt stencil,
+                       Predicate pred,
+                       cuda::stream_ref stream) noexcept
   {
-    this->contains_if_async(first, last, stencil, pred, output_begin, stream);
+    this->add_if_async(first, last, stencil, pred, stream);
 #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
     stream.sync();
 #else
@@ -636,20 +602,26 @@ class bloom_filter_impl {
     auto const num_keys = cuco::detail::distance(first, last);
     if (num_keys == 0) { return; }
 
-    auto constexpr cg_size    = contains_optimal_cg_size();
-    auto constexpr block_size = cuco::detail::default_block_size();
-    auto const grid_size =
-      cuco::detail::grid_size(num_keys, cg_size, cuco::detail::default_stride(), block_size);
+    auto constexpr block_size = 256;
+    auto constexpr cg_size    = static_cast<int32_t>(contains_horizontal_layout);
+    auto const grid_size      = tuning::use_warp_cooperative_contains_kernel
+                                  ? cuco::detail::int_div_ceil(num_keys, block_size)
+                                  : cuco::detail::int_div_ceil(num_keys * cg_size, block_size);
 
     detail::bloom_filter_ns::contains_if_n<cg_size, block_size>
       <<<grid_size, block_size, 0, stream.get()>>>(
         first, num_keys, stencil, pred, output_begin, *this);
   }
 
-  __host__ constexpr void merge(bloom_filter_impl<Key, Extent, Scope, Policy> const& other,
-                                cuda::stream_ref stream)
+  template <class InputIt, class StencilIt, class Predicate, class OutputIt>
+  __host__ void contains_if(InputIt first,
+                            InputIt last,
+                            StencilIt stencil,
+                            Predicate pred,
+                            OutputIt output_begin,
+                            cuda::stream_ref stream) const noexcept
   {
-    this->merge_async(other, stream);
+    this->contains_if_async(first, last, stencil, pred, output_begin, stream);
 #if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
     stream.sync();
 #else
@@ -657,86 +629,174 @@ class bloom_filter_impl {
 #endif
   }
 
-  __host__ constexpr void merge_async(bloom_filter_impl<Key, Extent, Scope, Policy> const& other,
-                                      cuda::stream_ref stream)
-  {
-    CUCO_EXPECTS(this->block_extent() == other.block_extent(),
-                 "mismatching num_blocks in merge_async");
-    CUCO_CUDA_TRY(cub::DeviceTransform::Transform(
-      cuda::std::tuple{this->data(), other.data()},
-      this->data(),
-      this->block_extent() * words_per_block,
-      [] __device__(word_type a, word_type b) { return a | b; },
-      stream.get()));
-  }
+  // TODO
+  // [[nodiscard]] __host__ double occupancy() const;
+  // [[nodiscard]] __host__ double expected_false_positive_rate(size_t unique_keys) const
+  // [[nodiscard]] __host__ __device__ static uint32_t optimal_pattern_bits(size_t num_blocks)
+  // template <typename CG, cuda::thread_scope NewScope = thread_scope>
+  // [[nodiscard]] __device__ constexpr auto make_copy(CG group, word_type* const
+  // memory_to_use, cuda_thread_scope<NewScope> scope = {}) const noexcept;
 
-  __host__ constexpr void intersect(bloom_filter_impl<Key, Extent, Scope, Policy> const& other,
-                                    cuda::stream_ref stream)
+  //  private:
+  template <uint32_t NumWords>
+  __device__ constexpr cuda::std::array<word_type, NumWords> vec_load_words(size_type index) const
   {
-    this->intersect_async(other, stream);
-#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
-    stream.sync();
-#else
-    stream.wait();
-#endif
+    // The block storage is aligned to `alignment()`, but a per-lane load at offset `index` is
+    // only guaranteed to be aligned to `min(NumWords * sizeof(word_type), alignment())`. Hand the
+    // compiler the alignment that's actually delivered, not the block-level maximum.
+    constexpr auto load_alignment =
+      cuda::std::min<size_t>(NumWords * sizeof(word_type), alignment());
+    return *reinterpret_cast<cuda::std::array<word_type, NumWords>*>(
+      __builtin_assume_aligned(words_ + index, load_alignment));
   }
 
-  __host__ constexpr void intersect_async(
-    bloom_filter_impl<Key, Extent, Scope, Policy> const& other, cuda::stream_ref stream)
+  template <bool ConditionalAtomic, uint32_t LoopIndex>
+  __device__ constexpr void add_pattern(uint32_t block_index, uint32_t lower_hash)
   {
-    CUCO_EXPECTS(this->block_extent() == other.block_extent(),
-                 "mismatching num_blocks in intersect_async");
-    CUCO_CUDA_TRY(cub::DeviceTransform::Transform(
-      cuda::std::tuple{this->data(), other.data()},
-      this->data(),
-      this->block_extent() * words_per_block,
-      [] __device__(word_type a, word_type b) { return a & b; },
-      stream.get()));
-  }
+    static_assert(add_horizontal_layout == 1, "add_pattern() requires add_horizontal_layout == 1");
 
-  [[nodiscard]] __host__ __device__ constexpr word_type* data() noexcept { return words_; }
+    if constexpr (LoopIndex < add_loop_count) {
+      auto const pattern =
+        policy_.template array_pattern<LoopIndex, add_vertical_layout>(lower_hash);
+      auto* word_base = words_ + block_index * words_per_block + LoopIndex * add_vertical_layout;
 
-  [[nodiscard]] __host__ __device__ constexpr word_type const* data() const noexcept
-  {
-    return words_;
+      for (int i = 0; i < add_vertical_layout; ++i) {
+        atomic_or<ConditionalAtomic>(word_base + i, pattern[i]);
+      }
+
+      // Recurse.
+      add_pattern<ConditionalAtomic, LoopIndex + 1>(block_index, lower_hash);
+    }
   }
 
-  [[nodiscard]] __host__ __device__ constexpr extent_type block_extent() const noexcept
+  template <bool ConditionalAtomic, uint32_t LoopIndex>
+  __device__ constexpr void add_patterns(uint32_t block_index,
+                                         uint32_t lower_hash,
+                                         uint32_t thread_index)
   {
-    return num_blocks_;
+    static_assert(add_horizontal_layout > 1, "add_patterns() requires add_horizontal_layout > 1");
+
+    if constexpr (LoopIndex < add_loop_count) {
+      auto const pattern =
+        policy_.template array_pattern<LoopIndex, add_horizontal_layout, add_vertical_layout>(
+          lower_hash, thread_index);
+      auto* word_base = words_ + block_index * words_per_block +
+                        LoopIndex * add_vertical_layout * add_horizontal_layout +
+                        thread_index * add_vertical_layout;
+
+      for (int i = 0; i < add_vertical_layout; ++i) {
+        atomic_or<ConditionalAtomic>(word_base + i, pattern[i]);
+      }
+
+      // Recurse.
+      add_patterns<ConditionalAtomic, LoopIndex + 1>(block_index, lower_hash, thread_index);
+    }
   }
 
-  // TODO
-  // [[nodiscard]] __host__ double occupancy() const;
-  // [[nodiscard]] __host__ double expected_false_positive_rate(size_t unique_keys) const
-  // [[nodiscard]] __host__ __device__ static uint32_t optimal_pattern_bits(size_t num_blocks)
-  // template <typename CG, cuda::thread_scope NewScope = thread_scope>
-  // [[nodiscard]] __device__ constexpr auto make_copy(CG group, word_type* const
-  // memory_to_use, cuda_thread_scope<NewScope> scope = {}) const noexcept;
+  template <bool ConditionalAtomic>
+  __device__ constexpr void atomic_or(word_type* word_ptr, word_type pattern) const
+  {
+    auto const do_or = [&]() {
+      if constexpr (tuning::use_cuda_atomic_ref) {
+        cuda::atomic_ref<word_type, thread_scope>{*word_ptr}.fetch_or(pattern,
+                                                                      cuda::memory_order_relaxed);
+      } else {
+        auto* const p = reinterpret_cast<atomic_word_type*>(word_ptr);
+        auto const v  = static_cast<atomic_word_type>(pattern);
+        if constexpr (thread_scope == cuda::thread_scope_thread) {
+          *p |= v;
+        } else if constexpr (thread_scope == cuda::thread_scope_block) {
+          atomicOr_block(p, v);
+        } else if constexpr (thread_scope == cuda::thread_scope_device) {
+          atomicOr(p, v);
+        } else if constexpr (thread_scope == cuda::thread_scope_system) {
+          atomicOr_system(p, v);
+        } else {
+          static_assert(cuco::dependent_false<word_type>,
+                        "unsupported cuda::thread_scope for native atomic_or");
+        }
+      }
+    };
 
- private:
-  template <uint32_t NumWords>
-  __device__ constexpr cuda::std::array<word_type, NumWords> vec_load_words(size_type index) const
-  {
-    return *reinterpret_cast<cuda::std::array<word_type, NumWords>*>(__builtin_assume_aligned(
-      words_ + index, cuda::std::min(sizeof(word_type) * NumWords, max_vec_bytes())));
+    if constexpr (ConditionalAtomic) {
+      if ((*word_ptr & pattern) != pattern) { do_or(); }
+    } else {
+      do_or();
+    }
   }
 
-  [[nodiscard]] __host__ __device__ static constexpr int32_t add_optimal_cg_size()
+  /// Compare the stored pattern against the expected pattern for the given hash value.
+  // Precondition: contains_horizontal_layout == 1
+  template <uint32_t LoopIndex>
+  __device__ constexpr bool compare_pattern(uint32_t block_index, uint32_t lower_hash) const
   {
-    return words_per_block;  // one thread per word so atomic updates can be coalesced
+    static_assert(contains_horizontal_layout == 1,
+                  "compare_pattern() requires contains_horizontal_layout == 1");
+
+    if constexpr (LoopIndex < contains_loop_count) {
+      auto const stored_pattern = this->vec_load_words<contains_vertical_layout>(
+        block_index * words_per_block + LoopIndex * contains_vertical_layout);
+      auto const expected_pattern =
+        policy_.template array_pattern<LoopIndex, contains_vertical_layout>(lower_hash);
+
+      bool match = true;
+      for (int i = 0; i < contains_vertical_layout; ++i) {
+        match &= (stored_pattern[i] & expected_pattern[i]) == expected_pattern[i];
+      }
+
+      // Recurse.
+      // Early exit in this implementation occurs at the granulairy of contains_vertical_layout
+      // words.
+      if constexpr (tuning::use_early_exit) {
+        if (!match) { return false; }
+        return compare_pattern<LoopIndex + 1>(block_index, lower_hash);
+      } else {
+        return compare_pattern<LoopIndex + 1>(block_index, lower_hash) && match;
+      }
+    } else {
+      return true;
+    }
   }
 
-  [[nodiscard]] __host__ __device__ static constexpr int32_t contains_optimal_cg_size()
+  template <uint32_t LoopIndex>
+  __device__ constexpr bool compare_patterns(uint32_t block_index,
+                                             uint32_t lower_hash,
+                                             uint32_t thread_index) const
   {
-    constexpr auto word_bytes  = sizeof(word_type);
-    constexpr auto block_bytes = word_bytes * words_per_block;
-    return block_bytes / max_vec_bytes();  // one vector load per thread
+    static_assert(contains_horizontal_layout > 1,
+                  "compare_patterns() requires HorizontalLayout > 1");
+
+    if constexpr (LoopIndex < contains_loop_count) {
+      auto const stored_pattern = this->vec_load_words<contains_vertical_layout>(
+        block_index * words_per_block +
+        LoopIndex * contains_vertical_layout * contains_horizontal_layout +
+        thread_index * contains_vertical_layout);
+      auto const expected_pattern =
+        policy_
+          .template array_pattern<LoopIndex, contains_horizontal_layout, contains_vertical_layout>(
+            lower_hash, thread_index);
+
+      bool match = true;
+      for (int i = 0; i < contains_vertical_layout; ++i) {
+        match &= (stored_pattern[i] & expected_pattern[i]) == expected_pattern[i];
+      }
+
+      // Per-thread early exit: short-circuit this thread's recursion if its slice already missed.
+      if constexpr (tuning::use_early_exit) {
+        if (!match) { return false; }
+        return compare_patterns<LoopIndex + 1>(block_index, lower_hash, thread_index);
+      } else {
+        return compare_patterns<LoopIndex + 1>(block_index, lower_hash, thread_index) && match;
+      }
+    } else {
+      return true;
+    }
   }
 
   word_type* words_;
   extent_type num_blocks_;
   policy_type policy_;
+  size_t l2_cache_size_ = 0;
 };
 
 }  // namespace cuco::detail
diff --git a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl
index b251dceee..df95af888 100644
--- a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl
+++ b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,14 +24,14 @@
 namespace cuco {
 
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
-__host__ __device__ constexpr bloom_filter_ref<Key, Extent, Scope, Policy>::bloom_filter_ref(
+__host__ __device__ bloom_filter_ref<Key, Extent, Scope, Policy>::bloom_filter_ref(
   filter_block_type* data, Extent num_blocks, cuda_thread_scope<Scope>, Policy const& policy)
   : impl_{data, num_blocks, {}, policy}
 {
 }
 
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
-__host__ __device__ constexpr bloom_filter_ref<Key, Extent, Scope, Policy>::bloom_filter_ref(
+__host__ __device__ bloom_filter_ref<Key, Extent, Scope, Policy>::bloom_filter_ref(
   word_type* data, Extent num_blocks, cuda_thread_scope<Scope>, Policy const& policy)
   : impl_{data, num_blocks, {}, policy}
 {
@@ -92,7 +92,7 @@ __host__ constexpr void bloom_filter_ref<Key, Extent, Scope, Policy>::add(InputI
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
 template <class InputIt>
 __host__ constexpr void bloom_filter_ref<Key, Extent, Scope, Policy>::add_async(
-  InputIt first, InputIt last, cuda::stream_ref stream)
+  InputIt first, InputIt last, cuda::stream_ref stream) noexcept
 {
   impl_.add_async(first, last, stream);
 }
@@ -129,6 +129,16 @@ template <class CG, class ProbeKey>
   return impl_.contains(group, key);
 }
 
+template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
+template <class CG, class InputIt, class OutputIt>
+__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::contains(CG group,
+                                                                       InputIt first,
+                                                                       InputIt last,
+                                                                       OutputIt output_begin) const
+{
+  impl_.contains(group, first, last, output_begin);
+}
+
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
 template <class InputIt, class OutputIt>
 __host__ constexpr void bloom_filter_ref<Key, Extent, Scope, Policy>::contains(
diff --git a/include/cuco/detail/bloom_filter/default_filter_policy.inl b/include/cuco/detail/bloom_filter/default_filter_policy.inl
deleted file mode 100644
index eb8dbf703..000000000
--- a/include/cuco/detail/bloom_filter/default_filter_policy.inl
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cstdint>
-
-namespace cuco {
-
-template <class Hash, class Word, uint32_t WordsPerBlock>
-__host__
-  __device__ constexpr default_filter_policy<Hash, Word, WordsPerBlock>::default_filter_policy(
-    uint32_t pattern_bits, Hash hash)
-  : impl_{pattern_bits, hash}
-{
-}
-
-template <class Hash, class Word, uint32_t WordsPerBlock>
-__device__ constexpr typename default_filter_policy<Hash, Word, WordsPerBlock>::hash_result_type
-default_filter_policy<Hash, Word, WordsPerBlock>::hash(
-  typename default_filter_policy<Hash, Word, WordsPerBlock>::hash_argument_type const& key) const
-{
-  return impl_.hash(key);
-}
-
-template <class Hash, class Word, uint32_t WordsPerBlock>
-template <class Extent>
-__device__ constexpr auto default_filter_policy<Hash, Word, WordsPerBlock>::block_index(
-  typename default_filter_policy<Hash, Word, WordsPerBlock>::hash_result_type hash,
-  Extent num_blocks) const
-{
-  return impl_.block_index(hash, num_blocks);
-}
-
-template <class Hash, class Word, uint32_t WordsPerBlock>
-__device__ constexpr typename default_filter_policy<Hash, Word, WordsPerBlock>::word_type
-default_filter_policy<Hash, Word, WordsPerBlock>::word_pattern(
-  default_filter_policy<Hash, Word, WordsPerBlock>::hash_result_type hash,
-  std::uint32_t word_index) const
-{
-  return impl_.word_pattern(hash, word_index);
-}
-
-}  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh b/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh
deleted file mode 100644
index 14509b9b0..000000000
--- a/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cuco/detail/error.hpp>
-
-#include <cuda/std/bit>
-#include <cuda/std/limits>
-#include <cuda/std/tuple>
-#include <cuda/std/type_traits>
-
-#include <cstdint>
-#include <nv/target>
-
-namespace cuco::detail {
-
-template <class Hash, class Word, uint32_t WordsPerBlock>
-class default_filter_policy_impl {
- public:
-  using hasher             = Hash;
-  using word_type          = Word;
-  using hash_argument_type = typename hasher::argument_type;
-  using hash_result_type   = decltype(std::declval<hasher>()(std::declval<hash_argument_type>()));
-
-  static constexpr std::uint32_t words_per_block = WordsPerBlock;
-
- private:
-  static constexpr std::uint32_t word_bits       = cuda::std::numeric_limits<word_type>::digits;
-  static constexpr std::uint32_t bit_index_width = cuda::std::bit_width(word_bits - 1);
-
- public:
-  __host__ __device__ explicit constexpr default_filter_policy_impl(uint32_t pattern_bits,
-                                                                    Hash hash)
-    : pattern_bits_{pattern_bits},
-      min_bits_per_word_{pattern_bits_ / words_per_block},
-      remainder_bits_{pattern_bits_ % words_per_block},
-      hash_{hash}
-  {
-    NV_DISPATCH_TARGET(
-      NV_IS_HOST,
-      (  // This ensures each word in the block has at least one bit set; otherwise we would never
-         // use some of the words
-        constexpr uint32_t min_pattern_bits = words_per_block;
-
-        // The maximum number of bits to be set for a key is capped by the total number of bits in
-        // the filter block
-        constexpr uint32_t max_pattern_bits = word_bits * words_per_block;
-
-        constexpr uint32_t hash_bits = cuda::std::numeric_limits<hash_result_type>::digits;
-        constexpr uint32_t max_pattern_bits_from_hash = hash_bits / bit_index_width;
-        CUCO_EXPECTS(
-          pattern_bits <= max_pattern_bits_from_hash,
-          "`hash_result_type` too narrow to generate the requested number of `pattern_bits`");
-        CUCO_EXPECTS(pattern_bits_ >= min_pattern_bits,
-                     "`pattern_bits` must be at least `words_per_block`");
-        CUCO_EXPECTS(pattern_bits_ <= max_pattern_bits,
-                     "`pattern_bits` must be less than the total number of bits in a filter "
-                     "block");))
-    // TODO find a proper way to perform input checks/assertions on device without destroying the
-    // context (e.g. __trap())
-  }
-
-  __device__ constexpr hash_result_type hash(hash_argument_type const& key) const
-  {
-    return hash_(key);
-  }
-
-  template <class Extent>
-  __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const
-  {
-    return hash % num_blocks;
-  }
-
-  __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const
-  {
-    word_type constexpr bit_index_mask = (word_type{1} << bit_index_width) - 1;
-
-    auto const bits_so_far = min_bits_per_word_ * word_index +
-                             (word_index < remainder_bits_ ? word_index : remainder_bits_);
-
-    hash >>= bits_so_far * bit_index_width;
-
-    word_type word              = 0;
-    int32_t const bits_per_word = min_bits_per_word_ + (word_index < remainder_bits_ ? 1 : 0);
-
-    for (int32_t bit = 0; bit < bits_per_word; ++bit) {
-      word |= word_type{1} << (hash & bit_index_mask);
-      hash >>= bit_index_width;
-    }
-
-    return word;
-  }
-
- private:
-  uint32_t pattern_bits_;
-  uint32_t min_bits_per_word_;
-  uint32_t remainder_bits_;
-  hasher hash_;
-};
-
-}  // namespace cuco::detail
\ No newline at end of file
diff --git a/include/cuco/detail/bloom_filter/kernels.cuh b/include/cuco/detail/bloom_filter/kernels.cuh
index 6c918fb7d..d4264a2b1 100644
--- a/include/cuco/detail/bloom_filter/kernels.cuh
+++ b/include/cuco/detail/bloom_filter/kernels.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,42 +15,325 @@
  */
 #pragma once
 
+#include <cuco/detail/__config>
 #include <cuco/detail/utility/cuda.cuh>
 #include <cuco/detail/utility/math.cuh>
 
+#include <cuda/ptx>
 #include <cuda/std/iterator>
 
 #include <cooperative_groups.h>
 
 #include <cstdint>
+#include <nv/target>
 
 namespace cuco::detail::bloom_filter_ns {
 
 CUCO_SUPPRESS_KERNEL_WARNINGS
+template <bool ConditionalAtomic, int32_t CGSize, int32_t BlockSize, class InputIt, class Ref>
+__device__ void add_n_impl(InputIt first, cuco::detail::index_type n, Ref ref)
+{
+  namespace cg   = cooperative_groups;
+  using key_type = typename cuda::std::iterator_traits<InputIt>::value_type;
+
+  // Only use warp-cooperative kernels when CGSize > 1
+  if constexpr (Ref::tuning::use_warp_cooperative_add_kernel && CGSize > 1) {
+    auto const idx          = cuco::detail::global_thread_id();
+    auto group              = cg::tiled_partition<CGSize>(cg::this_thread_block());
+    auto const is_full_tile = (blockIdx.x + 1) * BlockSize <= n;
+    if (is_full_tile) {
+      key_type const& key = *(first + idx);
+      ref.add_coop<ConditionalAtomic>(group, key);
+    } else {
+      auto const is_valid = idx < n;
+      ref.add_coop<ConditionalAtomic>(group, first, idx, is_valid);
+    }
+  } else {
+    auto const idx = cuco::detail::global_thread_id() / CGSize;
+    if constexpr (CGSize == 1) {
+      if (idx < n) {
+        key_type const& key = *(first + idx);
+        ref.add<ConditionalAtomic>(key);
+      }
+    } else {
+      auto group = cg::tiled_partition<CGSize>(cg::this_thread_block());
+      if (idx < n) {
+        key_type const& key = *(first + idx);
+        ref.add<ConditionalAtomic>(group, key);
+      }
+    }
+  }
+}
+
+template <bool ConditionalAtomic, int32_t CGSize, int32_t BlockSize, class InputIt, class Ref>
+CUCO_KERNEL __launch_bounds__(BlockSize) void add_n(InputIt first,
+                                                    cuco::detail::index_type n,
+                                                    Ref ref)
+{
+  add_n_impl<ConditionalAtomic, CGSize, BlockSize>(first, n, ref);
+}
 
-template <int32_t BlockSize, class InputIt, class Ref>
-CUCO_KERNEL __launch_bounds__(BlockSize) void add(InputIt first,
-                                                  cuco::detail::index_type n,
-                                                  Ref ref)
+template <bool ConditionalAtomic, int32_t CGSize, int32_t BlockSize, class InputIt, class Ref>
+__device__ void add_work_stealing_n_impl(InputIt first, cuco::detail::index_type n, Ref ref)
 {
-  namespace cg = cooperative_groups;
+#if __cccl_ptx_isa >= 860
+  using key_type = typename cuda::std::iterator_traits<InputIt>::value_type;
+
+  namespace cg  = cooperative_groups;
+  namespace ptx = cuda::ptx;
+
+  // Cluster launch control initialization:
+  __shared__ uint4 result;
+  __shared__ uint64_t bar;
+  int phase = 0;
 
-  constexpr auto tile_size = cuco::detail::warp_size();
+  auto const block = cg::this_thread_block();
 
-  auto const tile_idx       = cuco::detail::global_thread_id() / tile_size;
-  auto const n_tiles        = gridDim.x * BlockSize / tile_size;
-  auto const items_per_tile = cuco::detail::int_div_ceil(n, n_tiles);
+  cg::invoke_one(block, [&]() { ptx::mbarrier_init(&bar, 1); });
 
-  auto const tile_start = tile_idx * items_per_tile;
-  if (tile_start >= n) { return; }
-  auto const tile_stop = (tile_start + items_per_tile < n) ? tile_start + items_per_tile : n;
+  int bx = blockIdx.x;
 
-  auto const tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
+  // Work-stealing loop:
+  while (true) {
+    // Protect result from overwrite in the next iteration,
+    // (also ensure barrier initialization at 1st iteration):
+    block.sync();
+
+    cg::invoke_one(block, [&]() {
+      // Acquire write of result in the async proxy:
+      ptx::fence_proxy_async_generic_sync_restrict(
+        ptx::sem_acquire, ptx::space_cluster, ptx::scope_cluster);
+
+      cg::invoke_one(cg::coalesced_threads(),
+                     [&]() { ptx::clusterlaunchcontrol_try_cancel(&result, &bar); });
+      ptx::mbarrier_arrive_expect_tx(
+        ptx::sem_relaxed, ptx::scope_cta, ptx::space_shared, &bar, sizeof(uint4));
+    });
+
+    // Computation:
+    // Only use warp-cooperative kernels when CGSize > 1
+    if constexpr (Ref::tuning::use_warp_cooperative_add_kernel && CGSize > 1) {
+      cuco::detail::index_type const idx = BlockSize * bx + threadIdx.x;
+      auto group                         = cg::tiled_partition<CGSize>(block);
+      auto const is_full_tile            = (bx + 1) * BlockSize <= n;
+      if (is_full_tile) {
+        key_type const& key = *(first + idx);
+        ref.add_coop<ConditionalAtomic>(group, key);
+      } else {
+        auto const is_valid = idx < n;
+        ref.add_coop<ConditionalAtomic>(group, first, idx, is_valid);
+      }
+    } else {
+      cuco::detail::index_type const idx =
+        (static_cast<cuco::detail::index_type>(BlockSize) * bx + threadIdx.x) / CGSize;
+      if constexpr (CGSize == 1) {
+        if (idx < n) {
+          key_type const& key = *(first + idx);
+          ref.add<ConditionalAtomic>(key);
+        }
+      } else {
+        auto group = cg::tiled_partition<CGSize>(block);
+        if (idx < n) {
+          key_type const& key = *(first + idx);
+          ref.add<ConditionalAtomic>(group, key);
+        }
+      }
+    }
 
-  ref.add(tile, first + tile_start, first + tile_stop);
+    // Cancellation request synchronization:
+    while (!ptx::mbarrier_try_wait_parity(ptx::sem_acquire, ptx::scope_cta, &bar, phase)) {}
+    phase ^= 1;
+
+    // Cancellation request decoding:
+    bool success = ptx::clusterlaunchcontrol_query_cancel_is_canceled(result);
+    if (!success) break;
+
+    bx = ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_x<int>(result);
+
+    // Release read of result to the async proxy:
+    ptx::fence_proxy_async_generic_sync_restrict(
+      ptx::sem_release, ptx::space_shared, ptx::scope_cluster);
+  }
+#else
+  // Cluster launch control PTX intrinsics require PTX ISA >= 8.6 (CTK 12.8+).
+  // On older toolkits, fall back to the regular kernel body. The runtime path is
+  // also guarded by `NV_PROVIDES_SM_100`, so this branch is unreachable when SM100
+  // is targeted but the toolkit is too old to expose the intrinsics.
+  add_n_impl<ConditionalAtomic, CGSize, BlockSize>(first, n, ref);
+#endif
 }
 
-template <int32_t CGSize,
+template <bool ConditionalAtomic, int32_t CGSize, int32_t BlockSize, class InputIt, class Ref>
+CUCO_KERNEL __launch_bounds__(BlockSize) void add_work_stealing_n(InputIt first,
+                                                                  cuco::detail::index_type n,
+                                                                  Ref ref)
+{
+  NV_IF_ELSE_TARGET(
+    NV_PROVIDES_SM_100,
+    (add_work_stealing_n_impl<ConditionalAtomic, CGSize, BlockSize>(first, n, ref);),
+    (add_n_impl<ConditionalAtomic, CGSize, BlockSize>(first, n, ref);))
+}
+
+template <int32_t CGSize, int32_t BlockSize, class InputIt, class OutputIt, class Ref>
+__device__ void contains_n_impl(InputIt first,
+                                cuco::detail::index_type n,
+                                OutputIt output_begin,
+                                Ref ref)
+{
+  namespace cg   = cooperative_groups;
+  using key_type = typename cuda::std::iterator_traits<InputIt>::value_type;
+
+  // Only use warp-cooperative kernels when CGSize > 1
+  if constexpr (Ref::tuning::use_warp_cooperative_contains_kernel && CGSize > 1) {
+    auto const idx          = cuco::detail::global_thread_id();
+    auto group              = cg::tiled_partition<CGSize>(cg::this_thread_block());
+    auto const is_full_tile = (blockIdx.x + 1) * BlockSize <= n;
+    if (is_full_tile) {
+      key_type const& key   = *(first + idx);
+      *(output_begin + idx) = ref.contains_coop(group, key);
+    } else {
+      auto const is_valid = idx < n;
+      auto const result   = ref.contains_coop(group, first, idx, is_valid);
+      if (is_valid) { *(output_begin + idx) = result; }
+    }
+  } else {
+    auto idx = cuco::detail::global_thread_id() / CGSize;
+    if constexpr (CGSize == 1) {
+      if (idx < n) {
+        key_type const& key   = *(first + idx);
+        *(output_begin + idx) = ref.contains(key);
+      }
+    } else {
+      auto group = cg::tiled_partition<CGSize>(cg::this_thread_block());
+      if (idx < n) {
+        key_type const& key = *(first + idx);
+        // ref.contains(group, key) already reduces across the group via group.all(...).
+        auto const found = ref.contains(group, key);
+        if (group.thread_rank() == 0) { *(output_begin + idx) = found; }
+      }
+    }
+  }
+}
+
+template <int32_t CGSize, int32_t BlockSize, class InputIt, class OutputIt, class Ref>
+CUCO_KERNEL __launch_bounds__(BlockSize) void contains_n(InputIt first,
+                                                         cuco::detail::index_type n,
+                                                         OutputIt output_begin,
+                                                         Ref ref)
+{
+  contains_n_impl<CGSize, BlockSize>(first, n, output_begin, ref);
+}
+
+template <int32_t CGSize, int32_t BlockSize, class InputIt, class OutputIt, class Ref>
+__device__ void contains_work_stealing_n_impl(InputIt first,
+                                              cuco::detail::index_type n,
+                                              OutputIt output_begin,
+                                              Ref ref)
+{
+#if __cccl_ptx_isa >= 860
+  using key_type = typename cuda::std::iterator_traits<InputIt>::value_type;
+
+  namespace cg  = cooperative_groups;
+  namespace ptx = cuda::ptx;
+
+  // Cluster launch control initialization:
+  __shared__ uint4 result;
+  __shared__ uint64_t bar;
+  int phase = 0;
+
+  auto const block = cg::this_thread_block();
+
+  cg::invoke_one(block, [&]() { ptx::mbarrier_init(&bar, 1); });
+
+  int bx = blockIdx.x;
+
+  // Work-stealing loop:
+  while (true) {
+    // Protect result from overwrite in the next iteration,
+    // (also ensure barrier initialization at 1st iteration):
+    block.sync();
+
+    cg::invoke_one(block, [&]() {
+      // Acquire write of result in the async proxy:
+      ptx::fence_proxy_async_generic_sync_restrict(
+        ptx::sem_acquire, ptx::space_cluster, ptx::scope_cluster);
+
+      cg::invoke_one(cg::coalesced_threads(),
+                     [&]() { ptx::clusterlaunchcontrol_try_cancel(&result, &bar); });
+      ptx::mbarrier_arrive_expect_tx(
+        ptx::sem_relaxed, ptx::scope_cta, ptx::space_shared, &bar, sizeof(uint4));
+    });
+
+    // Computation:
+    // Only use warp-cooperative kernels when CGSize > 1
+    if constexpr (Ref::tuning::use_warp_cooperative_contains_kernel && CGSize > 1) {
+      cuco::detail::index_type const idx = BlockSize * bx + threadIdx.x;
+      auto group                         = cg::tiled_partition<CGSize>(block);
+      auto const is_full_tile            = (bx + 1) * BlockSize <= n;
+      if (is_full_tile) {
+        key_type const& key   = *(first + idx);
+        *(output_begin + idx) = ref.contains_coop(group, key);
+      } else {
+        auto const is_valid = idx < n;
+        auto const result   = ref.contains_coop(group, first, idx, is_valid);
+        if (is_valid) { *(output_begin + idx) = result; }
+      }
+    } else {
+      cuco::detail::index_type const idx =
+        (static_cast<cuco::detail::index_type>(BlockSize) * bx + threadIdx.x) / CGSize;
+      if constexpr (CGSize == 1) {
+        if (idx < n) {
+          key_type const& key   = *(first + idx);
+          *(output_begin + idx) = ref.contains(key);
+        }
+      } else {
+        auto group = cg::tiled_partition<CGSize>(block);
+        if (idx < n) {
+          key_type const& key = *(first + idx);
+          // ref.contains(group, key) already reduces across the group via group.all(...).
+          auto const found = ref.contains(group, key);
+          if (group.thread_rank() == 0) { *(output_begin + idx) = found; }
+        }
+      }
+    }
+
+    // Cancellation request synchronization:
+    while (!ptx::mbarrier_try_wait_parity(ptx::sem_acquire, ptx::scope_cta, &bar, phase)) {}
+    phase ^= 1;
+
+    // Cancellation request decoding:
+    bool success = ptx::clusterlaunchcontrol_query_cancel_is_canceled(result);
+    if (!success) break;
+
+    bx = ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_x<int>(result);
+
+    // Release read of result to the async proxy:
+    ptx::fence_proxy_async_generic_sync_restrict(
+      ptx::sem_release, ptx::space_shared, ptx::scope_cluster);
+  }
+#else
+  // Cluster launch control PTX intrinsics require PTX ISA >= 8.6 (CTK 12.8+).
+  // On older toolkits, fall back to the regular kernel body. The runtime path is
+  // also guarded by `NV_PROVIDES_SM_100`, so this branch is unreachable when SM100
+  // is targeted but the toolkit is too old to expose the intrinsics.
+  contains_n_impl<CGSize, BlockSize>(first, n, output_begin, ref);
+#endif
+}
+
+template <int32_t CGSize, int32_t BlockSize, class InputIt, class OutputIt, class Ref>
+CUCO_KERNEL __launch_bounds__(BlockSize) void contains_work_stealing_n(InputIt first,
+                                                                       cuco::detail::index_type n,
+                                                                       OutputIt output_begin,
+                                                                       Ref ref)
+{
+  NV_IF_ELSE_TARGET(
+    NV_PROVIDES_SM_100,
+    (contains_work_stealing_n_impl<CGSize, BlockSize>(first, n, output_begin, ref);),
+    (contains_n_impl<CGSize, BlockSize>(first, n, output_begin, ref);))
+}
+
+template <bool ConditionalAtomic,
+          int32_t CGSize,
           int32_t BlockSize,
           class InputIt,
           class StencilIt,
@@ -59,21 +342,26 @@ template <int32_t CGSize,
 CUCO_KERNEL __launch_bounds__(BlockSize) void add_if_n(
   InputIt first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref)
 {
-  namespace cg = cooperative_groups;
-
-  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
-  auto idx               = cuco::detail::global_thread_id() / CGSize;
+  namespace cg   = cooperative_groups;
+  using key_type = typename cuda::std::iterator_traits<InputIt>::value_type;
 
-  [[maybe_unused]] auto const tile =
-    cg::tiled_partition<CGSize, cg::thread_block>(cg::this_thread_block());
-
-  while (idx < n) {
-    if (pred(*(stencil + idx))) {
-      typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
-        *(first + idx)};
-      ref.add(tile, insert_element);
+  if constexpr (Ref::tuning::use_warp_cooperative_add_kernel && CGSize > 1) {
+    auto const idx      = cuco::detail::global_thread_id();
+    auto group          = cg::tiled_partition<CGSize>(cg::this_thread_block());
+    auto const in_range = idx < n;
+    auto const is_valid = in_range && pred(*(stencil + idx));
+    ref.template add_coop<ConditionalAtomic>(group, first, idx, is_valid);
+  } else {
+    auto const idx = cuco::detail::global_thread_id() / CGSize;
+    if (idx < n && pred(*(stencil + idx))) {
+      key_type const& key = *(first + idx);
+      if constexpr (CGSize == 1) {
+        ref.template add<ConditionalAtomic>(key);
+      } else {
+        auto group = cg::tiled_partition<CGSize>(cg::this_thread_block());
+        ref.template add<ConditionalAtomic>(group, key);
+      }
     }
-    idx += loop_stride;
   }
 }
 
@@ -88,30 +376,39 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
                                                             cuco::detail::index_type n,
                                                             StencilIt stencil,
                                                             Predicate pred,
-                                                            OutputIt out,
+                                                            OutputIt output_begin,
                                                             Ref ref)
 {
-  namespace cg = cooperative_groups;
+  namespace cg   = cooperative_groups;
+  using key_type = typename cuda::std::iterator_traits<InputIt>::value_type;
 
-  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
-  auto idx               = cuco::detail::global_thread_id() / CGSize;
-
-  [[maybe_unused]] auto const tile =
-    cg::tiled_partition<CGSize, cg::thread_block>(cg::this_thread_block());
-
-  if constexpr (CGSize == 1) {
-    while (idx < n) {
-      typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
-      *(out + idx) = pred(*(stencil + idx)) ? ref.contains(key) : false;
-      idx += loop_stride;
-    }
+  if constexpr (Ref::tuning::use_warp_cooperative_contains_kernel && CGSize > 1) {
+    auto const idx      = cuco::detail::global_thread_id();
+    auto group          = cg::tiled_partition<CGSize>(cg::this_thread_block());
+    auto const in_range = idx < n;
+    auto const is_valid = in_range && pred(*(stencil + idx));
+    auto const result   = ref.contains_coop(group, first, idx, is_valid);
+    if (in_range) { *(output_begin + idx) = is_valid ? result : false; }
   } else {
-    auto const tile = cg::tiled_partition<CGSize, cg::thread_block>(cg::this_thread_block());
-    while (idx < n) {
-      typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
-      auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;
-      if (tile.thread_rank() == 0) { *(out + idx) = found; }
-      idx += loop_stride;
+    auto const idx = cuco::detail::global_thread_id() / CGSize;
+    if (idx < n) {
+      if constexpr (CGSize == 1) {
+        if (pred(*(stencil + idx))) {
+          key_type const& key   = *(first + idx);
+          *(output_begin + idx) = ref.contains(key);
+        } else {
+          *(output_begin + idx) = false;
+        }
+      } else {
+        auto group  = cg::tiled_partition<CGSize>(cg::this_thread_block());
+        bool result = false;
+        if (pred(*(stencil + idx))) {
+          key_type const& key = *(first + idx);
+          // ref.contains(group, key) already reduces across the group via group.all(...).
+          result = ref.contains(group, key);
+        }
+        if (group.thread_rank() == 0) { *(output_begin + idx) = result; }
+      }
     }
   }
 }
diff --git a/include/cuco/detail/bloom_filter/parametric_filter_policy.cuh b/include/cuco/detail/bloom_filter/parametric_filter_policy.cuh
new file mode 100644
index 000000000..c4e753249
--- /dev/null
+++ b/include/cuco/detail/bloom_filter/parametric_filter_policy.cuh
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuco/detail/utility/math.cuh>
+
+#include <cuda/std/__algorithm/min.h>
+#include <cuda/std/array>
+#include <cuda/std/bit>
+#include <cuda/std/limits>
+#include <cuda/std/tuple>
+#include <cuda/std/type_traits>
+
+#include <cstdint>
+
+namespace cuco::detail {
+
+/**
+ * @brief Sectorized Bloom filter policy with multiplicative-hashing fingerprint generation.
+ *
+ * Implements the Sectorized Bloom Filter (SBF) variant from "Optimizing Bloom Filters for Modern
+ * GPU Architectures" (arXiv:2512.15595). Distributes `PatternBits` set bits across `WordsPerBlock`
+ * words using compile-time salt-based multiplicative hashing. The hash result is split into upper
+ * 32 bits (block selection via multiply-shift) and lower 32 bits (pattern generation), so a 64-bit
+ * hash function is required by design.
+ *
+ * @tparam Hash 64-bit hash functor whose return type satisfies `is_same_v<hash_result_type,
+ * uint64_t>`.
+ * @tparam Word Underlying word type of a filter block. Must be an atomically updatable integral.
+ * @tparam WordsPerBlock Words per filter block. Must be a power of two and <= 32.
+ * @tparam PatternBits Number of fingerprint bits (k in the paper).
+ * @tparam AddHorizontalLayout CG size used for `add` (paper's Theta). Must be a power of two and
+ * `AddHorizontalLayout * AddVerticalLayout <= WordsPerBlock`.
+ * @tparam AddVerticalLayout Contiguous words processed per thread per `add` step (paper's Phi).
+ * @tparam ContainsHorizontalLayout CG size used for `contains` (paper's Theta).
+ * @tparam ContainsVerticalLayout Contiguous words processed per thread per `contains` step (paper's
+ * Phi).
+ */
+template <class Hash,
+          class Word,
+          uint32_t WordsPerBlock,
+          uint32_t PatternBits,
+          uint32_t AddHorizontalLayout,
+          uint32_t AddVerticalLayout,
+          uint32_t ContainsHorizontalLayout,
+          uint32_t ContainsVerticalLayout>
+class parametric_filter_policy {
+ public:
+  using hasher             = Hash;                            ///< 64-bit hash functor type
+  using word_type          = Word;                            ///< Underlying filter-block word type
+  using hash_argument_type = typename hasher::argument_type;  ///< Hash function input type
+  using hash_result_type =
+    decltype(std::declval<hasher>()(std::declval<hash_argument_type>()));  ///< Hash function
+                                                                           ///< output type
+
+ private:
+  static constexpr uint32_t max_salts                          = 64;
+  static constexpr cuda::std::array<uint32_t, max_salts> salts = {
+    0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU, 0x705495c7U, 0x2df1424bU, 0x9efc4947U,
+    0x5c6bfb31U, 0xb24bcdffU, 0xb6843d6dU, 0x6db04543U, 0x3a12efddU, 0xb0ddd463U, 0x8d22f6e7U,
+    0xb82f1e53U, 0x7db9f86bU, 0xc7afe639U, 0xfb135cd7U, 0x693256e1U, 0x9466d871U, 0x23d3d02fU,
+    0x6461d049U, 0x66a91621U, 0xbaa3006fU, 0x52fb8d99U, 0x3ea88b4fU, 0xf470cfdU,  0xb1db79a5U,
+    0x9809fcd1U, 0xbced4445U, 0x2eb7c737U, 0x2cea6803U, 0x156f1955U, 0x8813c027U, 0xa26819f9U,
+    0x4c3b57bdU, 0x7df94487U, 0xb975e769U, 0xb8f20cb5U, 0x5c9e2e77U, 0x5fb1735fU, 0x3a6f759bU,
+    0x3c090923U, 0xfced424dU, 0xa187a6a9U, 0x6f070a41U, 0x2c85233bU, 0x7e62258bU, 0x2771ef17U,
+    0x13bbf093U, 0x4ff059e5U, 0xe3ce3d0fU, 0xf1b4789fU, 0x9fbb6173U, 0x6a320cf5U, 0x1be2c481U,
+    0x7ba8222bU, 0x6fd619b3U, 0x7b1bbf0dU, 0x8b8993adU, 0x448eca95U, 0x82ab09d9U, 0x2ce53909U,
+    0x4f548685U};
+  static constexpr uint32_t word_bits = cuda::std::numeric_limits<word_type>::digits;
+
+ public:
+  static constexpr uint32_t words_per_block = WordsPerBlock;  ///< Number of words per filter block
+  static constexpr uint32_t pattern_bits    = PatternBits;    ///< Fingerprint bits per key
+
+  static constexpr uint32_t add_horizontal_layout =
+    AddHorizontalLayout;  ///< horizontal vectorization layout for add operation
+  static constexpr uint32_t add_vertical_layout =
+    AddVerticalLayout;  ///< vertical vectorization layout for add operation
+  static constexpr uint32_t contains_horizontal_layout =
+    ContainsHorizontalLayout;  ///< horizontal vectorization layout for contains operation
+  static constexpr uint32_t contains_vertical_layout =
+    ContainsVerticalLayout;  ///< vertical vectorization layout for contains operation
+
+  static constexpr size_t max_filter_blocks =
+    cuda::std::numeric_limits<uint32_t>::max();  ///< Upper bound on the number of filter blocks
+  /// Lower bound on `pattern_bits`: at least one bit per word so every word contributes.
+  static constexpr auto min_pattern_bits = words_per_block;
+  /// Upper bound on `pattern_bits`: the total number of bits in a filter block, capped by the
+  /// number of available salts.
+  static constexpr auto max_pattern_bits = cuda::std::min(word_bits * words_per_block, max_salts);
+
+ private:
+  static constexpr uint32_t bit_index_width = cuda::std::bit_width(word_bits - 1);
+  // TODO: for non-multiple `(pattern_bits, words_per_block)` configs (e.g. PatternBits=12,
+  // WordsPerBlock=8), the salt walk in `set_bits` advances `PatternArrayIndex` every
+  // `max_bits_per_word` salts, packing all bits into the first
+  // `ceil(pattern_bits / words_per_block)` words and leaving the rest at zero. This wastes block
+  // capacity and inflates FPR. Distribute floor bits to every word plus one extra bit to the
+  // first `pattern_bits % words_per_block` words, and update the salt-to-word mapping in
+  // `set_bits` accordingly.
+  static constexpr uint32_t max_bits_per_word =
+    cuco::detail::int_div_ceil(pattern_bits, words_per_block);
+
+ public:
+  /**
+   * @brief Constructs a parametric filter policy.
+   *
+   * @param hash Hash function used to generate fingerprints.
+   */
+  __host__ __device__ constexpr parametric_filter_policy(Hash hash = {}) : hash_{hash}
+  {
+    static_assert(pattern_bits >= min_pattern_bits,
+                  "pattern_bits must be at least words_per_block");
+    static_assert(pattern_bits <= max_pattern_bits,
+                  "pattern_bits must be less than the total number of bits in a filter block");
+    // Require exact tiling. With `words_per_block` a power of two, this is equivalent to requiring
+    // both `add_horizontal_layout` and `add_vertical_layout` to be powers of two with product
+    // <= `words_per_block`. The internal loop count uses integer division on the product; non-
+    // dividing layouts would leave trailing words uninserted on add while contains still expects
+    // non-zero patterns there, producing false negatives for every inserted key.
+    static_assert(words_per_block % (add_horizontal_layout * add_vertical_layout) == 0,
+                  "add_horizontal_layout * add_vertical_layout must evenly divide words_per_block");
+    static_assert(
+      words_per_block % (contains_horizontal_layout * contains_vertical_layout) == 0,
+      "contains_horizontal_layout * contains_vertical_layout must evenly divide words_per_block");
+    // The split_hash() design requires a 64-bit hash split into upper 32 bits (block selection
+    // via multiply-shift) and lower 32 bits (pattern generation via salt-based multiplicative
+    // hashing). This is a permanent design requirement, not a temporary limitation.
+    static_assert(cuda::std::is_same_v<hash_result_type, uint64_t>,
+                  "parametric_filter_policy requires a 64-bit hash function");
+  }
+
+  /**
+   * @brief Splits the 64-bit hash of a key into its upper and lower 32 bits.
+   *
+   * The upper half is used for block selection (via multiply-shift); the lower half drives the
+   * per-word fingerprint pattern via salt-based multiplicative hashing.
+   *
+   * @param key Key to hash.
+   *
+   * @return `{upper 32 bits, lower 32 bits}` of the 64-bit hash.
+   */
+  __device__ constexpr cuda::std::pair<uint32_t, uint32_t> split_hash(hash_argument_type key) const
+  {
+    auto const hash_value = hash_(key);
+    return {static_cast<uint32_t>(hash_value >> 32), static_cast<uint32_t>(hash_value)};
+  }
+
+  /**
+   * @brief Determines the filter block a key maps to via fast multiply-shift modulo.
+   *
+   * @tparam Extent Size type used to determine the number of blocks in the filter.
+   *
+   * @param upper_hash_value Upper 32 bits of the key's hash.
+   * @param num_blocks Number of blocks in the filter.
+   *
+   * @return Block index in `[0, num_blocks)`.
+   */
+  template <class Extent>
+  __device__ constexpr auto block_index(uint32_t upper_hash_value, Extent num_blocks) const
+  {
+    return static_cast<uint32_t>((static_cast<uint64_t>(upper_hash_value) *
+                                  static_cast<typename Extent::value_type>(num_blocks)) >>
+                                 32);
+  }
+
+  /**
+   * @brief Generates the per-word fingerprint pattern for a key when the horizontal layout is 1.
+   *
+   * @tparam LoopIndex Outer-loop iteration index when `words_per_block / VerticalLayout > 1`.
+   * @tparam VerticalLayout Number of contiguous words this call produces.
+   *
+   * @param lower_hash_value Lower 32 bits of the key's hash.
+   *
+   * @return Array of `VerticalLayout` words.
+   */
+  template <uint32_t LoopIndex, uint32_t VerticalLayout>
+  __device__ constexpr auto array_pattern(uint32_t lower_hash_value) const
+  {
+    return pattern_impl<LoopIndex, VerticalLayout>(lower_hash_value);
+  }
+
+  /**
+   * @brief Generates the per-word fingerprint pattern for a key when the horizontal layout is > 1.
+   *
+   * @tparam LoopIndex Outer-loop iteration index.
+   * @tparam HorizontalLayout Cooperative-group size cooperating on a single key.
+   * @tparam VerticalLayout Number of contiguous words this call produces.
+   *
+   * @param lower_hash_value Lower 32 bits of the key's hash.
+   * @param thread_index Caller's rank within the cooperative group.
+   *
+   * @return Array of `VerticalLayout` words owned by the calling thread.
+   */
+  template <uint32_t LoopIndex, uint32_t HorizontalLayout, uint32_t VerticalLayout>
+  __device__ constexpr auto array_pattern(uint32_t lower_hash_value, uint32_t thread_index) const
+  {
+    return pattern_impl<LoopIndex, HorizontalLayout, VerticalLayout>(lower_hash_value,
+                                                                     thread_index);
+  }
+
+ private:
+  hasher hash_;
+
+  /**
+   * @brief pattern_impl - Computes the bit pattern for a vertical layout of words.
+   * I use the terminology of a `virtual thread` to refer to an ordering of the vertical layouts,
+   * namely
+   *   virtual_thread_index = LoopIndex * HorizontalLayout + thread_index,
+   * where LoopIndex is the index of the outermost loop in the range:
+   *     [0, words_per_block / (HorizontalLayout * VerticalLayout)).
+   * @param hash
+   * @return cuda::std::array<word_type, VerticalLayout> - The bit pattern for the vertical layout
+   * defined by the LoopIndex.
+   */
+
+  // Precondition: <add/contains>_horizontal_layout == 1
+  template <uint32_t LoopIndex, uint32_t VerticalLayout>
+  __device__ constexpr auto pattern_impl(uint32_t hash) const
+  {
+    using pattern_array_t = cuda::std::array<word_type, VerticalLayout>;
+
+    // Sanity check
+    constexpr uint32_t num_iterations = words_per_block / VerticalLayout;
+    static_assert(LoopIndex < num_iterations,
+                  "the loop index cannot exceed the number of loop iterations");
+
+    pattern_array_t pattern_array{0};
+    constexpr uint32_t salt_start_index = max_bits_per_word * VerticalLayout * LoopIndex;
+    constexpr uint32_t salt_end_index =
+      cuda::std::min(salt_start_index + max_bits_per_word * VerticalLayout, pattern_bits);
+    constexpr uint32_t pattern_array_start_index = 0;
+    set_bits<salt_start_index, salt_end_index, pattern_array_start_index>(hash, pattern_array);
+    return pattern_array;
+  }
+
+  // Precondition: <add/contains>_horizontal_layout > 1
+  template <uint32_t LoopIndex, uint32_t HorizontalLayout, uint32_t VerticalLayout>
+  __device__ constexpr auto pattern_impl(uint32_t hash, uint32_t thread_index) const
+  {
+    using pattern_array_t = cuda::std::array<word_type, VerticalLayout>;
+
+    // Sanity check
+    constexpr uint32_t num_iterations = words_per_block / (HorizontalLayout * VerticalLayout);
+    static_assert(LoopIndex < num_iterations,
+                  "the loop index cannot exceed the number of loop iterations");
+
+    // [lower_bound, upper_bound) defines the range of virtual thread indices for this loop
+    // iteration.
+    constexpr uint32_t lower_bound = LoopIndex * HorizontalLayout;
+    constexpr uint32_t upper_bound = lower_bound + HorizontalLayout;
+
+    // A virtual thread flips max_bits_per_virtual_thread bits in the pattern array, excepting
+    // potentially some of the last virtual threads (if pattern_bits % words_per_block != 0).
+    constexpr uint32_t max_bits_per_virtual_thread = max_bits_per_word * VerticalLayout;
+
+    pattern_array_t pattern_array{0};
+    if constexpr (num_iterations == 1) {
+      thread_dispatch<max_bits_per_virtual_thread, lower_bound, upper_bound>(
+        hash, thread_index, pattern_array);
+    } else {
+      uint32_t const virtual_thread_index = LoopIndex * HorizontalLayout + thread_index;
+      thread_dispatch<max_bits_per_virtual_thread, lower_bound, upper_bound>(
+        hash, virtual_thread_index, pattern_array);
+    }
+    return pattern_array;
+  }
+
+  // Dispatches a dynamic virtual thread index to a static virtual thread index by building a
+  // compile-time decision tree over the range [LowerBound, UpperBound) for the virtual thread
+  // index. This method is only used when <add/contains>_horizontal_layout > 1.
+  template <uint32_t MaxBitsPerVirtualThread,
+            uint32_t LowerBound,
+            uint32_t UpperBound,
+            class PatternArrayT>
+  __device__ constexpr void thread_dispatch(uint32_t hash,
+                                            uint32_t thread_index,
+                                            PatternArrayT& pattern_array) const
+  {
+    // Sanity check
+    static_assert(LowerBound < UpperBound);
+
+    if constexpr (LowerBound + 1 == UpperBound) {
+      // Base case: thread_index == LowerBound
+      constexpr uint32_t salt_start_index = MaxBitsPerVirtualThread * LowerBound;
+      constexpr uint32_t salt_end_index =
+        cuda::std::min(salt_start_index + MaxBitsPerVirtualThread, pattern_bits);
+      constexpr uint32_t pattern_array_start_index = 0;
+      set_bits<salt_start_index, salt_end_index, pattern_array_start_index>(hash, pattern_array);
+    } else {
+      // Recursive case: thread_index > LowerBound
+      constexpr uint32_t mid = (LowerBound + UpperBound) / 2;
+      if (thread_index < mid) {
+        thread_dispatch<MaxBitsPerVirtualThread, LowerBound, mid>(
+          hash, thread_index, pattern_array);
+      } else {
+        thread_dispatch<MaxBitsPerVirtualThread, mid, UpperBound>(
+          hash, thread_index, pattern_array);
+      }
+    }
+  }
+
+  // Set bits in the pattern array using salts starting from SaltIndex.
+  template <uint32_t SaltIndex,
+            uint32_t SaltEndIndex,
+            uint32_t PatternArrayIndex,
+            class PatternArrayT>
+  __device__ constexpr void set_bits(uint32_t hash, PatternArrayT& pattern_array) const
+  {
+    if constexpr (SaltIndex < SaltEndIndex) {
+      // Select top bit_index_width bits from salted hash to determine the bit index.
+      uint32_t const bit_index =
+        (cuda::std::get<SaltIndex>(salts) * hash) >> (32 - bit_index_width);
+
+      // Set the bit in the pattern array.
+      cuda::std::get<PatternArrayIndex>(pattern_array) |= word_type{1} << bit_index;
+
+      // Recurse.
+      constexpr uint32_t next_salt_index = SaltIndex + 1;
+      constexpr uint32_t next_pattern_array_index =
+        PatternArrayIndex + (next_salt_index % max_bits_per_word == 0 ? 1 : 0);
+      set_bits<next_salt_index, SaltEndIndex, next_pattern_array_index>(hash, pattern_array);
+    }
+  }
+};
+
+}  // namespace cuco::detail
\ No newline at end of file
diff --git a/include/cuco/detail/utility/cuda.hpp b/include/cuco/detail/utility/cuda.hpp
index ae2613f21..5d6342427 100644
--- a/include/cuco/detail/utility/cuda.hpp
+++ b/include/cuco/detail/utility/cuda.hpp
@@ -75,5 +75,16 @@ constexpr auto max_occupancy_grid_size(std::int32_t block_size,
   return max_active_blocks_per_multiprocessor * num_multiprocessors;
 }
 
+inline auto l2_cache_size()
+{
+  int device = 0;
+  CUCO_CUDA_TRY(cudaGetDevice(&device));
+
+  int l2_cache_size = -1;
+  CUCO_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_size, cudaDevAttrL2CacheSize, device));
+
+  return l2_cache_size;
+}
+
 }  // namespace detail
 }  // namespace cuco
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 51d4f42c1..09c31ee35 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -154,9 +154,11 @@ ConfigureTest(HYPERLOGLOG_TEST
 # - bloom_filter ----------------------------------------------------------------------------------
 ConfigureTest(BLOOM_FILTER_TEST
     bloom_filter/unique_sequence_test.cu
-    bloom_filter/arrow_policy_test.cu
     bloom_filter/variable_cg_test.cu
-    bloom_filter/merge_intersect_test.cu)
+    bloom_filter/merge_intersect_test.cu
+    bloom_filter/arrow_compat_test.cu
+    bloom_filter/bulk_ref_equivalence_test.cu
+    bloom_filter/layout_equivalence_test.cu)
 
 ###################################################################################################
 # - roaring_bitmap ---------------------------------------------------------------------------------
diff --git a/tests/bloom_filter/arrow_policy_test.cu b/tests/bloom_filter/arrow_compat_test.cu
similarity index 82%
rename from tests/bloom_filter/arrow_policy_test.cu
rename to tests/bloom_filter/arrow_compat_test.cu
index 1ca349162..2e820fde2 100644
--- a/tests/bloom_filter/arrow_policy_test.cu
+++ b/tests/bloom_filter/arrow_compat_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,18 +14,23 @@
  * limitations under the License.
  */
 
+// Verifies that a `parametric_filter_policy` instantiation with Apache Arrow's Block-Split Bloom
+// Filter parameters (256-bit blocks of 8 x uint32_t, 8 fingerprint bits per key, fully horizontal
+// add, fully vertical contains) produces byte-identical bitsets to a precomputed Arrow reference.
+
 #include <test_utils.hpp>
 
 #include <cuco/bloom_filter.cuh>
 
 #include <cuda/functional>
 #include <thrust/device_vector.h>
-#include <thrust/functional.h>
 
 #include <catch2/catch_template_test_macros.hpp>
 
-#include <random>
+#include <numeric>
 #include <type_traits>
+#include <utility>
+#include <vector>
 
 namespace {
 
@@ -73,11 +78,11 @@ thrust::device_vector<uint32_t> get_arrow_filter_reference_bitset()
   };
 
   if constexpr (std::is_same_v<Key, int32_t>) {
-    return reference_bitsets[0];  // int32
+    return reference_bitsets[0];
   } else if constexpr (std::is_same_v<Key, int64_t>) {
-    return reference_bitsets[1];  // int64
+    return reference_bitsets[1];
   } else if constexpr (std::is_same_v<Key, float>) {
-    return reference_bitsets[2];  // float
+    return reference_bitsets[2];
   } else {
     throw std::invalid_argument("Reference bitsets available for int32, int64, float only.\n\n");
   }
@@ -93,11 +98,11 @@ std::pair<size_t, size_t> get_arrow_filter_test_settings()
   };
 
   if constexpr (std::is_same_v<Key, int32_t>) {
-    return test_settings[0];  // int32
+    return test_settings[0];
   } else if constexpr (std::is_same_v<Key, int64_t>) {
-    return test_settings[1];  // int64
+    return test_settings[1];
   } else if constexpr (std::is_same_v<Key, float>) {
-    return test_settings[2];  // float
+    return test_settings[2];
   } else {
     throw std::invalid_argument("Test settings available for int32, int64, float only.\n\n");
   }
@@ -119,20 +124,14 @@ void test_filter_bitset(Filter& filter, size_t num_keys)
   using key_type  = typename Filter::key_type;
   using word_type = typename Filter::word_type;
 
-  // Generate keys
   auto const h_keys = sequence_values<key_type>(num_keys);
   thrust::device_vector<key_type> d_keys(h_keys.begin(), h_keys.end());
 
-  // Insert to the bloom filter
   filter.add(d_keys.begin(), d_keys.begin() + num_keys);
 
-  // Get reference words device_vector
   auto const reference_words = get_arrow_filter_reference_bitset<key_type>();
+  auto const num_words       = filter.block_extent() * filter.words_per_block;
 
-  // Number of words in the filter
-  auto const num_words = filter.block_extent() * filter.words_per_block;
-
-  // Get the bitset
   thrust::device_vector<word_type> filter_words(filter.data(), filter.data() + num_words);
 
   REQUIRE(cuco::test::equal(
@@ -144,17 +143,19 @@ void test_filter_bitset(Filter& filter, size_t num_keys)
     })));
 }
 
-TEMPLATE_TEST_CASE_SIG("bloom_filter arrow filter policy bitset validation",
+TEMPLATE_TEST_CASE_SIG("bloom_filter arrow-compatible parametric policy bitset validation",
                        "",
                        (class Key),
                        (int32_t),
                        (int64_t),
                        (float))
 {
-  // Get test settings
   auto const [sub_filters, num_keys] = get_arrow_filter_test_settings<Key>();
 
-  using policy_type = cuco::arrow_filter_policy<Key>;
+  // Apache Arrow Block-Split Bloom Filter parameters: 256-bit blocks (8 x uint32_t), 8 fingerprint
+  // bits per key, fully horizontal add (Theta=8) and fully vertical contains (Phi=8).
+  using policy_type =
+    cuco::parametric_filter_policy<cuco::xxhash_64<Key>, std::uint32_t, 8, 8, 8, 1, 1, 8>;
   cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, policy_type> filter{
     sub_filters};
 
diff --git a/tests/bloom_filter/bulk_ref_equivalence_test.cu b/tests/bloom_filter/bulk_ref_equivalence_test.cu
new file mode 100644
index 000000000..9206563f8
--- /dev/null
+++ b/tests/bloom_filter/bulk_ref_equivalence_test.cu
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Host-bulk vs device-ref equivalence. The host-bulk APIs route through `add_n` /
+// `contains_n` in `kernels.cuh`; the device-ref APIs route through scalar / CG / CG-range
+// methods on `bloom_filter_ref` directly. Both paths share `policy_.array_pattern(...)`,
+// so given the same input keys both must produce byte-identical filter bitsets (add side)
+// and byte-identical query results (contains side). Catches regressions like the
+// `add_coop(group, first, idx, is_valid)` iterator-formation UB fix where one path
+// silently diverged from the other.
+
+#include <test_utils.hpp>
+
+#include <cuco/bloom_filter.cuh>
+#include <cuco/detail/error.hpp>
+
+#include <cuda/functional>
+#include <cuda/std/functional>
+#include <thrust/device_vector.h>
+#include <thrust/equal.h>
+#include <thrust/execution_policy.h>
+#include <thrust/sequence.h>
+
+#include <cooperative_groups.h>
+
+#include <catch2/catch_template_test_macros.hpp>
+
+#include <cstddef>
+#include <cstdint>
+
+namespace cg = cooperative_groups;
+
+using size_type = int32_t;
+
+template <class Ref, class Key>
+__global__ void scalar_add_kernel(Ref ref, Key const* keys, size_type n)
+{
+  auto const i = static_cast<size_type>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (i < n) { ref.add(keys[i]); }
+}
+
+template <int CGSize, class Ref, class Key>
+__global__ void cg_add_kernel(Ref ref, Key const* keys, size_type n)
+{
+  auto const block = cg::this_thread_block();
+  auto const tile  = cg::tiled_partition<CGSize>(block);
+  auto const idx   = static_cast<size_type>((blockIdx.x * blockDim.x + threadIdx.x) / CGSize);
+  if (idx < n) { ref.add(tile, keys[idx]); }
+}
+
+template <int CGSize, class Ref, class Key>
+__global__ void cg_range_add_kernel(Ref ref, Key const* first, Key const* last)
+{
+  auto const block = cg::this_thread_block();
+  auto const tile  = cg::tiled_partition<CGSize>(block);
+  ref.add(tile, first, last);
+}
+
+template <class Ref, class Key>
+__global__ void scalar_contains_kernel(Ref ref, Key const* keys, size_type n, bool* out)
+{
+  auto const i = static_cast<size_type>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (i < n) { out[i] = ref.contains(keys[i]); }
+}
+
+template <int CGSize, class Ref, class Key>
+__global__ void cg_contains_kernel(Ref ref, Key const* keys, size_type n, bool* out)
+{
+  auto const block = cg::this_thread_block();
+  auto const tile  = cg::tiled_partition<CGSize>(block);
+  auto const idx   = static_cast<size_type>((blockIdx.x * blockDim.x + threadIdx.x) / CGSize);
+  if (idx < n) {
+    auto const found = ref.contains(tile, keys[idx]);
+    if (tile.thread_rank() == 0) { out[idx] = found; }
+  }
+}
+
+template <int CGSize, class Ref, class Key>
+__global__ void cg_range_contains_kernel(Ref ref, Key const* first, Key const* last, bool* out)
+{
+  auto const block = cg::this_thread_block();
+  auto const tile  = cg::tiled_partition<CGSize>(block);
+  ref.contains(tile, first, last, out);
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "bloom_filter: host bulk add equals device ref add",
+  "",
+  ((class Key, class Policy), Key, Policy),
+  (int32_t, cuco::default_filter_policy<int32_t>),
+  (int32_t, cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 1, 1, 1, 1, 1, 1>),
+  (uint64_t,
+   cuco::parametric_filter_policy<cuco::xxhash_64<uint64_t>, uint32_t, 8, 12, 8, 1, 4, 2>),
+  (float, cuco::parametric_filter_policy<cuco::xxhash_64<float>, uint64_t, 4, 4, 2, 2, 1, 2>),
+  (int32_t, cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8, 8, 2, 2, 1, 8>))
+{
+  using filter_type =
+    cuco::bloom_filter<Key, cuco::extent<std::size_t>, cuda::thread_scope_device, Policy>;
+  // Prime: forces a partial boundary tile (mix of valid + invalid lanes) on every
+  // CGSize > 1 path, exercising the cooperative is_valid mask logic.
+  constexpr size_type num_keys = 397;
+
+  thrust::device_vector<Key> keys(num_keys);
+  thrust::sequence(thrust::device, keys.begin(), keys.end());
+  auto const keys_raw = thrust::raw_pointer_cast(keys.data());
+
+  SECTION("scalar ref.add")
+  {
+    auto filter_a = filter_type{1000};
+    auto filter_b = filter_type{1000};
+
+    filter_a.add(keys.begin(), keys.end());
+
+    auto ref                 = filter_b.ref();
+    constexpr int block_size = 128;
+    int const grid_size      = (num_keys + block_size - 1) / block_size;
+    scalar_add_kernel<<<grid_size, block_size>>>(ref, keys_raw, num_keys);
+    CUCO_CUDA_TRY(cudaDeviceSynchronize());
+
+    auto const total_words =
+      static_cast<std::size_t>(filter_a.block_extent()) * filter_type::words_per_block;
+    REQUIRE(thrust::equal(
+      thrust::device, filter_a.data(), filter_a.data() + total_words, filter_b.data()));
+  }
+
+  SECTION("CG ref.add(group, key)")
+  {
+    constexpr int CGSize = Policy::add_horizontal_layout;
+    auto filter_a        = filter_type{1000};
+    auto filter_b        = filter_type{1000};
+
+    filter_a.add(keys.begin(), keys.end());
+
+    auto ref                 = filter_b.ref();
+    constexpr int block_size = 128;
+    int const grid_size      = (num_keys * CGSize + block_size - 1) / block_size;
+    cg_add_kernel<CGSize><<<grid_size, block_size>>>(ref, keys_raw, num_keys);
+    CUCO_CUDA_TRY(cudaDeviceSynchronize());
+
+    auto const total_words =
+      static_cast<std::size_t>(filter_a.block_extent()) * filter_type::words_per_block;
+    REQUIRE(thrust::equal(
+      thrust::device, filter_a.data(), filter_a.data() + total_words, filter_b.data()));
+  }
+
+  SECTION("CG ref.add(group, first, last)")
+  {
+    constexpr int CGSize = Policy::add_horizontal_layout;
+    auto filter_a        = filter_type{1000};
+    auto filter_b        = filter_type{1000};
+
+    filter_a.add(keys.begin(), keys.end());
+
+    auto ref = filter_b.ref();
+    // Single tile processes the entire range cooperatively.
+    cg_range_add_kernel<CGSize><<<1, CGSize>>>(ref, keys_raw, keys_raw + num_keys);
+    CUCO_CUDA_TRY(cudaDeviceSynchronize());
+
+    auto const total_words =
+      static_cast<std::size_t>(filter_a.block_extent()) * filter_type::words_per_block;
+    REQUIRE(thrust::equal(
+      thrust::device, filter_a.data(), filter_a.data() + total_words, filter_b.data()));
+  }
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "bloom_filter: host bulk contains equals device ref contains",
+  "",
+  ((class Key, class Policy), Key, Policy),
+  (int32_t, cuco::default_filter_policy<int32_t>),
+  (int32_t, cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 1, 1, 1, 1, 1, 1>),
+  (uint64_t,
+   cuco::parametric_filter_policy<cuco::xxhash_64<uint64_t>, uint32_t, 8, 12, 8, 1, 4, 2>),
+  (float, cuco::parametric_filter_policy<cuco::xxhash_64<float>, uint64_t, 4, 4, 2, 2, 1, 2>),
+  (int32_t, cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8, 8, 2, 2, 1, 8>))
+{
+  using filter_type =
+    cuco::bloom_filter<Key, cuco::extent<std::size_t>, cuda::thread_scope_device, Policy>;
+  // Primes: force partial boundary tiles on both insert and probe ranges across every
+  // CGSize > 1 path.
+  constexpr size_type num_keys  = 397;
+  constexpr size_type num_probe = 797;  // mix of inserted (first ~half) and non-inserted (rest)
+
+  auto filter = filter_type{1000};
+
+  thrust::device_vector<Key> insert_keys(num_keys);
+  thrust::sequence(thrust::device, insert_keys.begin(), insert_keys.end());
+  filter.add(insert_keys.begin(), insert_keys.end());
+
+  thrust::device_vector<Key> probe_keys(num_probe);
+  thrust::sequence(thrust::device, probe_keys.begin(), probe_keys.end());
+  auto const probe_raw = thrust::raw_pointer_cast(probe_keys.data());
+
+  thrust::device_vector<bool> bulk_result(num_probe);
+  filter.contains(probe_keys.begin(), probe_keys.end(), bulk_result.begin());
+
+  SECTION("scalar ref.contains")
+  {
+    thrust::device_vector<bool> ref_result(num_probe);
+    auto ref                 = filter.ref();
+    constexpr int block_size = 128;
+    int const grid_size      = (num_probe + block_size - 1) / block_size;
+    scalar_contains_kernel<<<grid_size, block_size>>>(
+      ref, probe_raw, num_probe, thrust::raw_pointer_cast(ref_result.data()));
+    CUCO_CUDA_TRY(cudaDeviceSynchronize());
+
+    REQUIRE(cuco::test::equal(
+      bulk_result.begin(), bulk_result.end(), ref_result.begin(), cuda::std::equal_to<bool>{}));
+  }
+
+  SECTION("CG ref.contains(group, key)")
+  {
+    constexpr int CGSize = Policy::contains_horizontal_layout;
+    thrust::device_vector<bool> ref_result(num_probe);
+    auto ref                 = filter.ref();
+    constexpr int block_size = 128;
+    int const grid_size      = (num_probe * CGSize + block_size - 1) / block_size;
+    cg_contains_kernel<CGSize><<<grid_size, block_size>>>(
+      ref, probe_raw, num_probe, thrust::raw_pointer_cast(ref_result.data()));
+    CUCO_CUDA_TRY(cudaDeviceSynchronize());
+
+    REQUIRE(cuco::test::equal(
+      bulk_result.begin(), bulk_result.end(), ref_result.begin(), cuda::std::equal_to<bool>{}));
+  }
+
+  SECTION("device-range CG ref.contains(group, first, last, out)")
+  {
+    constexpr int CGSize = Policy::contains_horizontal_layout;
+    thrust::device_vector<bool> ref_result(num_probe);
+    auto ref = filter.ref();
+    // Single tile processes the entire range cooperatively.
+    cg_range_contains_kernel<CGSize><<<1, CGSize>>>(
+      ref, probe_raw, probe_raw + num_probe, thrust::raw_pointer_cast(ref_result.data()));
+    CUCO_CUDA_TRY(cudaDeviceSynchronize());
+
+    REQUIRE(cuco::test::equal(
+      bulk_result.begin(), bulk_result.end(), ref_result.begin(), cuda::std::equal_to<bool>{}));
+  }
+}
diff --git a/tests/bloom_filter/layout_equivalence_test.cu b/tests/bloom_filter/layout_equivalence_test.cu
new file mode 100644
index 000000000..49b8774f4
--- /dev/null
+++ b/tests/bloom_filter/layout_equivalence_test.cu
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Byte-equal bitsets across (AddH, AddV) layout permutations, identical contains results
+// across (ContainsH, ContainsV) layout permutations, and equivalence between dynamic vs
+// static `cuco::extent` -- all for fixed (Hash, Word, WordsPerBlock, PatternBits, keys).
+
+#include <test_utils.hpp>
+
+#include <cuco/bloom_filter.cuh>
+#include <cuco/detail/error.hpp>
+
+#include <thrust/device_vector.h>
+#include <thrust/equal.h>
+#include <thrust/execution_policy.h>
+#include <thrust/sequence.h>
+
+#include <catch2/catch_template_test_macros.hpp>
+#include <catch2/catch_test_macros.hpp>
+
+#include <cstddef>
+#include <cstdint>
+
+TEMPLATE_TEST_CASE_SIG(
+  "bloom_filter: bitset is invariant under (AddH, AddV) layout permutations",
+  "",
+  ((class AltPolicy), AltPolicy),
+  (cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8, 8, 1, 8, 1, 8>),
+  (cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8, 8, 2, 4, 1, 8>),
+  (cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8, 8, 4, 2, 1, 8>),
+  (cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8, 8, 2, 2, 1, 8>),
+  (cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8, 8, 4, 1, 1, 8>))
+{
+  using Key            = int32_t;
+  using default_policy = cuco::default_filter_policy<Key>;
+  using filter_default_t =
+    cuco::bloom_filter<Key, cuco::extent<std::size_t>, cuda::thread_scope_device, default_policy>;
+  using filter_alt_t =
+    cuco::bloom_filter<Key, cuco::extent<std::size_t>, cuda::thread_scope_device, AltPolicy>;
+
+  constexpr int32_t num_blocks = 1'000;
+  constexpr int32_t num_keys   = 400;
+
+  auto filter_default = filter_default_t{num_blocks};
+  auto filter_alt     = filter_alt_t{num_blocks};
+
+  thrust::device_vector<Key> keys(num_keys);
+  thrust::sequence(thrust::device, keys.begin(), keys.end());
+
+  filter_default.add(keys.begin(), keys.end());
+  filter_alt.add(keys.begin(), keys.end());
+
+  auto const total_words =
+    static_cast<std::size_t>(filter_default.block_extent()) * filter_default_t::words_per_block;
+  REQUIRE(thrust::equal(
+    thrust::device, filter_default.data(), filter_default.data() + total_words, filter_alt.data()));
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "bloom_filter: contains results are invariant under (ContainsH, ContainsV) permutations",
+  "",
+  ((class AltPolicy), AltPolicy),
+  (cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8, 8, 8, 1, 8, 1>),
+  (cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8, 8, 8, 1, 2, 4>),
+  (cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8, 8, 8, 1, 4, 2>),
+  (cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8, 8, 8, 1, 2, 2>),
+  (cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8, 8, 8, 1, 1, 4>))
+{
+  using Key            = int32_t;
+  using default_policy = cuco::default_filter_policy<Key>;
+  using filter_default_t =
+    cuco::bloom_filter<Key, cuco::extent<std::size_t>, cuda::thread_scope_device, default_policy>;
+  using filter_alt_t =
+    cuco::bloom_filter<Key, cuco::extent<std::size_t>, cuda::thread_scope_device, AltPolicy>;
+
+  constexpr int32_t num_blocks = 1'000;
+  constexpr int32_t num_keys   = 400;
+  constexpr int32_t num_probe  = 800;  // mix of inserted and disjoint
+
+  auto filter_default = filter_default_t{num_blocks};
+  auto filter_alt     = filter_alt_t{num_blocks};
+
+  thrust::device_vector<Key> insert_keys(num_keys);
+  thrust::sequence(thrust::device, insert_keys.begin(), insert_keys.end());
+  filter_default.add(insert_keys.begin(), insert_keys.end());
+  filter_alt.add(insert_keys.begin(), insert_keys.end());
+
+  thrust::device_vector<Key> probe_keys(num_probe);
+  thrust::sequence(thrust::device, probe_keys.begin(), probe_keys.end());
+
+  thrust::device_vector<bool> result_default(num_probe);
+  thrust::device_vector<bool> result_alt(num_probe);
+  filter_default.contains(probe_keys.begin(), probe_keys.end(), result_default.begin());
+  filter_alt.contains(probe_keys.begin(), probe_keys.end(), result_alt.begin());
+
+  REQUIRE(thrust::equal(
+    thrust::device, result_default.begin(), result_default.end(), result_alt.begin()));
+}
+
+TEST_CASE("bloom_filter: bitset is invariant under dynamic vs static cuco::extent", "")
+{
+  using Key                        = int32_t;
+  using Policy                     = cuco::default_filter_policy<Key>;
+  constexpr std::size_t num_blocks = 1'000;
+  constexpr int32_t num_keys       = 400;
+
+  using dynamic_extent_t = cuco::extent<std::size_t>;
+  using static_extent_t  = cuco::extent<std::size_t, num_blocks>;
+  using filter_dynamic_t =
+    cuco::bloom_filter<Key, dynamic_extent_t, cuda::thread_scope_device, Policy>;
+  using filter_static_t =
+    cuco::bloom_filter<Key, static_extent_t, cuda::thread_scope_device, Policy>;
+
+  auto filter_dynamic = filter_dynamic_t{num_blocks};
+  auto filter_static  = filter_static_t{static_extent_t{}};
+
+  thrust::device_vector<Key> keys(num_keys);
+  thrust::sequence(thrust::device, keys.begin(), keys.end());
+
+  filter_dynamic.add(keys.begin(), keys.end());
+  filter_static.add(keys.begin(), keys.end());
+
+  auto const total_words = num_blocks * filter_dynamic_t::words_per_block;
+  REQUIRE(thrust::equal(thrust::device,
+                        filter_dynamic.data(),
+                        filter_dynamic.data() + total_words,
+                        filter_static.data()));
+}
diff --git a/tests/bloom_filter/merge_intersect_test.cu b/tests/bloom_filter/merge_intersect_test.cu
index 5638d58a8..a83c3bcb2 100644
--- a/tests/bloom_filter/merge_intersect_test.cu
+++ b/tests/bloom_filter/merge_intersect_test.cu
@@ -162,42 +162,16 @@ TEMPLATE_TEST_CASE_SIG(
   "bloom_filter merge and intersect tests",
   "",
   ((class Key, class Policy), Key, Policy),
-  (int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 1>),
-  (int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8>),
-  (int64_t, cuco::default_filter_policy<cuco::xxhash_64<int64_t>, uint64_t, 1>),
-  (int64_t, cuco::default_filter_policy<cuco::xxhash_64<int64_t>, uint64_t, 8>))
+  (int32_t, cuco::default_filter_policy<int32_t>),
+  (int32_t, cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 1, 1, 1, 1, 1, 1>),
+  (int32_t, cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8, 8, 8, 1, 1, 8>),
+  (int64_t, cuco::parametric_filter_policy<cuco::xxhash_64<int64_t>, uint64_t, 1, 1, 1, 1, 1, 1>),
+  (int64_t, cuco::parametric_filter_policy<cuco::xxhash_64<int64_t>, uint64_t, 8, 8, 8, 1, 1, 8>))
 {
   using filter_type =
     cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, Policy>;
   constexpr size_type capacity{1000};
 
-  uint32_t pattern_bits = Policy::words_per_block + GENERATE(0, 1);
-
-  // some parameter combinations might be invalid so we skip them
-  try {
-    [[maybe_unused]] auto policy = Policy{pattern_bits};
-  } catch (std::exception const& e) {
-    SKIP(e.what());
-  }
-
-  auto filter_a = filter_type{capacity, {}, {pattern_bits}};
-  auto filter_b = filter_type{capacity, {}, {pattern_bits}};
-  auto filter_c = filter_type{static_cast<size_t>(capacity) * 2, {}, {pattern_bits}};
-
-  test_merge_intersect(filter_a, filter_b, filter_c, capacity);
-}
-
-TEMPLATE_TEST_CASE_SIG("bloom_filter merge and intersect arrow tests",
-                       "",
-                       ((class Key, class Policy), Key, Policy),
-                       (int32_t, cuco::arrow_filter_policy<int32_t>),
-                       (int64_t, cuco::arrow_filter_policy<int64_t>),
-                       (float, cuco::arrow_filter_policy<float>))
-{
-  using filter_type =
-    cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, Policy>;
-  constexpr size_type capacity{1000};  // Must match capacity used in helper logic
-
   auto filter_a = filter_type{capacity};
   auto filter_b = filter_type{capacity};
   auto filter_c = filter_type{static_cast<size_t>(capacity) * 2};
diff --git a/tests/bloom_filter/unique_sequence_test.cu b/tests/bloom_filter/unique_sequence_test.cu
index 4cf24563c..0c212f755 100644
--- a/tests/bloom_filter/unique_sequence_test.cu
+++ b/tests/bloom_filter/unique_sequence_test.cu
@@ -86,38 +86,15 @@ void test_unique_sequence(Filter& filter, size_type num_keys)
 }
 
 TEMPLATE_TEST_CASE_SIG(
-  "bloom_filter default policy tests",
+  "bloom_filter parametric policy tests",
   "",
   ((class Key, class Policy), Key, Policy),
-  (int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 1>),
-  (int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8>),
-  (int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint64_t, 1>),
-  (int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint64_t, 8>))
-{
-  using filter_type =
-    cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, Policy>;
-  constexpr size_type num_keys{400};
-
-  uint32_t pattern_bits = Policy::words_per_block + GENERATE(0, 1, 2, 3, 4);
-
-  // some parameter combinations might be invalid so we skip them
-  try {
-    [[maybe_unused]] auto policy = Policy{pattern_bits};
-  } catch (std::exception const& e) {
-    SKIP(e.what());
-  }
-
-  auto filter = filter_type{1000, {}, {pattern_bits}};
-
-  test_unique_sequence(filter, num_keys);
-}
-
-TEMPLATE_TEST_CASE_SIG("bloom_filter arrow policy tests",
-                       "",
-                       ((class Key, class Policy), Key, Policy),
-                       (int32_t, cuco::arrow_filter_policy<int32_t>),
-                       (uint64_t, cuco::arrow_filter_policy<uint64_t>),
-                       (float, cuco::arrow_filter_policy<float>))
+  (int32_t, cuco::default_filter_policy<int32_t>),
+  (int32_t, cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 1, 1, 1, 1, 1, 1>),
+  (uint64_t,
+   cuco::parametric_filter_policy<cuco::xxhash_64<uint64_t>, uint32_t, 8, 12, 8, 1, 4, 2>),
+  (float, cuco::parametric_filter_policy<cuco::xxhash_64<float>, uint64_t, 4, 4, 2, 2, 1, 2>),
+  (int32_t, cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8, 8, 2, 2, 1, 8>))
 {
   using filter_type =
     cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, Policy>;
diff --git a/tests/bloom_filter/variable_cg_test.cu b/tests/bloom_filter/variable_cg_test.cu
index adb6b703f..6a7a21d51 100644
--- a/tests/bloom_filter/variable_cg_test.cu
+++ b/tests/bloom_filter/variable_cg_test.cu
@@ -14,100 +14,177 @@
  * limitations under the License.
  */
 
+// Exercises `cuco::bloom_filter_ref` device-side APIs (`ref.add(key)`, `ref.contains(key)`,
+// `ref.contains(group, key)`) directly from custom kernels, varying the cooperative group size.
+// The bulk host-side APIs route through the warp-cooperative kernel which already had correct
+// CG-reduction semantics; the scalar and CG ref methods are separate code paths that need their
+// own coverage.
+
 #include <test_utils.hpp>
 
 #include <cuco/bloom_filter.cuh>
 
-#include <cuda/functional>
-#include <cuda/iterator>
+#include <cuda/std/functional>
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
-#include <thrust/fill.h>
 #include <thrust/sequence.h>
 
+#include <cooperative_groups.h>
+
 #include <catch2/catch_template_test_macros.hpp>
-#include <catch2/generators/catch_generators.hpp>
 
 #include <cstdint>
-#include <exception>
 
 using size_type = int32_t;
 
-template <int32_t AddCGSize, int32_t ContainsCGSize, typename Filter>
-void test_variable_cg_size(Filter& filter, size_type num_keys)
+namespace cg = cooperative_groups;
+
+template <class Ref, class Key>
+__global__ void scalar_add_kernel(Ref ref, Key const* keys, size_type n)
 {
-  constexpr int32_t block_size = 128;
-  constexpr int32_t grid_size  = 128;
+  auto const i = static_cast<size_type>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (i < n) { ref.add(keys[i]); }
+}
 
-  using Key = typename Filter::key_type;
+template <class Ref, class Key>
+__global__ void scalar_contains_kernel(Ref ref, Key const* keys, size_type n, bool* out)
+{
+  auto const i = static_cast<size_type>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (i < n) { out[i] = ref.contains(keys[i]); }
+}
 
-  auto ref = filter.ref();
+template <int CGSize, class Ref, class Key>
+__global__ void cg_contains_consistency_kernel(Ref ref,
+                                               Key const* keys,
+                                               size_type n,
+                                               int* mismatches)
+{
+  auto const block = cg::this_thread_block();
+  auto const tile  = cg::tiled_partition<CGSize>(block);
+  auto const idx   = static_cast<size_type>((blockIdx.x * blockDim.x + threadIdx.x) / CGSize);
+  if (idx >= n) { return; }
+  bool const got       = ref.contains(tile, keys[idx]);
+  bool const all_agree = tile.all(got);
+  bool const any_agree = tile.any(got);
+  if (tile.thread_rank() == 0 && all_agree != any_agree) { atomicAdd(mismatches, 1); }
+}
+
+template <int CGSize, class Ref>
+__global__ void cooperative_clear_kernel(Ref ref)
+{
+  auto const block = cg::this_thread_block();
+  auto const tile  = cg::tiled_partition<CGSize>(block);
+  ref.clear(tile);
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "bloom_filter device ref scalar add and contains",
+  "",
+  ((class Key, class Policy), Key, Policy),
+  (int32_t, cuco::default_filter_policy<int32_t>),
+  (int32_t, cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 1, 1, 1, 1, 1, 1>),
+  (int32_t, cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8, 8, 4, 2, 4, 2>))
+{
+  using filter_type =
+    cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, Policy>;
+  constexpr size_type num_keys{400};
+
+  auto filter = filter_type{1000};
 
-  // Generate keys
   thrust::device_vector<Key> keys(num_keys);
   thrust::sequence(thrust::device, keys.begin(), keys.end());
-
   thrust::device_vector<bool> contained(num_keys, false);
 
-  auto const always_true = cuda::constant_iterator<bool>{true};
-
-  SECTION("Check if fallback kernels work for varying combinations of CG sizes.")
-  {
-    cuco::detail::bloom_filter_ns::add_if_n<AddCGSize, block_size>
-      <<<grid_size, block_size>>>(keys.begin(), num_keys, always_true, cuda::std::identity{}, ref);
-    cuco::detail::bloom_filter_ns::contains_if_n<ContainsCGSize, block_size>
-      <<<grid_size, block_size>>>(
-        keys.begin(), num_keys, always_true, cuda::std::identity{}, contained.begin(), ref);
-    REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{}));
-  }
-
-  filter.clear();
-  thrust::fill(contained.begin(), contained.end(), false);  // reset output vector
-
-  SECTION("Check if adaptive add kernel works with fallback contains kernel.")
-  {
-    cuco::detail::bloom_filter_ns::add<block_size>
-      <<<grid_size, block_size>>>(keys.begin(), num_keys, ref);
-    cuco::detail::bloom_filter_ns::contains_if_n<ContainsCGSize, block_size>
-      <<<grid_size, block_size>>>(
-        keys.begin(), num_keys, always_true, cuda::std::identity{}, contained.begin(), ref);
-    REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{}));
-  }
-
-  // TODO adaptive vs. adaptive and fallback add vs. adaptive contains (requires #673)
+  auto ref = filter.ref();
+
+  constexpr int block_size = 128;
+  int const grid_size      = (num_keys + block_size - 1) / block_size;
+
+  scalar_add_kernel<<<grid_size, block_size>>>(
+    ref, thrust::raw_pointer_cast(keys.data()), num_keys);
+  CUCO_CUDA_TRY(cudaDeviceSynchronize());
+
+  scalar_contains_kernel<<<grid_size, block_size>>>(ref,
+                                                    thrust::raw_pointer_cast(keys.data()),
+                                                    num_keys,
+                                                    thrust::raw_pointer_cast(contained.data()));
+  CUCO_CUDA_TRY(cudaDeviceSynchronize());
+
+  REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{}));
 }
 
 TEMPLATE_TEST_CASE_SIG(
-  "bloom_filter variable CG size tests",
+  "bloom_filter device ref CG contains is reduced across the group",
   "",
-  ((int32_t AddCGSize, int32_t ContainsCGSize, class Key, class Policy),
-   AddCGSize,
-   ContainsCGSize,
-   Key,
-   Policy),
-  (1, 4, int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 1>),
-  (1, 4, int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8>),
-  (1, 4, int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint64_t, 1>),
-  (1, 4, int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint64_t, 8>),
-  (4, 1, int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 1>),
-  (4, 1, int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8>),
-  (4, 1, int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint64_t, 1>),
-  (4, 1, int32_t, cuco::default_filter_policy<cuco::xxhash_64<int32_t>, uint64_t, 8>))
+  ((int32_t CGSize, class Key, class Policy), CGSize, Key, Policy),
+  (4,
+   int32_t,
+   cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8, 8, 4, 2, 4, 2>),
+  (8,
+   int32_t,
+   cuco::parametric_filter_policy<cuco::xxhash_64<int32_t>, uint32_t, 8, 8, 8, 1, 8, 1>))
+{
+  using filter_type =
+    cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, Policy>;
+  // Small filter + many probes drive non-inserted keys into the partial-match regime where
+  // different lanes in the tile see different per-slice match results. Without a group reduction
+  // in `ref.contains(group, key)`, each lane returns its slice's partial result and the tile
+  // disagrees on the answer.
+  constexpr size_type num_inserted{200};
+  constexpr size_type num_probed{2000};
+  constexpr size_type num_blocks{16};
+
+  auto filter = filter_type{num_blocks};
+  auto ref    = filter.ref();
+
+  thrust::device_vector<Key> insert_keys(num_inserted);
+  thrust::sequence(thrust::device, insert_keys.begin(), insert_keys.end());
+  filter.add(insert_keys.begin(), insert_keys.end());
+
+  thrust::device_vector<Key> probe_keys(num_probed);
+  thrust::sequence(thrust::device, probe_keys.begin(), probe_keys.end());
+
+  thrust::device_vector<int> mismatches(1, 0);
+
+  constexpr int block_size = 128;
+  int const grid_size      = (num_probed * CGSize + block_size - 1) / block_size;
+
+  cg_contains_consistency_kernel<CGSize>
+    <<<grid_size, block_size>>>(ref,
+                                thrust::raw_pointer_cast(probe_keys.data()),
+                                num_probed,
+                                thrust::raw_pointer_cast(mismatches.data()));
+  CUCO_CUDA_TRY(cudaDeviceSynchronize());
+
+  REQUIRE(static_cast<int>(mismatches[0]) == 0);
+}
+
+TEMPLATE_TEST_CASE_SIG("bloom_filter device ref cooperative clear",
+                       "",
+                       ((int32_t CGSize, class Key, class Policy), CGSize, Key, Policy),
+                       (1, int32_t, cuco::default_filter_policy<int32_t>),
+                       (4, int32_t, cuco::default_filter_policy<int32_t>),
+                       (8, int32_t, cuco::default_filter_policy<int32_t>),
+                       (32, int32_t, cuco::default_filter_policy<int32_t>))
 {
   using filter_type =
     cuco::bloom_filter<Key, cuco::extent<size_t>, cuda::thread_scope_device, Policy>;
   constexpr size_type num_keys{400};
 
-  uint32_t pattern_bits = Policy::words_per_block + GENERATE(0, 1, 2, 3, 4);
+  auto filter = filter_type{1000};
 
-  // some parameter combinations might be invalid so we skip them
-  try {
-    [[maybe_unused]] auto policy = Policy{pattern_bits};
-  } catch (std::exception const& e) {
-    SKIP(e.what());
-  }
+  thrust::device_vector<Key> keys(num_keys);
+  thrust::sequence(thrust::device, keys.begin(), keys.end());
+  filter.add(keys.begin(), keys.end());
+
+  thrust::device_vector<bool> contained(num_keys, false);
+  filter.contains(keys.begin(), keys.end(), contained.begin());
+  REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), cuda::std::identity{}));
 
-  auto filter = filter_type{1000, {}, {pattern_bits}};
+  // Device cooperative clear via a single tile that iterates over all filter words.
+  cooperative_clear_kernel<CGSize><<<1, CGSize>>>(filter.ref());
+  CUCO_CUDA_TRY(cudaDeviceSynchronize());
 
-  test_variable_cg_size<AddCGSize, ContainsCGSize>(filter, num_keys);
+  filter.contains(keys.begin(), keys.end(), contained.begin());
+  REQUIRE(cuco::test::none_of(contained.begin(), contained.end(), cuda::std::identity{}));
 }