Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 17 additions & 11 deletions include/cuco/bucket_storage.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2025, NVIDIA CORPORATION.
* Copyright (c) 2022-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -21,12 +21,12 @@
#include <cuco/utility/allocator.hpp>

#include <cuda/std/array>
#include <cuda/std/bit>
#include <cuda/std/functional>
#include <cuda/stream_ref>

#include <cstddef>
#include <cstdint>
#include <iterator>
#include <memory>

namespace cuco {
Expand All @@ -40,14 +40,14 @@ namespace cuco {
template <typename T, int32_t BucketSize, typename Extent = cuco::extent<std::size_t>>
class bucket_storage_ref {
public:
static constexpr int32_t bucket_size = BucketSize; ///< Number of elements processed per bucket
static constexpr std::size_t alignment =
cuda::std::min(sizeof(T) * bucket_size, std::size_t{16}); ///< Required alignment
static constexpr int32_t bucket_size = BucketSize; ///< Number of elements processed per bucket
using bucket_type = cuda::std::array<T, BucketSize>; ///< Slot bucket type
static constexpr std::size_t alignment = cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)),
std::size_t{16}); ///< Required alignment

using extent_type = Extent; ///< Storage extent type
using size_type = typename extent_type::value_type; ///< Storage size type
using value_type = T; ///< Slot type
using bucket_type = cuda::std::array<T, BucketSize>; ///< Slot bucket type

/**
* @brief Constructor of slot storage ref.
Expand Down Expand Up @@ -150,7 +150,7 @@ class bucket_storage {
using value_type = T; ///< Slot type
using bucket_type = cuda::std::array<T, BucketSize>; ///< Slot bucket type

/// Type of the allocator to (de)allocate buckets
/// Type of the allocator to (de)allocate slots
using allocator_type =
typename std::allocator_traits<Allocator>::template rebind_alloc<value_type>;
using ref_type = bucket_storage_ref<value_type, bucket_size, extent_type>; ///< Storage ref type
Expand Down Expand Up @@ -244,13 +244,19 @@ class bucket_storage {
[[nodiscard]] __host__ __device__ constexpr extent_type extent() const noexcept;

private:
using slot_deleter_type =
detail::custom_deleter<size_type, allocator_type>; ///< Type of slot deleter
struct aligned_deleter {
value_type* raw_ptr_;
std::size_t size_;
allocator_type& allocator_;
cuda::stream_ref stream_;

void operator()(value_type*) const { allocator_.deallocate(raw_ptr_, size_, stream_); }
};

extent_type extent_; ///< Storage extent
allocator_type allocator_; ///< Allocator used to (de)allocate slots
/// Pointer to the slot storage
std::unique_ptr<value_type, slot_deleter_type> slots_;
/// Pointer to the aligned slot storage
std::unique_ptr<value_type, aligned_deleter> slots_;
};
} // namespace cuco

Expand Down
18 changes: 13 additions & 5 deletions include/cuco/detail/storage/bucket_storage.inl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2025, NVIDIA CORPORATION.
* Copyright (c) 2022-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -23,9 +23,11 @@
#include <cub/device/device_for.cuh>
#include <cuda/std/array>
#include <cuda/std/bit>
#include <cuda/std/cstdint>
#include <cuda/stream_ref>

#include <cassert>
#include <memory>

namespace cuco {

Expand Down Expand Up @@ -98,10 +100,16 @@ bucket_storage_ref<T, BucketSize, Extent>::extent() const noexcept
template <typename T, int BucketSize, typename Extent, typename Allocator>
constexpr bucket_storage<T, BucketSize, Extent, Allocator>::bucket_storage(
Extent size, Allocator const& allocator, cuda::stream_ref stream)
: extent_{size},
allocator_{allocator},
slots_{allocator_.allocate(capacity(), stream),
slot_deleter_type{capacity(), allocator_, stream}}
: extent_{size}, allocator_{allocator}, slots_{[this, &stream]() {
constexpr std::size_t align = ref_type::alignment;
constexpr std::size_t extra = (align - 1) / sizeof(value_type) + 1;
std::size_t const alloc_size = static_cast<std::size_t>(capacity()) + extra;
auto* const raw_ptr = allocator_.allocate(alloc_size, stream);
auto* const aligned_ptr = reinterpret_cast<value_type*>(
(reinterpret_cast<cuda::std::uintptr_t>(raw_ptr) + align - 1) & ~(align - 1));
return std::unique_ptr<value_type, aligned_deleter>{
aligned_ptr, aligned_deleter{raw_ptr, alloc_size, allocator_, stream}};
}()}
{
}

Expand Down
109 changes: 108 additions & 1 deletion tests/utility/storage_test.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2025, NVIDIA CORPORATION.
* Copyright (c) 2022-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -21,8 +21,12 @@
#include <cuco/pair.cuh>
#include <cuco/utility/allocator.hpp>

#include <cuda/std/bit>

#include <catch2/catch_template_test_macros.hpp>

#include <cstdint>

TEMPLATE_TEST_CASE_SIG("utility storage tests",
"",
((typename Key, typename Value), Key, Value),
Expand Down Expand Up @@ -96,4 +100,107 @@ TEMPLATE_TEST_CASE_SIG("utility storage tests",
STATIC_REQUIRE(num_buckets == size / bucket_size);
STATIC_REQUIRE(capacity == gold_capacity);
}

SECTION("Storage alignment constant is correct for pairs.")
{
using storage_ref_type =
cuco::bucket_storage_ref<cuco::pair<Key, Value>, bucket_size, cuco::extent<std::size_t>>;
using bucket_type = typename storage_ref_type::bucket_type;

constexpr auto alignment = storage_ref_type::alignment;
constexpr auto expected_align =
cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), std::size_t{16});

STATIC_REQUIRE(alignment == expected_align);
STATIC_REQUIRE(cuda::std::has_single_bit(alignment));
}

SECTION("Storage alignment constant is correct for keys.")
{
using storage_ref_type = cuco::bucket_storage_ref<Key, bucket_size, cuco::extent<std::size_t>>;
using bucket_type = typename storage_ref_type::bucket_type;

constexpr auto alignment = storage_ref_type::alignment;
constexpr auto expected_align =
cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), std::size_t{16});

STATIC_REQUIRE(alignment == expected_align);
STATIC_REQUIRE(cuda::std::has_single_bit(alignment));
}

SECTION("Storage data pointer is aligned to bucket boundary for pairs.")
{
auto s = cuco::bucket_storage<cuco::pair<Key, Value>,
bucket_size,
cuco::extent<std::size_t>,
allocator_type>(
cuco::extent{size}, allocator, cuda::stream_ref{cudaStream_t{nullptr}});

auto const ptr = reinterpret_cast<std::uintptr_t>(s.data());
auto const alignment = decltype(s)::ref_type::alignment;

REQUIRE((ptr % alignment) == 0);
}

SECTION("Storage data pointer is aligned to bucket boundary for keys.")
{
auto s = cuco::bucket_storage<Key, bucket_size, cuco::extent<std::size_t>, allocator_type>(
cuco::extent{size}, allocator, cuda::stream_ref{cudaStream_t{nullptr}});

auto const ptr = reinterpret_cast<std::uintptr_t>(s.data());
auto const alignment = decltype(s)::ref_type::alignment;

REQUIRE((ptr % alignment) == 0);
}
}

TEMPLATE_TEST_CASE_SIG("bucket storage alignment with different bucket sizes",
"",
((typename T, int BucketSize), T, BucketSize),
(int32_t, 1),
(int32_t, 2),
(int32_t, 4),
(int64_t, 1),
(int64_t, 2),
(cuco::pair<int32_t, int32_t>, 1),
(cuco::pair<int32_t, int32_t>, 2),
(cuco::pair<int64_t, int64_t>, 1))
{
constexpr std::size_t size{1'000};

using allocator_type = cuco::cuda_allocator<char>;
using storage_type =
cuco::bucket_storage<T, BucketSize, cuco::extent<std::size_t>, allocator_type>;
using storage_ref_type = typename storage_type::ref_type;
using bucket_type = typename storage_ref_type::bucket_type;

auto allocator = allocator_type{};

SECTION("Alignment constant is power of 2 and capped at 16.")
{
constexpr auto alignment = storage_ref_type::alignment;

STATIC_REQUIRE(cuda::std::has_single_bit(alignment));
STATIC_REQUIRE(alignment <= 16);
STATIC_REQUIRE(alignment >= sizeof(T));
}

SECTION("Alignment matches expected value.")
{
constexpr auto alignment = storage_ref_type::alignment;
constexpr auto expected =
cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), std::size_t{16});

STATIC_REQUIRE(alignment == expected);
}

SECTION("Data pointer is aligned to bucket boundary.")
{
auto s = storage_type(cuco::extent{size}, allocator, cuda::stream_ref{cudaStream_t{nullptr}});

auto const ptr = reinterpret_cast<std::uintptr_t>(s.data());
auto const alignment = storage_ref_type::alignment;

REQUIRE((ptr % alignment) == 0);
}
}
Loading