Skip to content

Commit 5dc9937

Browse files
authored
[libc] Improve starting indices for GPU allocation (#150432)
Summary: The slots in this allocation scheme are statically allocated. All sizes share the same array of slots, but are given different starting locations to space them apart. The previous implementation used a trivial linear slice. This is inefficient because it provides the more likely allocations (1-1024 bytes) with just as much space as a highly unlikely one (1 MiB). This patch uses a cubic easing function to gradually shrink the gaps. For example, we used to get around 700 free slots for a 16 byte allocation, now we get around 2100 before it starts encroaching on the 32 byte allocation space. This could be improved further, but I think this is sufficient.
1 parent 9d642b0 commit 5dc9937

File tree

1 file changed

+13
-6
lines changed

1 file changed

+13
-6
lines changed

libc/src/__support/GPU/allocator.cpp

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
// Returns true iff 'x' is a nonzero power of two. Clearing the lowest set
// bit of a power of two yields zero, which is what the mask test checks.
static inline constexpr bool is_pow2(uint64_t x) {
  if (!x)
    return false;
  return (x & (x - 1)) == 0;
}
144144

145-
// Where this chunk size should start looking in the global array.
146-
static inline constexpr uint32_t start_index(uint32_t chunk_index) {
147-
return (ARRAY_SIZE * impl::get_chunk_id(chunk_index)) /
148-
impl::get_chunk_id(SLAB_SIZE / 2);
145+
// Where this chunk size should start looking in the global array. Small
146+
// allocations are much more likely than large ones, so we give them the most
147+
// space. We use a cubic easing function normalized on the possible chunks.
148+
static inline constexpr uint32_t start_index(uint32_t chunk_size) {
149+
constexpr uint32_t max_chunk = impl::get_chunk_id(SLAB_SIZE / 2);
150+
uint64_t norm =
151+
(1 << 16) - (impl::get_chunk_id(chunk_size) << 16) / max_chunk;
152+
uint64_t bias = (norm * norm * norm) >> 32;
153+
uint64_t inv = (1 << 16) - bias;
154+
return static_cast<uint32_t>(((ARRAY_SIZE - 1) * inv) >> 16);
149155
}
150156

151157
} // namespace impl
@@ -487,9 +493,10 @@ static Slab *find_slab(uint32_t chunk_size) {
487493
uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);
488494
uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
489495

490-
for (uint32_t offset = 0; offset < ARRAY_SIZE; ++offset) {
496+
for (uint32_t offset = 0; offset <= ARRAY_SIZE; ++offset) {
491497
uint32_t index =
492-
!offset ? start : (impl::start_index(chunk_size) + offset) % ARRAY_SIZE;
498+
!offset ? start
499+
: (impl::start_index(chunk_size) + offset - 1) % ARRAY_SIZE;
493500

494501
if (slots[index].use_count() < Slab::available_chunks(chunk_size)) {
495502
uint64_t lane_mask = gpu::get_lane_mask();

0 commit comments

Comments (0)