thrust/mr/pool.h
/*
 * Copyright 2018 NVIDIA Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <algorithm>

#include <thrust/host_vector.h>

#include <thrust/mr/memory_resource.h>
#include <thrust/mr/allocator.h>
#include <thrust/mr/pool_options.h>

#include <cassert>

namespace thrust
{
namespace mr
{

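// A memory resource adaptor that pools and caches allocations from Upstream.
// As the name says, this resource is unsynchronized: concurrent access from
// multiple threads must be coordinated by the caller.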
template<typename Upstream>
class unsynchronized_pool_resource THRUST_FINAL
    : public memory_resource<typename Upstream::pointer>,
      private validator<Upstream>
{
public:
    // Get the default options for a pool. These are a sensible starting
    // point; tweak the returned object before passing it to the constructor
    // to customize the pool.
    static pool_options get_default_options()
    {
        pool_options ret;

        ret.min_blocks_per_chunk = 16;
        ret.min_bytes_per_chunk = 1024;
        ret.max_blocks_per_chunk = static_cast<std::size_t>(1) << 20;
        ret.max_bytes_per_chunk = static_cast<std::size_t>(1) << 30;

        ret.smallest_block_size = THRUST_MR_DEFAULT_ALIGNMENT;
        ret.largest_block_size = static_cast<std::size_t>(1) << 20;

        ret.alignment = THRUST_MR_DEFAULT_ALIGNMENT;

        ret.cache_oversized = true;

        ret.cached_size_cutoff_factor = 16;
        ret.cached_alignment_cutoff_factor = 16;

        return ret;
    }

    // Construct a pool resource that draws memory from the given upstream
    // resource.
    unsynchronized_pool_resource(Upstream * upstream, pool_options options = get_default_options())
        : m_upstream(upstream),
          m_options(options),
          m_smallest_block_log2(detail::log2_ri(m_options.smallest_block_size)),
          m_pools(upstream),
          m_allocated(),
          m_oversized(),
          m_cached_oversized()
    {
        assert(m_options.validate());

        pool p = { block_descriptor_ptr(), 0 };
        m_pools.resize(detail::log2_ri(m_options.largest_block_size) - m_smallest_block_log2 + 1, p);
    }

    // TODO: C++11: use delegating constructors

    // Construct a pool resource that draws memory from the global instance
    // of the Upstream resource.
    unsynchronized_pool_resource(pool_options options = get_default_options())
        : m_upstream(get_global_resource<Upstream>()),
          m_options(options),
          m_smallest_block_log2(detail::log2_ri(m_options.smallest_block_size)),
          m_pools(get_global_resource<Upstream>()),
          m_allocated(),
          m_oversized(),
          m_cached_oversized()
    {
        assert(m_options.validate());

        pool p = { block_descriptor_ptr(), 0 };
        m_pools.resize(detail::log2_ri(m_options.largest_block_size) - m_smallest_block_log2 + 1, p);
    }

    // Releases all held memory back to Upstream on destruction.
    ~unsynchronized_pool_resource()
    {
        release();
    }

private:
    typedef typename Upstream::pointer void_ptr;
    typedef typename thrust::detail::pointer_traits<void_ptr>::template rebind<char>::other char_ptr;

    struct block_descriptor;
    struct chunk_descriptor;
    struct oversized_block_descriptor;

    typedef typename thrust::detail::pointer_traits<void_ptr>::template rebind<block_descriptor>::other block_descriptor_ptr;
    typedef typename thrust::detail::pointer_traits<void_ptr>::template rebind<chunk_descriptor>::other chunk_descriptor_ptr;
    typedef typename thrust::detail::pointer_traits<void_ptr>::template rebind<oversized_block_descriptor>::other oversized_block_descriptor_ptr;

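    // Every descriptor lives at the *tail* of the memory it describes: the
    // usable bytes come first and the descriptor is written right after them.
    // This is why the pointer arithmetic below recovers a payload pointer by
    // subtracting the size from a descriptor's address.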
    struct block_descriptor
    {
        block_descriptor_ptr next;
    };

    struct chunk_descriptor
    {
        std::size_t size;
        chunk_descriptor_ptr next;
    };

    // This was originally a forward list, but it is now a doubly linked list:
    // that way, deallocation when not caching is faster and doesn't require
    // traversing the list. (The cached list is still a forward list, because
    // allocation from it already traverses.)
    //
    // TODO: investigate whether it's better to keep this a doubly linked list
    // with fast do_deallocate when !m_options.cache_oversized, or to make it
    // a forward list and require traversal in do_deallocate.
    //
    // The doubly linked list is assumed to be the better trade-off, but the
    // extra pointer could potentially hurt: these blocks are oversized and/or
    // overaligned, so they are memory intensive already.
    struct oversized_block_descriptor
    {
        std::size_t size;
        std::size_t alignment;
        oversized_block_descriptor_ptr prev;
        oversized_block_descriptor_ptr next;
        oversized_block_descriptor_ptr next_cached;
    };

    struct pool
    {
        block_descriptor_ptr free_list;
        std::size_t previous_allocated_count;
    };

    typedef thrust::host_vector<
        pool,
        allocator<pool, Upstream>
    > pool_vector;

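    // m_allocated tracks every chunk obtained from upstream; m_oversized
    // tracks every oversized/overaligned block; m_cached_oversized threads
    // through the subset of m_oversized that is currently free and cached
    // for reuse.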
    Upstream * m_upstream;

    pool_options m_options;
    std::size_t m_smallest_block_log2;

    pool_vector m_pools;
    chunk_descriptor_ptr m_allocated;
    oversized_block_descriptor_ptr m_oversized;
    oversized_block_descriptor_ptr m_cached_oversized;

public:
    // Releases all memory held by this resource back to upstream.
    void release()
    {
        // reset the buckets
        for (std::size_t i = 0; i < m_pools.size(); ++i)
        {
            thrust::raw_reference_cast(m_pools[i]).free_list = block_descriptor_ptr();
            thrust::raw_reference_cast(m_pools[i]).previous_allocated_count = 0;
        }

        // deallocate memory allocated for the buckets
        while (detail::pointer_traits<chunk_descriptor_ptr>::get(m_allocated))
        {
            chunk_descriptor_ptr alloc = m_allocated;
            m_allocated = thrust::raw_reference_cast(*m_allocated).next;

            void_ptr p = static_cast<void_ptr>(
                static_cast<char_ptr>(
                    static_cast<void_ptr>(alloc)
                ) - thrust::raw_reference_cast(*alloc).size
            );
            m_upstream->do_deallocate(p, thrust::raw_reference_cast(*alloc).size + sizeof(chunk_descriptor), m_options.alignment);
        }

        // deallocate cached oversized/overaligned memory
        while (detail::pointer_traits<oversized_block_descriptor_ptr>::get(m_oversized))
        {
            oversized_block_descriptor_ptr alloc = m_oversized;
            m_oversized = thrust::raw_reference_cast(*m_oversized).next;

            void_ptr p = static_cast<void_ptr>(
                static_cast<char_ptr>(
                    static_cast<void_ptr>(alloc)
                ) - thrust::raw_reference_cast(*alloc).size
            );
            m_upstream->do_deallocate(p, thrust::raw_reference_cast(*alloc).size + sizeof(oversized_block_descriptor), thrust::raw_reference_cast(*alloc).alignment);
        }

        m_cached_oversized = oversized_block_descriptor_ptr();
    }

    THRUST_NODISCARD virtual void_ptr do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
    {
        bytes = (std::max)(bytes, m_options.smallest_block_size);
        assert(detail::is_power_of_2(alignment));

        // an oversized and/or overaligned allocation requested; needs to be allocated separately
        if (bytes > m_options.largest_block_size || alignment > m_options.alignment)
        {
            if (m_options.cache_oversized)
            {
                // walk the cached list, looking for a block that fits
                oversized_block_descriptor_ptr ptr = m_cached_oversized;
                oversized_block_descriptor_ptr * previous = &m_cached_oversized;
                while (detail::pointer_traits<oversized_block_descriptor_ptr>::get(ptr))
                {
                    oversized_block_descriptor desc = *ptr;
                    bool is_good = desc.size >= bytes && desc.alignment >= alignment;

                    // if the size is bigger than the requested size by a factor
                    // bigger than or equal to the specified cutoff for size,
                    // allocate a new block
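                    // (e.g. with a cutoff factor of 16, a cached 64 KiB block
                    // is not reused for a 1 KiB request, since
                    // 64 KiB / 1 KiB == 64 >= 16)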
                    if (is_good)
                    {
                        std::size_t size_factor = desc.size / bytes;
                        if (size_factor >= m_options.cached_size_cutoff_factor)
                        {
                            is_good = false;
                        }
                    }

                    // if the alignment is bigger than the requested one by a factor
                    // bigger than or equal to the specified cutoff for alignment,
                    // allocate a new block
                    if (is_good)
                    {
                        std::size_t alignment_factor = desc.alignment / alignment;
                        if (alignment_factor >= m_options.cached_alignment_cutoff_factor)
                        {
                            is_good = false;
                        }
                    }

                    if (is_good)
                    {
                        // unlink the block from the cached list and return it
                        if (previous != &m_cached_oversized)
                        {
                            oversized_block_descriptor previous_desc = **previous;
                            previous_desc.next_cached = desc.next_cached;
                            **previous = previous_desc;
                        }
                        else
                        {
                            m_cached_oversized = desc.next_cached;
                        }

                        desc.next_cached = oversized_block_descriptor_ptr();
                        *ptr = desc;

                        return static_cast<void_ptr>(
                            static_cast<char_ptr>(
                                static_cast<void_ptr>(ptr)
                            ) - desc.size
                        );
                    }

                    previous = &thrust::raw_reference_cast(*ptr).next_cached;
                    ptr = *previous;
                }
            }

            // no fitting cached block found; allocate a new one that's just up to the specs
            void_ptr allocated = m_upstream->do_allocate(bytes + sizeof(oversized_block_descriptor), alignment);
            oversized_block_descriptor_ptr block = static_cast<oversized_block_descriptor_ptr>(
                static_cast<void_ptr>(
                    static_cast<char_ptr>(allocated) + bytes
                )
            );

            oversized_block_descriptor desc;
            desc.size = bytes;
            desc.alignment = alignment;
            desc.prev = oversized_block_descriptor_ptr();
            desc.next = m_oversized;
            desc.next_cached = oversized_block_descriptor_ptr();
            *block = desc;
            m_oversized = block;

            // link the previous head of the oversized list back to this block
            if (detail::pointer_traits<oversized_block_descriptor_ptr>::get(desc.next))
            {
                oversized_block_descriptor next = *desc.next;
                next.prev = block;
                *desc.next = next;
            }

            return allocated;
        }

        // the request is NOT for oversized and/or overaligned memory;
        // allocate a block from an appropriate bucket
        std::size_t bytes_log2 = thrust::detail::log2_ri(bytes);
        std::size_t bucket_idx = bytes_log2 - m_smallest_block_log2;
        pool & bucket = thrust::raw_reference_cast(m_pools[bucket_idx]);

        bytes = static_cast<std::size_t>(1) << bytes_log2;

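        // (e.g. if smallest_block_size is 16, so m_smallest_block_log2 is 4,
        // a 100-byte request rounds up to bytes_log2 == 7, lands in bucket
        // 7 - 4 == 3, and is served as a 128-byte block)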
        // if the free list of the bucket has no elements, allocate a new chunk
        // and split it into blocks pushed to the free list
        if (!detail::pointer_traits<block_descriptor_ptr>::get(bucket.free_list))
        {
            // grow the chunk size geometrically: start from the configured
            // minimums, then allocate 1.5x the previous count, clamped to
            // the configured maximums
            std::size_t n = bucket.previous_allocated_count;
            if (n == 0)
            {
                n = m_options.min_blocks_per_chunk;
                if (n < (m_options.min_bytes_per_chunk >> bytes_log2))
                {
                    n = m_options.min_bytes_per_chunk >> bytes_log2;
                }
            }
            else
            {
                n = n * 3 / 2;
                if (n > (m_options.max_bytes_per_chunk >> bytes_log2))
                {
                    n = m_options.max_bytes_per_chunk >> bytes_log2;
                }
                if (n > m_options.max_blocks_per_chunk)
                {
                    n = m_options.max_blocks_per_chunk;
                }
            }

            // block_size is the payload plus its trailing descriptor,
            // padded to the pool's alignment
            std::size_t descriptor_size = (std::max)(sizeof(block_descriptor), m_options.alignment);
            std::size_t block_size = bytes + descriptor_size;
            block_size += m_options.alignment - block_size % m_options.alignment;
            std::size_t chunk_size = block_size * n;

            void_ptr allocated = m_upstream->do_allocate(chunk_size + sizeof(chunk_descriptor), m_options.alignment);
            chunk_descriptor_ptr chunk = static_cast<chunk_descriptor_ptr>(
                static_cast<void_ptr>(
                    static_cast<char_ptr>(allocated) + chunk_size
                )
            );

            chunk_descriptor desc;
            desc.size = chunk_size;
            desc.next = m_allocated;
            *chunk = desc;
            m_allocated = chunk;
            // remember how many blocks this chunk held, so the next chunk
            // allocated for this bucket can grow geometrically
            bucket.previous_allocated_count = n;

            // split the chunk into blocks pushed onto the bucket's free list
            for (std::size_t i = 0; i < n; ++i)
            {
                block_descriptor_ptr block = static_cast<block_descriptor_ptr>(
                    static_cast<void_ptr>(
                        static_cast<char_ptr>(allocated) + block_size * i + bytes
                    )
                );

                block_descriptor desc;
                desc.next = bucket.free_list;
                *block = desc;
                bucket.free_list = block;
            }
        }

        // allocate a block from the front of the bucket's free list
        block_descriptor_ptr block = bucket.free_list;
        bucket.free_list = thrust::raw_reference_cast(*block).next;
        return static_cast<void_ptr>(
            static_cast<char_ptr>(
                static_cast<void_ptr>(block)
            ) - bytes
        );
    }

    virtual void do_deallocate(void_ptr p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
    {
        n = (std::max)(n, m_options.smallest_block_size);
        assert(detail::is_power_of_2(alignment));

        // verify that the pointer is at least as aligned as claimed
        assert(reinterpret_cast<detail::intmax_t>(detail::pointer_traits<void_ptr>::get(p)) % alignment == 0);

        // the deallocated block is oversized and/or overaligned
        if (n > m_options.largest_block_size || alignment > m_options.alignment)
        {
            oversized_block_descriptor_ptr block = static_cast<oversized_block_descriptor_ptr>(
                static_cast<void_ptr>(
                    static_cast<char_ptr>(p) + n
                )
            );

            oversized_block_descriptor desc = *block;

            // if caching is enabled, just push the block onto the cached list
            if (m_options.cache_oversized)
            {
                desc.next_cached = m_cached_oversized;
                *block = desc;
                m_cached_oversized = block;

                return;
            }

            // otherwise unlink it from the doubly linked list of oversized
            // blocks and return it to upstream immediately
            if (!detail::pointer_traits<oversized_block_descriptor_ptr>::get(desc.prev))
            {
                assert(m_oversized == block);
                m_oversized = desc.next;
            }
            else
            {
                oversized_block_descriptor prev = *desc.prev;
                assert(prev.next == block);
                prev.next = desc.next;
                *desc.prev = prev;
            }

            if (detail::pointer_traits<oversized_block_descriptor_ptr>::get(desc.next))
            {
                oversized_block_descriptor next = *desc.next;
                assert(next.prev == block);
                next.prev = desc.prev;
                *desc.next = next;
            }

            m_upstream->do_deallocate(p, desc.size + sizeof(oversized_block_descriptor), desc.alignment);

            return;
        }

        // push the block to the front of the appropriate bucket's free list
        std::size_t n_log2 = thrust::detail::log2_ri(n);
        std::size_t bucket_idx = n_log2 - m_smallest_block_log2;
        pool & bucket = thrust::raw_reference_cast(m_pools[bucket_idx]);

        n = static_cast<std::size_t>(1) << n_log2;

        block_descriptor_ptr block = static_cast<block_descriptor_ptr>(
            static_cast<void_ptr>(
                static_cast<char_ptr>(p) + n
            )
        );

        block_descriptor desc;
        desc.next = bucket.free_list;
        *block = desc;
        bucket.free_list = block;
    }
};

} // end mr
} // end thrust
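
A minimal usage sketch (illustrative, not part of pool.h), assuming thrust::mr::new_delete_resource from <thrust/mr/new.h> as the upstream resource; any resource with a compatible pointer type works the same way:

#include <thrust/mr/new.h>
#include <thrust/mr/pool.h>

int main()
{
    typedef thrust::mr::new_delete_resource upstream_t;
    upstream_t upstream;

    // start from the defaults and tweak them before constructing the pool
    thrust::mr::pool_options options =
        thrust::mr::unsynchronized_pool_resource<upstream_t>::get_default_options();
    options.cache_oversized = false;

    thrust::mr::unsynchronized_pool_resource<upstream_t> pool(&upstream, options);

    // a 100-byte request is rounded up to a power-of-2 block from a pooled chunk
    void * p = pool.allocate(100);
    pool.deallocate(p, 100);

    // return all memory held by the pool to upstream (also done by the destructor)
    pool.release();
}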