RMM  23.12
RAPIDS Memory Manager
arena_memory_resource.hpp
/*
 * Copyright (c) 2020-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <rmm/detail/error.hpp>
#include <rmm/detail/logging_assert.hpp>
#include <rmm/logger.hpp>
#include <rmm/mr/device/detail/arena.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <cuda_runtime_api.h>

#include <spdlog/common.h>

#include <cstddef>
#include <map>
#include <memory>
#include <optional>
#include <shared_mutex>
#include <thread>
#include <utility>

namespace rmm::mr {
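
/**
 * @brief A suballocator that emphasizes fragmentation avoidance and scalable concurrency support.
 *
 * Allocations are served from a per-thread arena (when the per-thread default stream is used) or a
 * per-stream arena, both of which draw memory from a global arena backed by the upstream memory
 * resource. A shared mutex allows concurrent allocation and deallocation on the fast path, falling
 * back to an exclusive lock and defragmentation when an arena runs out of memory.
 *
 * @tparam Upstream Memory resource to use for allocating memory for the global arena.
 */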
template <typename Upstream>
class arena_memory_resource final : public device_memory_resource {
 public:
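  /**
   * @brief Construct an `arena_memory_resource`.
   *
   * @param upstream_mr The memory resource from which to allocate blocks for the global arena.
   * @param arena_size Size in bytes of the global arena. If not specified, a default size is
   * chosen by the global arena.
   * @param dump_log_on_failure If true, dump the memory log when running out of memory.
   */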
  explicit arena_memory_resource(Upstream* upstream_mr,
                                 std::optional<std::size_t> arena_size = std::nullopt,
                                 bool dump_log_on_failure              = false)
    : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure}
  {
    if (dump_log_on_failure_) {
      logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log");
      // Set the level to `debug` for more detailed output.
      logger_->set_level(spdlog::level::info);
    }
  }

  ~arena_memory_resource() override = default;

  // Disable copy (and move) semantics.
  arena_memory_resource(arena_memory_resource const&)                = delete;
  arena_memory_resource& operator=(arena_memory_resource const&)     = delete;
  arena_memory_resource(arena_memory_resource&&) noexcept            = delete;
  arena_memory_resource& operator=(arena_memory_resource&&) noexcept = delete;
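
  /**
   * @brief Queries whether the resource supports use of non-null CUDA streams for
   * allocation/deallocation.
   *
   * @return true.
   */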
  bool supports_streams() const noexcept override { return true; }
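
  /**
   * @brief Query whether the resource supports the get_mem_info API.
   *
   * @return false.
   */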
  bool supports_get_mem_info() const noexcept override { return false; }

 private:
  using global_arena = rmm::mr::detail::arena::global_arena<Upstream>;
  using arena        = rmm::mr::detail::arena::arena<Upstream>;
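
  /**
   * @brief Allocates memory of size at least `bytes`.
   *
   * @throws rmm::out_of_memory if no more memory is available for the requested size.
   *
   * @param bytes The size in bytes of the allocation.
   * @param stream The stream to associate this allocation with.
   * @return void* Pointer to the newly allocated memory.
   */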
  void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
  {
    if (bytes <= 0) { return nullptr; }
#ifdef RMM_ARENA_USE_SIZE_CLASSES
    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
#else
    bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT);
#endif
    auto& arena = get_arena(stream);

    {
      std::shared_lock lock(mtx_);
      void* pointer = arena.allocate(bytes);
      if (pointer != nullptr) { return pointer; }
    }

    {
      std::unique_lock lock(mtx_);
      defragment();
      void* pointer = arena.allocate(bytes);
      if (pointer == nullptr) {
        if (dump_log_on_failure_) { dump_memory_log(bytes); }
        RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory);
      }
      return pointer;
    }
  }
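
  /**
   * @brief Defragment memory by synchronizing the device and returning free memory held by the
   * per-thread and per-stream arenas to the global arena.
   */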
  void defragment()
  {
    RMM_CUDA_TRY(cudaDeviceSynchronize());
    for (auto& thread_arena : thread_arenas_) {
      thread_arena.second->clean();
    }
    for (auto& stream_arena : stream_arenas_) {
      stream_arena.second.clean();
    }
  }
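
  /**
   * @brief Deallocate memory pointed to by `ptr` of size `bytes`.
   *
   * @param ptr Pointer to be deallocated.
   * @param bytes The size in bytes of the allocation. This must be equal to the value of `bytes`
   * that was passed to the `allocate` call that returned `ptr`.
   * @param stream Stream on which to perform deallocation.
   */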
  void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override
  {
    if (ptr == nullptr || bytes <= 0) { return; }
#ifdef RMM_ARENA_USE_SIZE_CLASSES
    bytes = rmm::mr::detail::arena::align_to_size_class(bytes);
#else
    bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT);
#endif
    auto& arena = get_arena(stream);

    {
      std::shared_lock lock(mtx_);
      // If the memory being freed does not belong to the arena, the following will return false.
      if (arena.deallocate(ptr, bytes, stream)) { return; }
    }

    {
      // Since we are returning this memory to another stream, we need to make sure the current
      // stream is caught up.
      stream.synchronize_no_throw();

      std::unique_lock lock(mtx_);
      deallocate_from_other_arena(ptr, bytes, stream);
    }
  }
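
  /**
   * @brief Deallocate memory pointed to by `ptr` that was allocated in a different arena.
   *
   * @param ptr Pointer to be deallocated.
   * @param bytes The size in bytes of the allocation.
   * @param stream Stream on which to perform deallocation.
   */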
  void deallocate_from_other_arena(void* ptr, std::size_t bytes, cuda_stream_view stream)
  {
    if (use_per_thread_arena(stream)) {
      for (auto const& thread_arena : thread_arenas_) {
        if (thread_arena.second->deallocate(ptr, bytes)) { return; }
      }
    } else {
      for (auto& stream_arena : stream_arenas_) {
        if (stream_arena.second.deallocate(ptr, bytes)) { return; }
      }
    }

    if (!global_arena_.deallocate(ptr, bytes)) {
      // It's possible to use per-thread default streams along with another pool of streams. This
      // means an allocation can move from a thread or stream arena back into the global arena
      // during defragmentation, and then down into the other kind of arena, e.g.
      // thread arena -> global arena -> stream arena. If the allocation was made while the memory
      // belonged to a thread arena, we now have to check whether it is currently part of a stream
      // arena, and vice versa. Only perform this check in this exceptional case, so the common
      // path does not have to search every arena.
      if (use_per_thread_arena(stream)) {
        for (auto& stream_arena : stream_arenas_) {
          if (stream_arena.second.deallocate(ptr, bytes)) { return; }
        }
      } else {
        for (auto const& thread_arena : thread_arenas_) {
          if (thread_arena.second->deallocate(ptr, bytes)) { return; }
        }
      }
      RMM_FAIL("allocation not found");
    }
  }
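
  /**
   * @brief Get the arena associated with the current thread or the given stream.
   *
   * @param stream The stream associated with the arena.
   * @return arena& The arena associated with the current thread or the given stream.
   */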
  arena& get_arena(cuda_stream_view stream)
  {
    if (use_per_thread_arena(stream)) { return get_thread_arena(); }
    return get_stream_arena(stream);
  }
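
  /**
   * @brief Get the arena associated with the current thread.
   *
   * @return arena& The arena associated with the current thread.
   */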
  arena& get_thread_arena()
  {
    auto const thread_id = std::this_thread::get_id();
    {
      std::shared_lock lock(map_mtx_);
      auto const iter = thread_arenas_.find(thread_id);
      if (iter != thread_arenas_.end()) { return *iter->second; }
    }
    {
      std::unique_lock lock(map_mtx_);
      auto thread_arena = std::make_shared<arena>(global_arena_);
      thread_arenas_.emplace(thread_id, thread_arena);
      thread_local detail::arena::arena_cleaner<Upstream> cleaner{thread_arena};
      return *thread_arena;
    }
  }
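
  /**
   * @brief Get the arena associated with the given stream.
   *
   * @param stream The stream associated with the arena.
   * @return arena& The arena associated with the given stream.
   */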
  arena& get_stream_arena(cuda_stream_view stream)
  {
    RMM_LOGGING_ASSERT(!use_per_thread_arena(stream));
    {
      std::shared_lock lock(map_mtx_);
      auto const iter = stream_arenas_.find(stream.value());
      if (iter != stream_arenas_.end()) { return iter->second; }
    }
    {
      std::unique_lock lock(map_mtx_);
      stream_arenas_.emplace(stream.value(), global_arena_);
      return stream_arenas_.at(stream.value());
    }
  }
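
  /**
   * @brief Get free and available memory for the memory resource.
   *
   * @param stream The stream whose memory manager to query (unused).
   * @return A pair of free and total memory; this resource does not support `get_mem_info`, so
   * {0, 0} is returned.
   */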
  std::pair<std::size_t, std::size_t> do_get_mem_info(
    [[maybe_unused]] cuda_stream_view stream) const override
  {
    return std::make_pair(0, 0);
  }
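
  /**
   * @brief Dump memory information to the log.
   *
   * @param bytes The size in bytes of the allocation that failed.
   */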
  void dump_memory_log(size_t bytes)
  {
    logger_->info("**************************************************");
    logger_->info("Ran out of memory trying to allocate {}.", rmm::detail::bytes{bytes});
    logger_->info("**************************************************");
    logger_->info("Global arena:");
    global_arena_.dump_memory_log(logger_);
    logger_->flush();
  }
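
  /**
   * @brief Should a per-thread arena be used for the given stream?
   *
   * @param stream The stream to check.
   * @return true if the stream is the per-thread default stream, false otherwise.
   */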
  static bool use_per_thread_arena(cuda_stream_view stream)
  {
    return stream.is_per_thread_default();
  }

  /// The global arena to allocate superblocks from.
  global_arena global_arena_;
  /// Arenas for per-thread default streams, one per thread.
  std::map<std::thread::id, std::shared_ptr<arena>> thread_arenas_;
  /// Arenas for non-default streams, one per stream.
  std::map<cudaStream_t, arena> stream_arenas_;
  /// If true, dump memory information to the log on allocation failure.
  bool dump_log_on_failure_{};
  /// The logger for the memory dump.
  std::shared_ptr<spdlog::logger> logger_{};
  /// Mutex for creating and looking up per-thread and per-stream arenas.
  mutable std::shared_mutex map_mtx_;
  /// Mutex for arena allocation: shared for the fast path, exclusive for defragmentation.
  mutable std::shared_mutex mtx_;
};

}  // namespace rmm::mr
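
A minimal usage sketch, assuming rmm::mr::cuda_memory_resource as the upstream resource (any other
device_memory_resource works the same way); the allocation size and the call to
set_current_device_resource are illustrative choices, not requirements of this header:

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

int main()
{
  // Upstream resource that calls cudaMalloc/cudaFree directly.
  rmm::mr::cuda_memory_resource upstream;

  // Arena resource drawing its global arena from the upstream resource.
  rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> arena_mr{&upstream};

  // Optionally route RMM containers (e.g. rmm::device_buffer) through the arena.
  rmm::mr::set_current_device_resource(&arena_mr);

  // Direct use of the device_memory_resource interface on the default stream.
  rmm::cuda_stream_view stream{};
  void* ptr = arena_mr.allocate(1024, stream);
  arena_mr.deallocate(ptr, 1024, stream);
  return 0;
}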