Itoyori v0.0.1
allocator.hpp
#pragma once

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <atomic>
#include <memory>
#include <sstream>
#include <string>
#include <vector>
#include <sys/mman.h>

#if ITYR_ALLOCATOR_USE_BOOST
#include <boost/container/pmr/memory_resource.hpp>
#include <boost/container/pmr/unsynchronized_pool_resource.hpp>
#include <boost/container/pmr/pool_options.hpp>
namespace ityr::common { namespace pmr = boost::container::pmr; }
#else
#include <memory_resource>
namespace ityr::common { namespace pmr = std::pmr; }
#endif

#include "ityr/common/util.hpp"
#include "ityr/common/mpi_util.hpp"
#include "ityr/common/mpi_rma.hpp"
#include "ityr/common/topology.hpp"
#include "ityr/common/virtual_mem.hpp"
#include "ityr/common/physical_mem.hpp"
#include "ityr/common/freelist.hpp"
#include "ityr/common/profiler.hpp"
#include "ityr/common/prof_events.hpp"
#include "ityr/common/options.hpp"
namespace ityr::common {

inline constexpr bool use_dynamic_win = ITYR_ALLOCATOR_USE_DYNAMIC_WIN;

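// The allocator stacks three memory resources, each falling back to the one
// below it for requests it cannot satisfy:
//
//   std_pool_mr_ (pmr::unsynchronized_pool_resource)
//     -> block_mr_ (block_resource: splits fixed-size blocks into small allocations)
//       -> win_mr_ (mpi_win_resource: page-aligned carving of the MPI window region)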
class mpi_win_resource final : public pmr::memory_resource {
public:
  mpi_win_resource(void* base_addr,
                   std::size_t max_size,
                   MPI_Win win)
    : win_(win),
      freelist_(reinterpret_cast<uintptr_t>(base_addr), max_size) {}

  void* do_allocate(std::size_t bytes, std::size_t alignment) override {
    if (alignment % get_page_size() != 0) {
      die("[ityr::common::allocator] Requests for mpi_win_resource must be page-aligned");
    }

    // Align with page size
    std::size_t real_bytes = round_up_pow2(bytes, get_page_size());

    auto s = freelist_.get(real_bytes, alignment);
    if (!s.has_value()) {
      die("[ityr::common::allocator] Could not allocate memory for malloc_local()");
    }

    void* ret = reinterpret_cast<void*>(*s);

    if constexpr (use_dynamic_win) {
      MPI_Win_attach(win_, ret, real_bytes);
    }

    return ret;
  }

  void do_deallocate(void* p, std::size_t bytes, std::size_t alignment) override {
    if (alignment % get_page_size() != 0) {
      die("[ityr::common::allocator] Requests for mpi_win_resource must be page-aligned");
    }

    // Align with page size
    std::size_t real_bytes = round_up_pow2(bytes, get_page_size());

    if constexpr (use_dynamic_win) {
      MPI_Win_detach(win_, p);

      if (madvise(p, real_bytes, MADV_REMOVE) == -1) {
        perror("madvise");
        die("[ityr::common::allocator] madvise() failed");
      }
    }

    freelist_.add(reinterpret_cast<uintptr_t>(p), real_bytes);
  }

  bool do_is_equal(const pmr::memory_resource& other) const noexcept override {
    return this == &other;
  }

private:
  MPI_Win win_;
  freelist freelist_;
};

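// When use_dynamic_win is enabled, each allocated region is attached to an
// MPI dynamic window on allocation and detached on deallocation, and
// MADV_REMOVE releases its physical pages while the virtual address range
// stays reserved for reuse through the freelist. Otherwise, the entire local
// region is registered once at window creation (see
// remotable_resource::create_win() below).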

class block_resource final : public pmr::memory_resource {
public:
  block_resource(pmr::memory_resource* upstream_mr,
                 std::size_t block_size)
    : upstream_mr_(upstream_mr),
      block_size_(block_size) {
    ITYR_CHECK(is_pow2(block_size));
  }

  void* do_allocate(std::size_t bytes, std::size_t alignment) override {
    if (bytes >= block_size_) {
      return upstream_mr_->allocate(bytes, std::max(alignment, block_size_));
    }

    auto s = freelist_.get(bytes, alignment);
    if (!s.has_value()) {
      void* new_block = upstream_mr_->allocate(block_size_, block_size_);
      freelist_.add(reinterpret_cast<uintptr_t>(new_block), block_size_);
      s = freelist_.get(bytes, alignment);
      ITYR_CHECK(s.has_value());
    }

    return reinterpret_cast<void*>(*s);
  }

  void do_deallocate(void* p, std::size_t bytes, std::size_t alignment) override {
    if (bytes >= block_size_) {
      upstream_mr_->deallocate(p, bytes, std::max(alignment, block_size_));
      return;
    }

    freelist_.add(reinterpret_cast<uintptr_t>(p), bytes);

    // TODO: return allocated blocks to upstream
  }

  bool do_is_equal(const pmr::memory_resource& other) const noexcept override {
    return this == &other;
  }

private:
  pmr::memory_resource* upstream_mr_;
  std::size_t block_size_;
  freelist freelist_;
};

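// block_resource amortizes upstream allocations: requests smaller than
// block_size_ are carved out of block_size_-sized chunks tracked in a
// freelist, while requests of block_size_ or more bypass the freelist and go
// straight to the upstream resource.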

class remotable_resource final : public pmr::memory_resource {
public:
  remotable_resource(std::size_t local_max_size)
    : local_max_size_(calc_local_max_size(local_max_size)),
      global_max_size_(local_max_size_ * topology::n_ranks()),
      vm_(reserve_same_vm_coll(global_max_size_, local_max_size_)),
      pm_(init_pm()),
      local_base_addr_(reinterpret_cast<std::byte*>(vm_.addr()) + local_max_size_ * topology::my_rank()),
      win_(create_win()),
      win_mr_(local_base_addr_, local_max_size_, win()),
      block_mr_(&win_mr_, allocator_block_size_option::value()),
      std_pool_mr_(my_std_pool_options(), &block_mr_),
      max_unflushed_free_objs_(allocator_max_unflushed_free_objs_option::value()),
      allocated_size_(0),
      collect_threshold_(std::size_t(16) * 1024),
      collect_threshold_max_(local_max_size_ * 8 / 10) {}

  MPI_Win win() const { return win_.win(); }

  bool has(const void* p) const {
    return vm_.addr() <= p && p < reinterpret_cast<std::byte*>(vm_.addr()) + global_max_size_;
  }

  topology::rank_t get_owner(const void* p) const {
    return (reinterpret_cast<uintptr_t>(p) - reinterpret_cast<uintptr_t>(vm_.addr())) / local_max_size_;
  }

  std::size_t get_disp(const void* p) const {
    if constexpr (use_dynamic_win) {
      return reinterpret_cast<uintptr_t>(p);
    } else {
      return (reinterpret_cast<uintptr_t>(p) - reinterpret_cast<uintptr_t>(vm_.addr())) % local_max_size_;
    }
  }

  void* do_allocate(std::size_t bytes, std::size_t alignment = alignof(max_align_t)) override {
    ITYR_PROFILER_RECORD(prof_event_allocator_alloc);

    std::size_t pad_bytes = round_up_pow2(sizeof(header), alignment);
    std::size_t real_bytes = bytes + pad_bytes;

    if (allocated_size_ >= collect_threshold_) {
      // Reclaim remotely freed objects and adapt the threshold
      collect_deallocated();
      collect_threshold_ = allocated_size_ * 2;
      if (collect_threshold_ > collect_threshold_max_) {
        collect_threshold_ = (collect_threshold_max_ + allocated_size_) / 2;
      }
    }

    std::byte* p = reinterpret_cast<std::byte*>(std_pool_mr_.allocate(real_bytes, alignment));
    std::byte* ret = p + pad_bytes;

    ITYR_CHECK(ret + bytes <= p + real_bytes);
    ITYR_CHECK(p + sizeof(header) <= ret);

    header* h = new (p) header {
      .prev = allocated_list_end_, .next = nullptr,
      .size = real_bytes, .alignment = alignment, .freed = 0};
    ITYR_CHECK(allocated_list_end_->next == nullptr);
    allocated_list_end_->next = h;
    allocated_list_end_ = h;

    allocated_size_ += real_bytes;

    return ret;
  }

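  // Layout of one allocation (sketch):
  //
  //   p                          ret = p + pad_bytes
  //   |<-------- pad_bytes ----->|<-------- bytes -------->|
  //   [ header | padding         |       user data         ]
  //   |<------------------- real_bytes ------------------->|
  //
  // pad_bytes = round_up_pow2(sizeof(header), alignment), so the user
  // pointer keeps the requested alignment and the header is always
  // recoverable by subtracting pad_bytes from it.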

  void do_deallocate(void* p, std::size_t bytes, std::size_t alignment = alignof(max_align_t)) override {
    auto target_rank = get_owner(p);
    if (target_rank == topology::my_rank()) {
      local_deallocate(p, bytes, alignment);
    } else {
      remote_deallocate(p, bytes, target_rank, alignment);
    }
  }

  bool do_is_equal(const pmr::memory_resource& other) const noexcept override {
    return this == &other;
  }

  void local_deallocate(void* p, std::size_t bytes, std::size_t alignment = alignof(max_align_t)) {
    ITYR_PROFILER_RECORD(prof_event_allocator_free_local);

    ITYR_CHECK(get_owner(p) == topology::my_rank());

    std::size_t pad_bytes = round_up_pow2(sizeof(header), alignment);
    std::size_t real_bytes = bytes + pad_bytes;

    header* h = reinterpret_cast<header*>(reinterpret_cast<std::byte*>(p) - pad_bytes);
    ITYR_CHECK(h->size == real_bytes);
    ITYR_CHECK(h->alignment == alignment);
    ITYR_CHECK(h->freed == 0);

    local_deallocate_impl(h, real_bytes, alignment);
  }

  void remote_deallocate(void* p, std::size_t bytes [[maybe_unused]], int target_rank, std::size_t alignment = alignof(max_align_t)) {
    ITYR_PROFILER_RECORD(prof_event_allocator_free_remote);

    ITYR_CHECK(topology::my_rank() != target_rank);
    ITYR_CHECK(get_owner(p) == target_rank);

    static constexpr int one = 1;
    static int ret; // dummy value; passing NULL to result_addr causes segfault on some MPI
    mpi_atomic_put_nb(&one, &ret, target_rank, get_header_disp(p, alignment), win());

    static int count = 0;
    count++;
    if (count >= max_unflushed_free_objs_) {
      mpi_win_flush_all(win());
      count = 0;
    }
  }

  void collect_deallocated() {
    ITYR_PROFILER_RECORD(prof_event_allocator_collect);

    header* h = allocated_list_.next;
    while (h) {
      if (h->freed.load(std::memory_order_acquire)) {
        header* h_next = h->next;
        local_deallocate_impl(h, h->size, h->alignment);
        h = h_next;
      } else {
        h = h->next;
      }
    }
  }

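  // Remote frees are deferred: a non-owner rank atomically sets the `freed`
  // flag in the allocation header via mpi_atomic_put_nb(), batching window
  // flushes every max_unflushed_free_objs_ frees. The owner reclaims flagged
  // objects later, either when collect_deallocated() is called explicitly or
  // when do_allocate() crosses collect_threshold_.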

  bool is_locally_accessible(const void* p) const {
    return topology::is_locally_accessible(get_owner(p));
  }

  bool is_remotely_freed(void* p, std::size_t alignment = alignof(max_align_t)) {
    ITYR_CHECK(get_owner(p) == topology::my_rank());

    std::size_t pad_bytes = round_up_pow2(sizeof(header), alignment);
    header* h = reinterpret_cast<header*>(reinterpret_cast<std::byte*>(p) - pad_bytes);

    if (h->freed.load(std::memory_order_acquire)) {
      local_deallocate_impl(h, h->size, h->alignment);
      return true;
    }
    return false;
  }

  // mainly for debugging
  bool empty() {
    return allocated_list_.next == nullptr;
  }

private:
  static std::string allocator_shmem_name(int inter_rank) {
    static int count = 0;
    std::stringstream ss;
    ss << "/ityr_allocator_" << count++ << "_" << inter_rank;
    return ss.str();
  }

  std::size_t calc_local_max_size(std::size_t param) const {
    if (param == 0) {
      // Default: split a 1 TiB virtual address budget across ranks
      return (std::size_t(1) << 40) / next_pow2(topology::n_ranks());
    } else {
      return param;
    }
  }

  physical_mem init_pm() const {
    physical_mem pm;

    if (topology::intra_my_rank() == 0) {
      pm = physical_mem(allocator_shmem_name(topology::inter_my_rank()), global_max_size_, true);
    }

    mpi_barrier(topology::intra_mpicomm());

    if (topology::intra_my_rank() != 0) {
      pm = physical_mem(allocator_shmem_name(topology::inter_my_rank()), global_max_size_, false);
    }

    ITYR_CHECK(vm_.size() == global_max_size_);

    for (topology::rank_t r = 0; r < topology::intra_n_ranks(); r++) {
      auto target_rank = topology::intra2global_rank(r);
      auto offset = local_max_size_ * target_rank;
      void* begin_addr = reinterpret_cast<std::byte*>(vm_.addr()) + offset;
      pm.map_to_vm(begin_addr, local_max_size_, offset);
    }

    return pm;
  }

  mpi_win_manager<std::byte> create_win() const {
    if constexpr (use_dynamic_win) {
      return {topology::mpicomm()};
    } else {
      auto local_base_addr = reinterpret_cast<std::byte*>(vm_.addr()) + local_max_size_ * topology::my_rank();
      return {topology::mpicomm(), local_base_addr, local_max_size_};
    }
  }

  // FIXME: workaround for boost
  // Ideally: pmr::pool_options{.max_blocks_per_chunk = (std::size_t)16 * 1024 * 1024 * 1024}
  pmr::pool_options my_std_pool_options() const {
    pmr::pool_options opts;
    opts.max_blocks_per_chunk = std::size_t(16) * 1024 * 1024 * 1024;
    return opts;
  }

  struct header {
    header* prev = nullptr;
    header* next = nullptr;
    std::size_t size = 0;
    std::size_t alignment = 0;
    std::atomic<int> freed = 0;
  };

  void remove_header_from_list(header* h) {
    ITYR_CHECK(h->prev);
    h->prev->next = h->next;

    if (h->next) {
      h->next->prev = h->prev;
    } else {
      ITYR_CHECK(h == allocated_list_end_);
      allocated_list_end_ = h->prev;
    }
  }

  std::size_t get_header_disp(const void* p, std::size_t alignment) const {
    std::size_t pad_bytes = round_up_pow2(sizeof(header), alignment);
    auto h = reinterpret_cast<const header*>(reinterpret_cast<const std::byte*>(p) - pad_bytes);
    const void* flag_addr = &h->freed;

    return get_disp(flag_addr);
  }

  void local_deallocate_impl(header* h, std::size_t size, std::size_t alignment) {
    remove_header_from_list(h);
    std::destroy_at(h);
    std_pool_mr_.deallocate(h, size, alignment);

    ITYR_CHECK(allocated_size_ >= size);
    allocated_size_ -= size;
  }

  std::size_t local_max_size_;
  std::size_t global_max_size_;
  virtual_mem vm_;
  physical_mem pm_;
  void* local_base_addr_;
  mpi_win_manager<std::byte> win_;
  mpi_win_resource win_mr_;
  block_resource block_mr_;
  pmr::unsynchronized_pool_resource std_pool_mr_;
  int max_unflushed_free_objs_;
  header allocated_list_;
  header* allocated_list_end_ = &allocated_list_;
  std::size_t allocated_size_;
  std::size_t collect_threshold_;
  std::size_t collect_threshold_max_;
};

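// Usage sketch (illustrative; assumes MPI and the topology singleton are
// already initialized, as in the tests below):
//
//   remotable_resource rmr(std::size_t(16) * 1024 * 1024);
//   void* p = rmr.allocate(128);    // pmr::memory_resource interface
//   // ... expose p to other ranks, e.g., via mpi_bcast_value() ...
//   rmr.deallocate(p, 128);         // dispatches to local or remote free
//   rmr.collect_deallocated();      // owner reclaims remotely freed objects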

template <typename T>
void remote_get(const remotable_resource& rmr, T* origin_p, const T* target_p, std::size_t size) {
  if (rmr.is_locally_accessible(target_p)) {
    std::memcpy(origin_p, target_p, size * sizeof(T));
  } else {
    auto target_rank = rmr.get_owner(target_p);
    mpi_get(origin_p, size, target_rank, rmr.get_disp(target_p), rmr.win());
  }
}

template <typename T>
T remote_get_value(const remotable_resource& rmr, const T* target_p) {
  if (rmr.is_locally_accessible(target_p)) {
    return *target_p;
  } else {
    auto target_rank = rmr.get_owner(target_p);
    return mpi_get_value<T>(target_rank, rmr.get_disp(target_p), rmr.win());
  }
}

template <typename T>
void remote_put(const remotable_resource& rmr, const T* origin_p, T* target_p, std::size_t size) {
  if (rmr.is_locally_accessible(target_p)) {
    std::memcpy(target_p, origin_p, size * sizeof(T));
  } else {
    auto target_rank = rmr.get_owner(target_p);
    mpi_put(origin_p, size, target_rank, rmr.get_disp(target_p), rmr.win());
  }
}

template <typename T>
void remote_put_value(const remotable_resource& rmr, const T& val, T* target_p) {
  if (rmr.is_locally_accessible(target_p)) {
    *target_p = val;
  } else {
    auto target_rank = rmr.get_owner(target_p);
    mpi_put_value(val, target_rank, rmr.get_disp(target_p), rmr.win());
  }
}

template <typename T>
T remote_faa_value(const remotable_resource& rmr, const T& val, T* target_p) {
  auto target_rank = rmr.get_owner(target_p);
  return mpi_atomic_faa_value(val, target_rank, rmr.get_disp(target_p), rmr.win());
}

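// The remote_* helpers above choose the cheapest access path: direct memory
// access when the target address is locally accessible (same node, mapped
// via shared memory), and MPI RMA on the allocator's window otherwise.
// remote_faa_value() always goes through MPI atomics so that concurrent
// fetch-and-adds from multiple ranks remain atomic. For example:
//
//   int old = remote_faa_value(rmr, 1, counter_p);  // counter_p may be remote
//
// (counter_p is a hypothetical int* allocated from rmr.)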

// Tests
// -----------------------------------------------------------------------------

ITYR_TEST_CASE("[ityr::common::allocator] basic test") {
  runtime_options opts;
  singleton_initializer<topology::instance> topo;

  remotable_resource allocator(std::size_t(16) * 1024 * 1024);

  ITYR_SUBCASE("Local alloc/dealloc") {
    std::vector<std::size_t> sizes = {1, 2, 4, 8, 16, 32, 100, 200, 1000, 100000, 1000000};
    constexpr int N = 10;
    for (auto size : sizes) {
      void* ptrs[N];
      for (int i = 0; i < N; i++) {
        ptrs[i] = allocator.allocate(size);
        for (std::size_t j = 0; j < size; j += 128) {
          reinterpret_cast<char*>(ptrs[i])[j] = 0;
        }
      }
      for (int i = 0; i < N; i++) {
        allocator.deallocate(ptrs[i], size);
      }
    }
  }

  ITYR_SUBCASE("Remote access") {
    std::size_t size = 128;
    void* p = allocator.allocate(size);

    for (std::size_t i = 0; i < size; i++) {
      reinterpret_cast<uint8_t*>(p)[i] = topology::my_rank();
    }

    std::vector<void*> addrs(topology::n_ranks());
    addrs[topology::my_rank()] = p;

    // GET
    for (int target_rank = 0; target_rank < topology::n_ranks(); target_rank++) {
      addrs[target_rank] = mpi_bcast_value(addrs[target_rank], target_rank, topology::mpicomm());
      if (topology::my_rank() != target_rank) {
        std::vector<uint8_t> buf(size);
        mpi_get_nb(buf.data(), size, target_rank, allocator.get_disp(addrs[target_rank]), allocator.win());
        mpi_win_flush(target_rank, allocator.win());

        for (std::size_t i = 0; i < size; i++) {
          ITYR_CHECK(buf[i] == target_rank);
        }
      }
    }

    // PUT
    std::vector<uint8_t> buf(size);
    for (std::size_t i = 0; i < size; i++) {
      buf[i] = topology::my_rank();
    }

    int target_rank = (topology::my_rank() + 1) % topology::n_ranks();
    mpi_put_nb(buf.data(), size, target_rank, allocator.get_disp(addrs[target_rank]), allocator.win());
    mpi_win_flush_all(allocator.win());

    mpi_barrier(topology::mpicomm());

    for (std::size_t i = 0; i < size; i++) {
      ITYR_CHECK(reinterpret_cast<uint8_t*>(p)[i] == (topology::n_ranks() + topology::my_rank() - 1) % topology::n_ranks());
    }

    ITYR_SUBCASE("Local free") {
      allocator.deallocate(p, size);
    }

    if (topology::n_ranks() > 1) {
      ITYR_SUBCASE("Remote free") {
        ITYR_CHECK(!allocator.empty());

        int target_rank = (topology::my_rank() + 1) % topology::n_ranks();
        allocator.remote_deallocate(addrs[target_rank], size, target_rank);

        mpi_win_flush_all(allocator.win());
        mpi_barrier(topology::mpicomm());

        allocator.collect_deallocated();
      }
    }

    ITYR_CHECK(allocator.empty());
  }
}

}