Itoyori v0.0.1
core.hpp
1 #pragma once
2 
3 #include <optional>
4 #include <algorithm>
5 
6 #include "ityr/common/util.hpp"
9 #include "ityr/common/logger.hpp"
10 #include "ityr/common/rma.hpp"
12 #include "ityr/ori/util.hpp"
13 #include "ityr/ori/options.hpp"
14 #include "ityr/ori/prof_events.hpp"
15 #include "ityr/ori/coll_mem.hpp"
17 #include "ityr/ori/noncoll_mem.hpp"
20 
21 namespace ityr::ori::core {
22 
23 template <block_size_t BlockSize, typename Fn>
24 void for_each_block(void* addr, std::size_t size, Fn fn) {
25  std::byte* blk_addr_b = common::round_down_pow2(reinterpret_cast<std::byte*>(addr), BlockSize);
26  std::byte* blk_addr_e = reinterpret_cast<std::byte*>(addr) + size;
27 
28  for (std::byte* blk_addr = blk_addr_b; blk_addr < blk_addr_e; blk_addr += BlockSize) {
29  std::byte* req_addr_b = std::max(reinterpret_cast<std::byte*>(addr), blk_addr);
30  std::byte* req_addr_e = std::min(reinterpret_cast<std::byte*>(addr) + size, blk_addr + BlockSize);
31  fn(blk_addr, req_addr_b, req_addr_e);
32  }
33 }
34 
35 template <block_size_t BlockSize, typename HomeSegFn, typename CacheBlkFn>
36 void for_each_seg_blk(const coll_mem& cm, void* addr, std::size_t size,
37  HomeSegFn home_seg_fn, CacheBlkFn cache_blk_fn) {
38  for_each_mem_segment(cm, addr, size, [&](const auto& seg) {
39  std::byte* seg_addr = reinterpret_cast<std::byte*>(cm.vm().addr()) + seg.offset_b;
40  std::size_t seg_size = seg.offset_e - seg.offset_b;
41 
42  if (seg.owner == common::topology::inter_my_rank()) {
43  // no need to iterate over memory blocks (of BlockSize) for home segments
44  home_seg_fn(seg_addr, seg_size, seg.pm_offset);
45 
46  } else {
47  // iterate over memory blocks within the memory segment for cache blocks
48  std::byte* addr_b = std::max(seg_addr, reinterpret_cast<std::byte*>(addr));
49  std::byte* addr_e = std::min(seg_addr + seg_size, reinterpret_cast<std::byte*>(addr) + size);
50  for_each_block<BlockSize>(addr_b, addr_e - addr_b, [&](std::byte* blk_addr,
51  std::byte* req_addr_b,
52  std::byte* req_addr_e) {
53  std::size_t pm_offset = seg.pm_offset + (blk_addr - seg_addr);
54  ITYR_CHECK(pm_offset + BlockSize <= cm.mem_mapper().local_size(seg.owner));
55  cache_blk_fn(blk_addr, req_addr_b, req_addr_e, seg.owner, pm_offset);
56  });
57  }
58  });
59 }
60 
61 template <block_size_t BlockSize>
62 class core_default {
63  static constexpr bool enable_vm_map = ITYR_ORI_ENABLE_VM_MAP;
64 
65 public:
66  core_default(std::size_t cache_size, std::size_t sub_block_size)
67  : noncoll_mem_(noncoll_allocator_size_option::value(), BlockSize),
68  home_manager_(calc_home_mmap_limit(cache_size / BlockSize)),
69  cache_manager_(cache_size, sub_block_size) {}
70 
71  static constexpr block_size_t block_size = BlockSize;
72 
73  void* malloc_coll(std::size_t size) { return malloc_coll<default_mem_mapper>(size); }
74 
75  template <template <block_size_t> typename MemMapper, typename... MemMapperArgs>
76  void* malloc_coll(std::size_t size, MemMapperArgs&&... mmargs) {
77  ITYR_REQUIRE_MESSAGE(size > 0, "Memory allocation size cannot be 0");
79  "The size passed to malloc_coll() is different among workers");
80 
81  auto mmapper = std::make_unique<MemMapper<BlockSize>>(size,
84  std::forward<MemMapperArgs>(mmargs)...);
85  coll_mem& cm = cm_manager_.create(size, std::move(mmapper));
86  void* addr = cm.vm().addr();
87 
88  common::verbose("Allocate collective memory [%p, %p) (%ld bytes) (win=%p)",
89  addr, reinterpret_cast<std::byte*>(addr) + size, size, &cm.win());
90 
91  return addr;
92  }
93 
94  void* malloc(std::size_t size) {
95  ITYR_CHECK_MESSAGE(size > 0, "Memory allocation size cannot be 0");
96 
97  void* addr = noncoll_mem_.allocate(size);
98 
99  common::verbose<2>("Allocate noncollective memory [%p, %p) (%ld bytes)",
100  addr, reinterpret_cast<std::byte*>(addr) + size, size);
101 
102  return addr;
103  }
104 
105  void free_coll(void* addr) {
106  ITYR_REQUIRE_MESSAGE(addr, "Null pointer was passed to free()");
108  "The address passed to free_coll() is different among workers");
109 
110  // ensure free safety
111  cache_manager_.ensure_all_cache_clean();
112 
113  coll_mem& cm = cm_manager_.get(addr);
114  ITYR_CHECK(addr == cm.vm().addr());
115 
116  // ensure all cache entries are evicted
117  for (std::size_t o = 0; o < cm.effective_size(); o += BlockSize) {
118  std::byte* addr = reinterpret_cast<std::byte*>(cm.vm().addr()) + o;
119  home_manager_.ensure_evicted(addr);
120  cache_manager_.ensure_evicted(addr);
121  }
122 
123  home_manager_.clear_tlb();
124  cache_manager_.clear_tlb();
125 
126  common::verbose("Deallocate collective memory [%p, %p) (%ld bytes) (win=%p)",
127  addr, reinterpret_cast<std::byte*>(addr) + cm.size(), cm.size(), &cm.win());
128 
129  cm_manager_.destroy(cm);
130  }
131 
132  // TODO: remove size from parameters
133  void free(void* addr, std::size_t size) {
134  ITYR_CHECK_MESSAGE(addr, "Null pointer was passed to free()");
135  ITYR_CHECK(noncoll_mem_.has(addr));
136 
137  common::topology::rank_t target_rank = noncoll_mem_.get_owner(addr);
138 
139  if (target_rank == common::topology::my_rank()) {
140  noncoll_mem_.local_deallocate(addr, size);
141 
142  common::verbose<2>("Deallocate noncollective memory [%p, %p) (%ld bytes) locally",
143  addr, reinterpret_cast<std::byte*>(addr) + size, size);
144 
145  } else {
146  // ensure dirty data of this memory object are discarded
147  for_each_block<BlockSize>(addr, size, [&](std::byte* blk_addr,
148  std::byte* req_addr_b,
149  std::byte* req_addr_e) {
150  cache_manager_.discard_dirty(blk_addr, req_addr_b, req_addr_e);
151  });
152 
153  noncoll_mem_.remote_deallocate(addr, size, target_rank);
154 
155  common::verbose<2>("Deallocate noncollective memory [%p, %p) (%ld bytes) remotely (rank=%d)",
156  addr, reinterpret_cast<std::byte*>(addr) + size, size, target_rank);
157  }
158  }
159 
160  void get(const void* from_addr, void* to_addr, std::size_t size) {
162 
163  std::byte* from_addr_ = reinterpret_cast<std::byte*>(const_cast<void*>(from_addr));
164 
165  // TODO: support get/put for data larger than the cache size
166  if (common::round_down_pow2(from_addr_, BlockSize) ==
167  common::round_down_pow2(from_addr_ + size, BlockSize)) {
 168  // if the size is sufficiently small, it is safe to skip incrementing the reference count for cache blocks
169  if (!checkout_impl_nb<mode::read_t, false>(from_addr_, size)) {
170  checkout_complete_impl();
171  }
172  get_copy_impl(from_addr_, reinterpret_cast<std::byte*>(to_addr), size);
173  } else {
174  if (!checkout_impl_nb<mode::read_t, true>(from_addr_, size)) {
175  checkout_complete_impl();
176  }
177  get_copy_impl(from_addr_, reinterpret_cast<std::byte*>(to_addr), size);
178  checkin_impl<mode::read_t, true>(from_addr_, size);
179  }
180  }
181 
182  void put(const void* from_addr, void* to_addr, std::size_t size) {
184 
185  std::byte* to_addr_ = reinterpret_cast<std::byte*>(to_addr);
186 
187  if (common::round_down_pow2(to_addr_, BlockSize) ==
188  common::round_down_pow2(to_addr_ + size, BlockSize)) {
 189  // if the size is sufficiently small, it is safe to skip incrementing the reference count for cache blocks
190  if (!checkout_impl_nb<mode::write_t, false>(to_addr_, size)) {
191  checkout_complete_impl();
192  }
193  put_copy_impl(reinterpret_cast<const std::byte*>(from_addr), to_addr_, size);
194  checkin_impl<mode::write_t, false>(to_addr_, size);
195  } else {
196  if (!checkout_impl_nb<mode::write_t, true>(to_addr_, size)) {
197  checkout_complete_impl();
198  }
199  put_copy_impl(reinterpret_cast<const std::byte*>(from_addr), to_addr_, size);
200  checkin_impl<mode::write_t, true>(to_addr_, size);
201  }
202  }
203 
204  template <typename Mode>
205  bool checkout_nb(void* addr, std::size_t size, Mode) {
206  if constexpr (!enable_vm_map) {
207  common::die("ITYR_ORI_ENABLE_VM_MAP must be true for core::checkout/checkin");
208  }
209 
211  common::verbose<2>("Checkout request (mode: %s) for [%p, %p) (%ld bytes)",
212  str(Mode{}).c_str(), addr, reinterpret_cast<std::byte*>(addr) + size, size);
213 
214  ITYR_CHECK(addr);
215  ITYR_CHECK(size > 0);
216 
217  return checkout_impl_nb<Mode, true>(reinterpret_cast<std::byte*>(addr), size);
218  }
219 
220  template <typename Mode>
221  void checkout(void* addr, std::size_t size, Mode mode) {
 222  if (!checkout_nb(addr, size, mode)) {
 223  checkout_complete();
224  }
225  }
 226 
 227  void checkout_complete() {
229  checkout_complete_impl();
230  }
231 
232  template <typename Mode>
233  void checkin(void* addr, std::size_t size, Mode) {
234  if constexpr (!enable_vm_map) {
235  common::die("ITYR_ORI_ENABLE_VM_MAP must be true for core::checkout/checkin");
236  }
237 
239  common::verbose<2>("Checkin request (mode: %s) for [%p, %p) (%ld bytes)",
240  str(Mode{}).c_str(), addr, reinterpret_cast<std::byte*>(addr) + size, size);
241 
242  ITYR_CHECK(addr);
243  ITYR_CHECK(size > 0);
244 
245  checkin_impl<Mode, true>(reinterpret_cast<std::byte*>(addr), size);
246  }
247 
248  void release() {
249  common::verbose("Release fence begin");
250 
251  cache_manager_.release();
252 
253  common::verbose("Release fence end");
254  }
255 
 256  using release_handler = typename cache_manager<BlockSize>::release_handler;
 257 
 258  release_handler release_lazy() {
259  common::verbose<2>("Lazy release handler is created");
260 
261  return cache_manager_.release_lazy();
262  }
263 
264  void acquire() {
265  common::verbose("Acquire fence begin");
266 
267  cache_manager_.acquire();
268 
269  common::verbose("Acquire fence end");
270  }
 271 
 272  void acquire(release_handler rh) {
273  common::verbose("Acquire fence (lazy) begin");
274 
275  cache_manager_.acquire(rh);
276 
277  common::verbose("Acquire fence (lazy) end");
278  }
279 
280  void set_readonly_coll(void* addr, std::size_t size) {
281  release();
283 
284  cache_manager_.set_readonly(addr, size);
285 
287  }
288 
289  void unset_readonly_coll(void* addr, std::size_t size) {
291 
292  cache_manager_.unset_readonly(addr, size);
293 
295  }
296 
297  void poll() {
298  cache_manager_.poll();
299  }
 300 
 301  void collect_deallocated() {
302  noncoll_mem_.collect_deallocated();
303  }
 304 
 305  void cache_prof_begin() {
306  home_manager_.home_prof_begin();
307  cache_manager_.cache_prof_begin();
308  }
309 
310  void cache_prof_end() {
311  home_manager_.home_prof_end();
312  cache_manager_.cache_prof_end();
313  }
314 
315  void cache_prof_print() const {
316  home_manager_.home_prof_print();
317  cache_manager_.cache_prof_print();
318  }
319 
320  /* APIs for debugging */
321 
322  void* get_local_mem(void* addr) {
323  coll_mem& cm = cm_manager_.get(addr);
324  return cm.home_vm().addr();
325  }
326 
327 private:
328  std::size_t calc_home_mmap_limit(std::size_t n_cache_blocks) const {
329  std::size_t sys_limit = sys_mmap_entry_limit();
330  std::size_t margin = 1000;
331  ITYR_CHECK(sys_limit > 2 * n_cache_blocks + margin);
332 
333  std::size_t candidate = (sys_limit - 2 * n_cache_blocks - margin) / 2;
 334  std::size_t max_val = 1024 * 1024; // some systems may have an excessively large vm.max_map_count value
335 
336  return std::min(max_val, candidate);
337  }
338 
339  template <typename Mode, bool IncrementRef>
340  bool checkout_impl_nb(std::byte* addr, std::size_t size) {
341  constexpr bool skip_fetch = std::is_same_v<Mode, mode::write_t>;
342  if (noncoll_mem_.has(addr)) {
343  return checkout_noncoll_nb<skip_fetch, IncrementRef>(addr, size);
344  } else {
345  return checkout_coll_nb<skip_fetch, IncrementRef>(addr, size);
346  }
347  }
348 
349  template <bool SkipFetch, bool IncrementRef>
350  bool checkout_coll_nb(std::byte* addr, std::size_t size) {
351  if (home_manager_.template checkout_fast<IncrementRef>(addr, size)) {
352  return true;
353  }
354 
355  auto [entry_found, fetch_completed] =
356  cache_manager_.template checkout_fast<SkipFetch, IncrementRef>(addr, size);
357  if (entry_found) {
358  return fetch_completed;
359  }
360 
361  coll_mem& cm = cm_manager_.get(addr);
362 
363  bool checkout_completed = true;
364 
365  for_each_seg_blk<BlockSize>(cm, addr, size,
366  // home segment
367  [&](std::byte* seg_addr, std::size_t seg_size, std::size_t pm_offset) {
368  checkout_completed &=
369  home_manager_.template checkout_seg<IncrementRef>(
370  seg_addr, seg_size, addr, size,
371  cm.home_pm(), pm_offset, cm.home_all_mapped());
372  },
373  // cache block
374  [&](std::byte* blk_addr, std::byte* req_addr_b, std::byte* req_addr_e,
375  common::topology::rank_t owner, std::size_t pm_offset) {
376  checkout_completed &=
377  cache_manager_.template checkout_blk<SkipFetch, IncrementRef>(
378  blk_addr, req_addr_b, req_addr_e,
379  cm.win(), common::topology::inter2global_rank(owner), pm_offset);
380  });
381 
382  return checkout_completed;
383  }
384 
385  template <bool SkipFetch, bool IncrementRef>
386  bool checkout_noncoll_nb(std::byte* addr, std::size_t size) {
387  ITYR_CHECK(noncoll_mem_.has(addr));
388 
389  auto target_rank = noncoll_mem_.get_owner(addr);
390  ITYR_CHECK(0 <= target_rank);
391  ITYR_CHECK(target_rank < common::topology::n_ranks());
392 
393  if (common::topology::is_locally_accessible(target_rank)) {
394  // There is no need to manage mmap entries for home blocks because
 395  // the remotable allocator employs a block distribution policy.
396  home_manager_.on_checkout_noncoll(size);
397  return true;
398  }
399 
400  auto [entry_found, fetch_completed] =
401  cache_manager_.template checkout_fast<SkipFetch, IncrementRef>(addr, size);
402  if (entry_found) {
403  return fetch_completed;
404  }
405 
406  bool checkout_completed = true;
407 
408  for_each_block<BlockSize>(addr, size, [&](std::byte* blk_addr,
409  std::byte* req_addr_b,
410  std::byte* req_addr_e) {
411  checkout_completed &=
412  cache_manager_.template checkout_blk<SkipFetch, IncrementRef>(
413  blk_addr, req_addr_b, req_addr_e,
414  noncoll_mem_.win(),
415  target_rank,
416  noncoll_mem_.get_disp(blk_addr));
417  });
418 
419  return checkout_completed;
420  }
421 
422  template <typename Mode, bool DecrementRef>
423  void checkin_impl(std::byte* addr, std::size_t size) {
424  constexpr bool register_dirty = !std::is_same_v<Mode, mode::read_t>;
425  if (noncoll_mem_.has(addr)) {
426  checkin_noncoll<register_dirty, DecrementRef>(addr, size);
427  } else {
428  checkin_coll<register_dirty, DecrementRef>(addr, size);
429  }
430  }
431 
432  void checkout_complete_impl() {
433  home_manager_.checkout_complete();
434  cache_manager_.checkout_complete();
435  }
436 
437  template <bool RegisterDirty, bool DecrementRef>
438  void checkin_coll(std::byte* addr, std::size_t size) {
439  if (home_manager_.template checkin_fast<DecrementRef>(addr, size)) {
440  return;
441  }
442 
443  if (cache_manager_.template checkin_fast<RegisterDirty, DecrementRef>(addr, size)) {
444  return;
445  }
446 
447  coll_mem& cm = cm_manager_.get(addr);
448 
449  for_each_seg_blk<BlockSize>(cm, addr, size,
450  // home segment
451  [&](std::byte* seg_addr, std::size_t, std::size_t) {
452  home_manager_.template checkin_seg<DecrementRef>(seg_addr, cm.home_all_mapped());
453  },
454  // cache block
455  [&](std::byte* blk_addr, std::byte* req_addr_b, std::byte* req_addr_e,
456  common::topology::rank_t, std::size_t) {
457  cache_manager_.template checkin_blk<RegisterDirty, DecrementRef>(
458  blk_addr, req_addr_b, req_addr_e);
459  });
460  }
461 
462  template <bool RegisterDirty, bool DecrementRef>
463  void checkin_noncoll(std::byte* addr, std::size_t size) {
464  ITYR_CHECK(noncoll_mem_.has(addr));
465 
466  auto target_rank = noncoll_mem_.get_owner(addr);
467  ITYR_CHECK(0 <= target_rank);
468  ITYR_CHECK(target_rank < common::topology::n_ranks());
469 
470  if (common::topology::is_locally_accessible(target_rank)) {
471  // There is no need to manage mmap entries for home blocks because
 472  // the remotable allocator employs a block distribution policy.
473  return;
474  }
475 
476  if (cache_manager_.template checkin_fast<RegisterDirty, DecrementRef>(addr, size)) {
477  return;
478  }
479 
480  for_each_block<BlockSize>(addr, size, [&](std::byte* blk_addr,
481  std::byte* req_addr_b,
482  std::byte* req_addr_e) {
483  cache_manager_.template checkin_blk<RegisterDirty, DecrementRef>(
484  blk_addr, req_addr_b, req_addr_e);
485  });
486  }
487 
488  /*
489  * The following get/put_copy_* functions are mainly for performance evaluation for cases
490  * in which virtual memory mapping is not used and instead the data are always copied between
491  * the cache region and the user buffer via GET/PUT calls. Thus, checkout/checkin cannot be
492  * used when enable_vm_map is false.
493  */
494 
495  void get_copy_impl(std::byte* from_addr, std::byte* to_addr, std::size_t size) {
496  if constexpr (enable_vm_map) {
497  std::memcpy(to_addr, from_addr, size);
498  } else if (noncoll_mem_.has(from_addr)) {
499  get_copy_noncoll(from_addr, to_addr, size);
500  } else {
501  get_copy_coll(from_addr, to_addr, size);
502  }
503  }
504 
505  void get_copy_coll(std::byte* from_addr, std::byte* to_addr, std::size_t size) {
506  ITYR_CHECK(!enable_vm_map);
507 
508  coll_mem& cm = cm_manager_.get(from_addr);
509 
510  for_each_seg_blk<BlockSize>(cm, from_addr, size,
511  // home segment
512  [&](std::byte* seg_addr, std::size_t seg_size, std::size_t pm_offset) {
513  const common::virtual_mem& vm = cm.home_vm();
514  std::byte* seg_addr_b = std::max(from_addr, seg_addr);
515  std::byte* seg_addr_e = std::min(seg_addr + seg_size, from_addr + size);
516  std::size_t seg_offset = seg_addr_b - seg_addr;
517  std::byte* from_addr_ = reinterpret_cast<std::byte*>(vm.addr()) + pm_offset + seg_offset;
518  std::byte* to_addr_ = to_addr + (seg_addr_b - from_addr);
519  std::memcpy(to_addr_, from_addr_, seg_addr_e - seg_addr_b);
520  },
521  // cache block
522  [&](std::byte* blk_addr, std::byte* req_addr_b, std::byte* req_addr_e,
523  common::topology::rank_t, std::size_t) {
524  cache_manager_.get_copy_blk(blk_addr, req_addr_b, req_addr_e, to_addr + (req_addr_b - from_addr));
525  });
526  }
527 
528  void get_copy_noncoll(std::byte* from_addr, std::byte* to_addr, std::size_t size) {
529  ITYR_CHECK(!enable_vm_map);
530 
531  ITYR_CHECK(noncoll_mem_.has(from_addr));
532 
533  auto target_rank = noncoll_mem_.get_owner(from_addr);
534  ITYR_CHECK(0 <= target_rank);
535  ITYR_CHECK(target_rank < common::topology::n_ranks());
536 
537  if (common::topology::is_locally_accessible(target_rank)) {
538  std::memcpy(to_addr, from_addr, size);
539  return;
540  }
541 
542  for_each_block<BlockSize>(from_addr, size, [&](std::byte* blk_addr,
543  std::byte* req_addr_b,
544  std::byte* req_addr_e) {
545  cache_manager_.get_copy_blk(blk_addr, req_addr_b, req_addr_e, to_addr + (req_addr_b - from_addr));
546  });
547  }
548 
549  void put_copy_impl(const std::byte* from_addr, std::byte* to_addr, std::size_t size) {
550  if constexpr (enable_vm_map) {
551  std::memcpy(to_addr, from_addr, size);
552  } else if (noncoll_mem_.has(to_addr)) {
553  put_copy_noncoll(from_addr, to_addr, size);
554  } else {
555  put_copy_coll(from_addr, to_addr, size);
556  }
557  }
558 
559  void put_copy_coll(const std::byte* from_addr, std::byte* to_addr, std::size_t size) {
560  ITYR_CHECK(!enable_vm_map);
561 
562  coll_mem& cm = cm_manager_.get(to_addr);
563 
564  for_each_seg_blk<BlockSize>(cm, to_addr, size,
565  // home segment
566  [&](std::byte* seg_addr, std::size_t seg_size, std::size_t pm_offset) {
567  const common::virtual_mem& vm = cm.home_vm();
568  std::byte* seg_addr_b = std::max(to_addr, seg_addr);
569  std::byte* seg_addr_e = std::min(seg_addr + seg_size, to_addr + size);
570  std::size_t seg_offset = seg_addr_b - seg_addr;
571  const std::byte* from_addr_ = from_addr + (seg_addr_b - to_addr);
572  std::byte* to_addr_ = reinterpret_cast<std::byte*>(vm.addr()) + pm_offset + seg_offset;
573  std::memcpy(to_addr_, from_addr_, seg_addr_e - seg_addr_b);
574  },
575  // cache block
576  [&](std::byte* blk_addr, std::byte* req_addr_b, std::byte* req_addr_e,
577  common::topology::rank_t, std::size_t) {
578  cache_manager_.put_copy_blk(blk_addr, req_addr_b, req_addr_e, from_addr + (req_addr_b - to_addr));
579  });
580  }
581 
582  void put_copy_noncoll(const std::byte* from_addr, std::byte* to_addr, std::size_t size) {
583  ITYR_CHECK(!enable_vm_map);
584 
585  ITYR_CHECK(noncoll_mem_.has(to_addr));
586 
587  auto target_rank = noncoll_mem_.get_owner(to_addr);
588  ITYR_CHECK(0 <= target_rank);
589  ITYR_CHECK(target_rank < common::topology::n_ranks());
590 
591  if (common::topology::is_locally_accessible(target_rank)) {
592  std::memcpy(to_addr, from_addr, size);
593  return;
594  }
595 
596  for_each_block<BlockSize>(to_addr, size, [&](std::byte* blk_addr,
597  std::byte* req_addr_b,
598  std::byte* req_addr_e) {
599  cache_manager_.put_copy_blk(blk_addr, req_addr_b, req_addr_e, from_addr + (req_addr_b - to_addr));
600  });
601  }
602 
603  template <block_size_t BS>
604  using default_mem_mapper = mem_mapper::ITYR_ORI_DEFAULT_MEM_MAPPER<BS>;
605 
606  coll_mem_manager cm_manager_;
607  noncoll_mem noncoll_mem_;
608  home_manager<BlockSize> home_manager_;
609  cache_manager<BlockSize> cache_manager_;
610 };
611 
 612 template <block_size_t BlockSize>
 613 class core_nocache {
614 public:
615  core_nocache(std::size_t, std::size_t)
616  : noncoll_mem_(noncoll_allocator_size_option::value(), BlockSize) {}
617 
618  static constexpr block_size_t block_size = BlockSize;
619 
620  void* malloc_coll(std::size_t size) { return malloc_coll<default_mem_mapper>(size); }
621 
622  template <template <block_size_t> typename MemMapper, typename... MemMapperArgs>
623  void* malloc_coll(std::size_t size, MemMapperArgs&&... mmargs) {
624  if (size == 0) {
625  common::die("Memory allocation size cannot be 0");
626  }
627 
628  auto mmapper = std::make_unique<MemMapper<BlockSize>>(size,
631  std::forward<MemMapperArgs>(mmargs)...);
632  coll_mem& cm = cm_manager_.create(size, std::move(mmapper));
633  void* addr = cm.vm().addr();
634 
635  common::verbose("Allocate collective memory [%p, %p) (%ld bytes) (win=%p)",
636  addr, reinterpret_cast<std::byte*>(addr) + size, size, &cm.win());
637 
638  return addr;
639  }
640 
641  void* malloc(std::size_t size) {
642  ITYR_CHECK_MESSAGE(size > 0, "Memory allocation size cannot be 0");
643 
644  void* addr = noncoll_mem_.allocate(size);
645 
646  common::verbose<2>("Allocate noncollective memory [%p, %p) (%ld bytes)",
647  addr, reinterpret_cast<std::byte*>(addr) + size, size);
648 
649  return addr;
650  }
651 
652  void free_coll(void* addr) {
653  if (!addr) {
654  common::die("Null pointer was passed to free_coll()");
655  }
656 
657  coll_mem& cm = cm_manager_.get(addr);
658  ITYR_CHECK(addr == cm.vm().addr());
659 
660  common::verbose("Deallocate collective memory [%p, %p) (%ld bytes) (win=%p)",
661  addr, reinterpret_cast<std::byte*>(addr) + cm.size(), cm.size(), &cm.win());
662 
663  cm_manager_.destroy(cm);
664  }
665 
666  void free(void* addr, std::size_t size) {
667  ITYR_CHECK_MESSAGE(addr, "Null pointer was passed to free()");
668  ITYR_CHECK(noncoll_mem_.has(addr));
669 
670  common::topology::rank_t target_rank = noncoll_mem_.get_owner(addr);
671 
672  if (target_rank == common::topology::my_rank()) {
673  noncoll_mem_.local_deallocate(addr, size);
674 
675  common::verbose<2>("Deallocate noncollective memory [%p, %p) (%ld bytes) locally",
676  addr, reinterpret_cast<std::byte*>(addr) + size, size);
677 
678  } else {
679  noncoll_mem_.remote_deallocate(addr, size, target_rank);
680 
681  common::verbose<2>("Deallocate noncollective memory [%p, %p) (%ld bytes) remotely (rank=%d)",
682  addr, reinterpret_cast<std::byte*>(addr) + size, size, target_rank);
683  }
684  }
685 
686  void get(const void* from_addr, void* to_addr, std::size_t size) {
688 
689  std::byte* from_addr_ = reinterpret_cast<std::byte*>(const_cast<void*>(from_addr));
690  get_impl(from_addr_, reinterpret_cast<std::byte*>(to_addr), size);
691  }
692 
693  void put(const void* from_addr, void* to_addr, std::size_t size) {
695 
696  std::byte* to_addr_ = reinterpret_cast<std::byte*>(to_addr);
697  put_impl(reinterpret_cast<const std::byte*>(from_addr), to_addr_, size);
698  }
699 
700  template <typename Mode>
701  bool checkout_nb(void*, std::size_t, Mode) {
702  common::die("core::checkout/checkin is disabled");
703  }
704 
705  template <typename Mode>
706  void checkout(void*, std::size_t, Mode) {
707  common::die("core::checkout/checkin is disabled");
708  }
 709 
 710  void checkout_complete() {
711  common::die("core::checkout/checkin is disabled");
712  }
713 
714  template <typename Mode>
715  void checkin(void*, std::size_t, Mode) {
716  common::die("core::checkout/checkin is disabled");
717  }
718 
719  void release() {}
720 
721  using release_handler = void*;
722 
723  release_handler release_lazy() { return {}; }
724 
725  void acquire() {}
 726 
 727  void acquire(release_handler) {}
728 
729  void set_readonly_coll(void*, std::size_t) {}
730  void unset_readonly_coll(void*, std::size_t) {}
731 
732  void poll() {}
 733 
 734  void collect_deallocated() {
735  noncoll_mem_.collect_deallocated();
736  }
 737 
 738  void cache_prof_begin() {}
739  void cache_prof_end() {}
740  void cache_prof_print() const {}
741 
742  /* APIs for debugging */
743 
744  void* get_local_mem(void* addr) {
745  coll_mem& cm = cm_manager_.get(addr);
746  return cm.home_vm().addr();
747  }
748 
749 private:
750  void get_impl(std::byte* from_addr, std::byte* to_addr, std::size_t size) {
751  if (noncoll_mem_.has(from_addr)) {
752  get_noncoll(from_addr, to_addr, size);
753  } else {
754  get_coll(from_addr, to_addr, size);
755  }
756  }
757 
758  void get_coll(std::byte* from_addr, std::byte* to_addr, std::size_t size) {
759  coll_mem& cm = cm_manager_.get(from_addr);
760 
761  bool fetching = false;
762 
763  for_each_seg_blk<BlockSize>(cm, from_addr, size,
764  // home segment
765  [&](std::byte* seg_addr, std::size_t seg_size, std::size_t pm_offset) {
766  const common::virtual_mem& vm = cm.home_vm();
767  std::byte* seg_addr_b = std::max(from_addr, seg_addr);
768  std::byte* seg_addr_e = std::min(seg_addr + seg_size, from_addr + size);
769  std::size_t seg_offset = seg_addr_b - seg_addr;
770  std::byte* from_addr_ = reinterpret_cast<std::byte*>(vm.addr()) + pm_offset + seg_offset;
771  std::byte* to_addr_ = to_addr + (seg_addr_b - from_addr);
772  std::memcpy(to_addr_, from_addr_, seg_addr_e - seg_addr_b);
773  },
774  // cache block
775  [&](std::byte* blk_addr, std::byte* req_addr_b, std::byte* req_addr_e,
776  common::topology::rank_t owner, std::size_t pm_offset) {
777  common::rma::get_nb(to_addr + (req_addr_b - from_addr), req_addr_e - req_addr_b,
778  cm.win(), common::topology::inter2global_rank(owner),
779  pm_offset + (req_addr_b - blk_addr));
780  fetching = true;
781  });
782 
783  if (fetching) {
784  common::rma::flush(cm.win());
785  }
786  }
787 
788  void get_noncoll(std::byte* from_addr, std::byte* to_addr, std::size_t size) {
789  ITYR_CHECK(noncoll_mem_.has(from_addr));
790 
791  auto target_rank = noncoll_mem_.get_owner(from_addr);
792  ITYR_CHECK(0 <= target_rank);
793  ITYR_CHECK(target_rank < common::topology::n_ranks());
794 
795  if (common::topology::is_locally_accessible(target_rank)) {
796  std::memcpy(to_addr, from_addr, size);
797  return;
798  }
799 
800  for_each_block<BlockSize>(from_addr, size, [&](std::byte* blk_addr,
801  std::byte* req_addr_b,
802  std::byte* req_addr_e) {
803  common::rma::get_nb(to_addr + (req_addr_b - from_addr), req_addr_e - req_addr_b,
804  noncoll_mem_.win(), target_rank,
805  noncoll_mem_.get_disp(blk_addr) + (req_addr_b - blk_addr));
806  });
807 
808  common::rma::flush(noncoll_mem_.win());
809  }
810 
811  void put_impl(const std::byte* from_addr, std::byte* to_addr, std::size_t size) {
812  if (noncoll_mem_.has(to_addr)) {
813  put_noncoll(from_addr, to_addr, size);
814  } else {
815  put_coll(from_addr, to_addr, size);
816  }
817  }
818 
819  void put_coll(const std::byte* from_addr, std::byte* to_addr, std::size_t size) {
820  coll_mem& cm = cm_manager_.get(to_addr);
821 
822  bool putting = false;
823 
824  for_each_seg_blk<BlockSize>(cm, to_addr, size,
825  // home segment
826  [&](std::byte* seg_addr, std::size_t seg_size, std::size_t pm_offset) {
827  const common::virtual_mem& vm = cm.home_vm();
828  std::byte* seg_addr_b = std::max(to_addr, seg_addr);
829  std::byte* seg_addr_e = std::min(seg_addr + seg_size, to_addr + size);
830  std::size_t seg_offset = seg_addr_b - seg_addr;
831  const std::byte* from_addr_ = from_addr + (seg_addr_b - to_addr);
832  std::byte* to_addr_ = reinterpret_cast<std::byte*>(vm.addr()) + pm_offset + seg_offset;
833  std::memcpy(to_addr_, from_addr_, seg_addr_e - seg_addr_b);
834  },
835  // cache block
836  [&](std::byte* blk_addr, std::byte* req_addr_b, std::byte* req_addr_e,
837  common::topology::rank_t owner, std::size_t pm_offset) {
838  common::rma::put_nb(from_addr + (req_addr_b - to_addr), req_addr_e - req_addr_b,
839  cm.win(), common::topology::inter2global_rank(owner),
840  pm_offset + (req_addr_b - blk_addr));
841  putting = true;
842  });
843 
844  if (putting) {
845  common::rma::flush(cm.win());
846  }
847  }
848 
849  void put_noncoll(const std::byte* from_addr, std::byte* to_addr, std::size_t size) {
850  ITYR_CHECK(noncoll_mem_.has(to_addr));
851 
852  auto target_rank = noncoll_mem_.get_owner(to_addr);
853  ITYR_CHECK(0 <= target_rank);
854  ITYR_CHECK(target_rank < common::topology::n_ranks());
855 
856  if (common::topology::is_locally_accessible(target_rank)) {
857  std::memcpy(to_addr, from_addr, size);
858  return;
859  }
860 
861  for_each_block<BlockSize>(to_addr, size, [&](std::byte* blk_addr,
862  std::byte* req_addr_b,
863  std::byte* req_addr_e) {
864  common::rma::put_nb(from_addr + (req_addr_b - to_addr), req_addr_e - req_addr_b,
865  noncoll_mem_.win(), target_rank,
866  noncoll_mem_.get_disp(blk_addr) + (req_addr_b - blk_addr));
867  });
868 
869  common::rma::flush(noncoll_mem_.win());
870  }
871 
872  template <block_size_t BS>
873  using default_mem_mapper = mem_mapper::ITYR_ORI_DEFAULT_MEM_MAPPER<BS>;
874 
875  coll_mem_manager cm_manager_;
876  noncoll_mem noncoll_mem_;
877 };
878 
879 template <block_size_t BlockSize>
880 class core_serial {
881 public:
882  core_serial(std::size_t, std::size_t) {}
883 
884  static constexpr block_size_t block_size = BlockSize;
885 
886  void* malloc_coll(std::size_t size) { return std::malloc(size); }
887 
888  template <template <block_size_t> typename MemMapper, typename... MemMapperArgs>
889  void* malloc_coll(std::size_t size, MemMapperArgs&&...) {
890  return std::malloc(size);
891  }
892 
893  void* malloc(std::size_t size) {
894  return std::malloc(size);
895  }
896 
897  void free_coll(void* addr) {
898  std::free(addr);
899  }
900 
901  void free(void* addr, std::size_t) {
902  std::free(addr);
903  }
904 
905  void get(const void* from_addr, void* to_addr, std::size_t size) {
906  std::memcpy(to_addr, from_addr, size);
907  }
908 
909  void put(const void* from_addr, void* to_addr, std::size_t size) {
910  std::memcpy(to_addr, from_addr, size);
911  }
912 
913  template <typename Mode>
914  bool checkout_nb(void*, std::size_t, Mode) { return true; }
915 
916  template <typename Mode>
917  void checkout(void*, std::size_t, Mode) {}
 918 
 919  void checkout_complete() {}
920 
921  template <typename Mode>
922  void checkin(void*, std::size_t, Mode) {}
923 
924  void release() {}
925 
926  using release_handler = void*;
927 
928  release_handler release_lazy() { return {}; }
929 
930  void acquire() {}
 931 
 932  void acquire(release_handler) {}
933 
934  void set_readonly_coll(void*, std::size_t) {}
935  void unset_readonly_coll(void*, std::size_t) {}
936 
937  void poll() {}
 938 
 939  void collect_deallocated() {}
 940 
 941  void cache_prof_begin() {}
942  void cache_prof_end() {}
943  void cache_prof_print() const {}
944 
945  /* APIs for debugging */
946 
947  void* get_local_mem(void* addr) { return addr; }
948 };
949 
950 template <block_size_t BlockSize>
951 using core = ITYR_CONCAT(core_, ITYR_ORI_CORE)<BlockSize>;
952 
954 
955 ITYR_TEST_CASE("[ityr::ori::core] malloc/free with block policy") {
956  common::runtime_options common_opts;
 957  runtime_options opts;
 958  common::singleton_initializer<common::topology::instance> topo;
 959  common::singleton_initializer<common::rma::instance> rma;
960  constexpr block_size_t bs = 65536;
961  core<bs> c(16 * bs, bs / 4);
962 
963  ITYR_SUBCASE("free immediately") {
964  int n = 10;
965  for (int i = 1; i < n; i++) {
966  auto p = c.malloc_coll<mem_mapper::block>(i * 1234);
967  c.free_coll(p);
968  }
969  }
970 
971  ITYR_SUBCASE("free after accumulation") {
972  int n = 10;
973  void* ptrs[n];
974  for (int i = 1; i < n; i++) {
975  ptrs[i] = c.malloc_coll<mem_mapper::block>(i * 2743);
976  }
977  for (int i = 1; i < n; i++) {
978  c.free_coll(ptrs[i]);
979  }
980  }
981 }
982 
983 ITYR_TEST_CASE("[ityr::ori::core] malloc/free with cyclic policy") {
984  common::runtime_options common_opts;
 985  runtime_options opts;
 986  common::singleton_initializer<common::topology::instance> topo;
 987  common::singleton_initializer<common::rma::instance> rma;
988  constexpr block_size_t bs = 65536;
989  core<bs> c(16 * bs, bs / 4);
990 
991  ITYR_SUBCASE("free immediately") {
992  int n = 10;
993  for (int i = 1; i < n; i++) {
994  auto p = c.malloc_coll<mem_mapper::cyclic>(i * 123456);
995  c.free_coll(p);
996  }
997  }
998 
999  ITYR_SUBCASE("free after accumulation") {
1000  int n = 10;
1001  void* ptrs[n];
1002  for (int i = 1; i < n; i++) {
1003  ptrs[i] = c.malloc_coll<mem_mapper::cyclic>(i * 27438, bs * i);
1004  }
1005  for (int i = 1; i < n; i++) {
1006  c.free_coll(ptrs[i]);
1007  }
1008  }
1009 }
1010 
1011 ITYR_TEST_CASE("[ityr::ori::core] malloc and free (noncollective)") {
1012  common::runtime_options common_opts;
1013  runtime_options opts;
1014  common::singleton_initializer<common::topology::instance> topo;
1015  common::singleton_initializer<common::rma::instance> rma;
1016  constexpr block_size_t bs = 65536;
1017  core<bs> c(16 * bs, bs / 4);
1018 
1019  constexpr int n = 10;
1020  ITYR_SUBCASE("free immediately") {
1021  for (int i = 0; i < n; i++) {
1022  void* p = c.malloc(std::size_t(1) << i);
1023  c.free(p, std::size_t(1) << i);
1024  }
1025  }
1026 
1027  ITYR_SUBCASE("free after accumulation") {
1028  void* ptrs[n];
1029  for (int i = 0; i < n; i++) {
1030  ptrs[i] = c.malloc(std::size_t(1) << i);
1031  }
1032  for (int i = 0; i < n; i++) {
1033  c.free(ptrs[i], std::size_t(1) << i);
1034  }
1035  }
1036 
1037  ITYR_SUBCASE("remote free") {
1038  void* ptrs_send[n];
1039  void* ptrs_recv[n];
1040  for (int i = 0; i < n; i++) {
1041  ptrs_send[i] = c.malloc(std::size_t(1) << i);
1042  }
 1043 
 1044  auto my_rank = common::topology::my_rank();
 1045  auto n_ranks = common::topology::n_ranks();
 1046  auto mpicomm = common::topology::mpicomm();
1047 
1048  auto req_send = common::mpi_isend(ptrs_send, n, (n_ranks + my_rank + 1) % n_ranks, 0, mpicomm);
1049  auto req_recv = common::mpi_irecv(ptrs_recv, n, (n_ranks + my_rank - 1) % n_ranks, 0, mpicomm);
1050  common::mpi_wait(req_send);
1051  common::mpi_wait(req_recv);
1052 
1053  for (int i = 0; i < n; i++) {
1054  c.free(ptrs_recv[i], std::size_t(1) << i);
1055  }
1056  }
1057 }
1058 
1059 ITYR_TEST_CASE("[ityr::ori::core] get/put") {
1060  common::runtime_options common_opts;
1061  runtime_options opts;
1062  common::singleton_initializer<common::topology::instance> topo;
1063  common::singleton_initializer<common::rma::instance> rma;
1064  constexpr block_size_t bs = 65536;
1065  int n_cb = 16;
1066  core<bs> c(n_cb * bs, bs / 4);
 1067 
 1068  auto my_rank = common::topology::my_rank();
1069 
1070  std::size_t n = n_cb * bs / sizeof(std::size_t);
1071 
1072  std::size_t* ps[2];
1073  ps[0] = reinterpret_cast<std::size_t*>(c.malloc_coll<mem_mapper::block >(n * sizeof(std::size_t)));
1074  ps[1] = reinterpret_cast<std::size_t*>(c.malloc_coll<mem_mapper::cyclic>(n * sizeof(std::size_t)));
1075 
1076  std::size_t* buf = new std::size_t[n + 2];
1077 
1078  auto barrier = [&]() {
 1079  c.release();
 1080  common::mpi_barrier(common::topology::mpicomm());
1081  c.acquire();
1082  };
1083 
1084  for (auto p : ps) {
1085  if (my_rank == 0) {
1086  for (std::size_t i = 0; i < n; i++) {
1087  buf[i] = i;
1088  }
1089  c.put(buf, p, n * sizeof(std::size_t));
1090  }
1091 
1092  barrier();
1093 
1094  ITYR_SUBCASE("get the entire array") {
1095  std::size_t special = 417;
1096  buf[0] = buf[n + 1] = special;
1097 
1098  c.get(p, buf + 1, n * sizeof(std::size_t));
1099 
1100  for (std::size_t i = 0; i < n; i++) {
1101  ITYR_CHECK(buf[i + 1] == i);
1102  }
1103  ITYR_CHECK(buf[0] == special);
1104  ITYR_CHECK(buf[n + 1] == special);
1105  }
1106 
1107  ITYR_SUBCASE("get the partial array") {
1108  std::size_t ib = n / 5 * 2;
1109  std::size_t ie = n / 5 * 4;
1110  std::size_t s = ie - ib;
1111 
1112  std::size_t special = 417;
1113  buf[0] = buf[s + 1] = special;
1114 
1115  c.get(p + ib, buf + 1, s * sizeof(std::size_t));
1116 
1117  for (std::size_t i = 0; i < s; i++) {
1118  ITYR_CHECK(buf[i + 1] == i + ib);
1119  }
1120  ITYR_CHECK(buf[0] == special);
1121  ITYR_CHECK(buf[s + 1] == special);
1122  }
1123 
1124  ITYR_SUBCASE("get each element") {
1125  for (std::size_t i = 0; i < n; i++) {
1126  std::size_t special = 417;
1127  buf[0] = buf[2] = special;
1128  c.get(p + i, &buf[1], sizeof(std::size_t));
1129  ITYR_CHECK(buf[0] == special);
1130  ITYR_CHECK(buf[1] == i);
1131  ITYR_CHECK(buf[2] == special);
1132  }
1133  }
1134  }
1135 
1136  delete[] buf;
1137 
1138  c.free_coll(ps[0]);
1139  c.free_coll(ps[1]);
1140 }
1141 
1142 ITYR_TEST_CASE("[ityr::ori::core] checkout/checkin (small, aligned)") {
1143  common::runtime_options common_opts;
1144  runtime_options opts;
1145  common::singleton_initializer<common::topology::instance> topo;
1146  common::singleton_initializer<common::rma::instance> rma;
1147  constexpr block_size_t bs = 65536;
1148  int n_cb = 16;
1149  core<bs> c(n_cb * bs, bs / 4);
 1150 
 1151  auto my_rank = common::topology::my_rank();
 1152  auto n_ranks = common::topology::n_ranks();
1153 
1154  int n = bs * n_ranks;
1155  uint8_t* ps[2];
1156  ps[0] = reinterpret_cast<uint8_t*>(c.malloc_coll<mem_mapper::block >(n));
1157  ps[1] = reinterpret_cast<uint8_t*>(c.malloc_coll<mem_mapper::cyclic>(n));
1158 
1159  auto barrier = [&]() {
 1160  c.release();
 1161  common::mpi_barrier(common::topology::mpicomm());
1162  c.acquire();
1163  };
1164 
1165  for (auto p : ps) {
1166  uint8_t* home_ptr = reinterpret_cast<uint8_t*>(c.get_local_mem(p));
1167  for (std::size_t i = 0; i < bs; i++) {
1168  home_ptr[i] = my_rank;
1169  }
1170 
1171  barrier();
1172 
1173  ITYR_SUBCASE("read the entire array") {
1174  c.checkout(p, n, mode::read);
1175  for (int i = 0; i < n; i++) {
1176  ITYR_CHECK_MESSAGE(p[i] == i / bs, "rank: ", my_rank, ", i: ", i);
1177  }
1178  c.checkin(p, n, mode::read);
1179  }
1180 
1181  ITYR_SUBCASE("read and write the entire array") {
1182  for (int iter = 0; iter < n_ranks; iter++) {
1183  if (iter == my_rank) {
1184  c.checkout(p, n, mode::read_write);
1185  for (int i = 0; i < n; i++) {
1186  ITYR_CHECK_MESSAGE(p[i] == i / bs + iter, "iter: ", iter, ", rank: ", my_rank, ", i: ", i);
1187  p[i]++;
1188  }
1189  c.checkin(p, n, mode::read_write);
1190  }
1191 
1192  barrier();
1193 
1194  c.checkout(p, n, mode::read);
1195  for (int i = 0; i < n; i++) {
1196  ITYR_CHECK_MESSAGE(p[i] == i / bs + iter + 1, "iter: ", iter, ", rank: ", my_rank, ", i: ", i);
1197  }
1198  c.checkin(p, n, mode::read);
1199 
1200  barrier();
1201  }
1202  }
1203 
1204  ITYR_SUBCASE("read the partial array") {
1205  int ib = n / 5 * 2;
1206  int ie = n / 5 * 4;
1207  int s = ie - ib;
1208 
1209  c.checkout(p + ib, s, mode::read);
1210  for (int i = 0; i < s; i++) {
1211  ITYR_CHECK_MESSAGE(p[ib + i] == (i + ib) / bs, "rank: ", my_rank, ", i: ", i);
1212  }
1213  c.checkin(p + ib, s, mode::read);
1214  }
1215  }
1216 
1217  c.free_coll(ps[0]);
1218  c.free_coll(ps[1]);
1219 }
1220 
1221 ITYR_TEST_CASE("[ityr::ori::core] checkout/checkin (large, not aligned)") {
1222  common::runtime_options common_opts;
1223  runtime_options opts;
1224  common::singleton_initializer<common::topology::instance> topo;
1225  common::singleton_initializer<common::rma::instance> rma;
1226  constexpr block_size_t bs = 65536;
1227  int n_cb = 16;
1228  core<bs> c(n_cb * bs, bs / 4);
 1229 
 1230  auto my_rank = common::topology::my_rank();
 1231  auto n_ranks = common::topology::n_ranks();
1232 
1233  std::size_t n = 10 * n_cb * bs / sizeof(std::size_t);
1234 
1235  std::size_t* ps[2];
1236  ps[0] = reinterpret_cast<std::size_t*>(c.malloc_coll<mem_mapper::block >(n * sizeof(std::size_t)));
1237  ps[1] = reinterpret_cast<std::size_t*>(c.malloc_coll<mem_mapper::cyclic>(n * sizeof(std::size_t)));
1238 
1239  std::size_t max_checkout_size = (16 - 2) * bs / sizeof(std::size_t);
1240 
1241  auto barrier = [&]() {
 1242  c.release();
 1243  common::mpi_barrier(common::topology::mpicomm());
1244  c.acquire();
1245  };
1246 
1247  for (auto p : ps) {
1248  if (my_rank == 0) {
1249  for (std::size_t i = 0; i < n; i += max_checkout_size) {
1250  std::size_t m = std::min(max_checkout_size, n - i);
1251  c.checkout(p + i, m * sizeof(std::size_t), mode::write);
1252  for (std::size_t j = i; j < i + m; j++) {
1253  p[j] = j;
1254  }
1255  c.checkin(p + i, m * sizeof(std::size_t), mode::write);
1256  }
1257  }
1258 
1259  barrier();
1260 
1261  ITYR_SUBCASE("read the entire array") {
1262  for (std::size_t i = 0; i < n; i += max_checkout_size) {
1263  std::size_t m = std::min(max_checkout_size, n - i);
1264  c.checkout(p + i, m * sizeof(std::size_t), mode::read);
1265  for (std::size_t j = i; j < i + m; j++) {
1266  ITYR_CHECK(p[j] == j);
1267  }
1268  c.checkin(p + i, m * sizeof(std::size_t), mode::read);
1269  }
1270  }
1271 
1272  ITYR_SUBCASE("read the partial array") {
1273  std::size_t ib = n / 5 * 2;
1274  std::size_t ie = n / 5 * 4;
1275  std::size_t s = ie - ib;
1276 
1277  for (std::size_t i = 0; i < s; i += max_checkout_size) {
1278  std::size_t m = std::min(max_checkout_size, s - i);
1279  c.checkout(p + ib + i, m * sizeof(std::size_t), mode::read);
1280  for (std::size_t j = ib + i; j < ib + i + m; j++) {
1281  ITYR_CHECK(p[j] == j);
1282  }
1283  c.checkin(p + ib + i, m * sizeof(std::size_t), mode::read);
1284  }
1285  }
1286 
1287  ITYR_SUBCASE("read and write the partial array") {
1288  std::size_t stride = 48;
1289  ITYR_REQUIRE(stride <= max_checkout_size);
1290  for (std::size_t i = my_rank * stride; i < n; i += n_ranks * stride) {
1291  std::size_t s = std::min(stride, n - i);
1292  c.checkout(p + i, s * sizeof(std::size_t), mode::read_write);
1293  for (std::size_t j = i; j < i + s; j++) {
1294  ITYR_CHECK(p[j] == j);
1295  p[j] *= 2;
1296  }
1297  c.checkin(p + i, s * sizeof(std::size_t), mode::read_write);
1298  }
1299 
1300  barrier();
1301 
1302  for (std::size_t i = 0; i < n; i += max_checkout_size) {
1303  std::size_t m = std::min(max_checkout_size, n - i);
1304  c.checkout(p + i, m * sizeof(std::size_t), mode::read);
1305  for (std::size_t j = i; j < i + m; j++) {
1306  ITYR_CHECK(p[j] == j * 2);
1307  }
1308  c.checkin(p + i, m * sizeof(std::size_t), mode::read);
1309  }
1310  }
1311  }
1312 
1313  c.free_coll(ps[0]);
1314  c.free_coll(ps[1]);
1315 }
1316 
1317 ITYR_TEST_CASE("[ityr::ori::core] checkout/checkin (noncontig)") {
1318  common::runtime_options common_opts;
1319  runtime_options opts;
1320  common::singleton_initializer<common::topology::instance> topo;
1321  common::singleton_initializer<common::rma::instance> rma;
1322  constexpr block_size_t bs = 65536;
1323  int n_cb = 8;
1324  core<bs> c(n_cb * bs, bs / 4);
 1325 
 1326  auto my_rank = common::topology::my_rank();
 1327  auto n_ranks = common::topology::n_ranks();
1328 
1329  std::size_t n = 2 * n_cb * bs / sizeof(std::size_t);
1330 
1331  std::size_t* ps[2];
1332  ps[0] = reinterpret_cast<std::size_t*>(c.malloc_coll<mem_mapper::block >(n * sizeof(std::size_t)));
1333  ps[1] = reinterpret_cast<std::size_t*>(c.malloc_coll<mem_mapper::cyclic>(n * sizeof(std::size_t)));
1334 
1335  auto barrier = [&]() {
 1336  c.release();
 1337  common::mpi_barrier(common::topology::mpicomm());
1338  c.acquire();
1339  };
1340 
1341  for (auto p : ps) {
1342  for (std::size_t i = my_rank; i < n; i += n_ranks) {
1343  c.checkout(p + i, sizeof(std::size_t), mode::write);
1344  p[i] = i;
1345  c.checkin(p + i, sizeof(std::size_t), mode::write);
1346  }
1347 
1348  barrier();
1349 
1350  for (std::size_t i = (my_rank + 1) % n_ranks; i < n; i += n_ranks) {
1351  c.checkout(p + i, sizeof(std::size_t), mode::read_write);
1352  ITYR_CHECK(p[i] == i);
1353  p[i] *= 2;
1354  c.checkin(p + i, sizeof(std::size_t), mode::read_write);
1355  }
1356 
1357  barrier();
1358 
1359  for (std::size_t i = (my_rank + 2) % n_ranks; i < n; i += n_ranks) {
1360  if (i % 3 == 0) {
1361  c.checkout(p + i, sizeof(std::size_t), mode::write);
1362  p[i] = i * 10;
1363  c.checkin(p + i, sizeof(std::size_t), mode::write);
1364  } else {
1365  c.checkout(p + i, sizeof(std::size_t), mode::read);
1366  ITYR_CHECK(p[i] == i * 2);
1367  c.checkin(p + i, sizeof(std::size_t), mode::read);
1368  }
1369  }
1370 
1371  barrier();
1372 
1373  for (std::size_t i = (my_rank + 3) % n_ranks; i < n; i += n_ranks) {
1374  c.checkout(p + i, sizeof(std::size_t), mode::read);
1375  if (i % 3 == 0) {
1376  ITYR_CHECK(p[i] == i * 10);
1377  } else {
1378  ITYR_CHECK(p[i] == i * 2);
1379  }
1380  c.checkin(p + i, sizeof(std::size_t), mode::read);
1381  }
1382  }
1383 
1384  c.free_coll(ps[0]);
1385  c.free_coll(ps[1]);
1386 }
1387 
1388 ITYR_TEST_CASE("[ityr::ori::core] checkout/checkin (noncollective)") {
1389  common::runtime_options common_opts;
1390  runtime_options opts;
1391  common::singleton_initializer<common::topology::instance> topo;
1392  common::singleton_initializer<common::rma::instance> rma;
1393  constexpr block_size_t bs = 65536;
1394  int n_cb = 16;
1395  core<bs> c(n_cb * bs, bs / 4);
 1396 
 1397  auto my_rank = common::topology::my_rank();
 1398  auto n_ranks = common::topology::n_ranks();
 1399  auto mpicomm = common::topology::mpicomm();
1400 
1401  ITYR_SUBCASE("list creation") {
1402  int niter = 1000;
1403  int n_alloc_iter = 10;
1404 
1405  struct node_t {
1406  node_t* next = nullptr;
1407  int value;
1408  };
1409 
1410  node_t* root_node = new (c.malloc(sizeof(node_t))) node_t;
1411 
1412  c.checkout(root_node, sizeof(node_t), mode::write);
1413  root_node->next = nullptr;
1414  root_node->value = my_rank;
1415  c.checkin(root_node, sizeof(node_t), mode::write);
1416 
1417  node_t* node = root_node;
1418  for (int i = 0; i < niter; i++) {
1419  for (int j = 0; j < n_alloc_iter; j++) {
1420  // append a new node
1421  node_t* new_node = new (c.malloc(sizeof(node_t))) node_t;
1422 
1423  c.checkout(&node->next, sizeof(node->next), mode::write);
1424  node->next = new_node;
1425  c.checkin(&node->next, sizeof(node->next), mode::write);
1426 
1427  c.checkout(&node->value, sizeof(node->value), mode::read);
1428  int val = node->value;
1429  c.checkin(&node->value, sizeof(node->value), mode::read);
1430 
1431  c.checkout(new_node, sizeof(node_t), mode::write);
1432  new_node->next = nullptr;
1433  new_node->value = val + 1;
1434  c.checkin(new_node, sizeof(node_t), mode::write);
1435 
1436  node = new_node;
1437  }
1438 
1439  c.release();
1440 
 1441  // exchange list nodes across ranks
1442  node_t* next_node;
1443 
1444  auto req_send = common::mpi_isend(&node , 1, (n_ranks + my_rank + 1) % n_ranks, i, mpicomm);
1445  auto req_recv = common::mpi_irecv(&next_node, 1, (n_ranks + my_rank - 1) % n_ranks, i, mpicomm);
1446  common::mpi_wait(req_send);
1447  common::mpi_wait(req_recv);
1448 
1449  node = next_node;
1450 
1451  c.acquire();
1452  }
1453 
 1454  c.release();
 1455  common::mpi_barrier(mpicomm);
1456  c.acquire();
1457 
1458  int count = 0;
1459  node = root_node;
1460  while (node != nullptr) {
1461  c.checkout(node, sizeof(node_t), mode::read);
1462 
1463  ITYR_CHECK(node->value == my_rank + count);
1464 
1465  node_t* prev_node = node;
1466  node = node->next;
1467 
1468  c.checkin(prev_node, sizeof(node_t), mode::read);
1469 
1470  std::destroy_at(prev_node);
1471  c.free(prev_node, sizeof(node_t));
1472 
1473  count++;
1474  }
1475  }
1476 }
1477 
1478 ITYR_TEST_CASE("[ityr::ori::core] release/acquire fence") {
1479  common::runtime_options common_opts;
1480  runtime_options opts;
1481  common::singleton_initializer<common::topology::instance> topo;
1482  common::singleton_initializer<common::rma::instance> rma;
1483  constexpr block_size_t bs = 65536;
1484  int n_cb = 16;
1485  core<bs> c(n_cb * bs, bs / 4);
 1486 
 1487  auto my_rank = common::topology::my_rank();
 1488  auto n_ranks = common::topology::n_ranks();
 1489  auto mpicomm = common::topology::mpicomm();
1490 
1491  auto barrier = [&]() {
 1492  c.release();
 1493  common::mpi_barrier(mpicomm);
1494  c.acquire();
1495  };
1496 
1497  int* p = reinterpret_cast<int*>(c.malloc_coll(sizeof(int)));
1498 
1499  if (my_rank == 0) {
1500  c.checkout(p, sizeof(int), mode::write);
1501  p[0] = 3;
1502  c.checkin(p, sizeof(int), mode::write);
1503  }
1504 
1505  barrier();
1506 
1507  c.checkout(p, sizeof(int), mode::read);
1508  ITYR_CHECK(p[0] == 3);
1509  c.checkin(p, sizeof(int), mode::read);
1510 
1511  barrier();
1512 
1513  if (my_rank == (n_ranks + 1) % n_ranks) {
1514  c.checkout(p, sizeof(int), mode::read_write);
1515  p[0] += 5;
1516  c.checkin(p, sizeof(int), mode::read_write);
1517  }
1518 
1519  barrier();
1520 
1521  c.checkout(p, sizeof(int), mode::read);
1522  ITYR_CHECK(p[0] == 8);
1523  c.checkin(p, sizeof(int), mode::read);
1524 
1525  barrier();
1526 
1527  int n = 100;
1528  for (int i = 0; i < n; i++) {
1529  auto root_rank = (n_ranks + i) % n_ranks;
1530  if (my_rank == root_rank) {
1531  c.checkout(p, sizeof(int), mode::read_write);
1532  p[0] += 12;
1533  c.checkin(p, sizeof(int), mode::read_write);
1534  }
 1535  }
 1536  core<bs>::release_handler rh{};
1537 
1538  if (my_rank == root_rank) {
1539  rh = c.release_lazy();
1540  }
1541 
1542  rh = common::mpi_bcast_value(rh, root_rank, mpicomm);
1543 
1544  if (my_rank != root_rank) {
1545  c.acquire(rh);
1546  }
1547 
1548  c.checkout(p, sizeof(int), mode::read);
1549  ITYR_CHECK(p[0] == 20 + 12 * i);
1550  c.checkin(p, sizeof(int), mode::read);
1551 
1552  auto req = common::mpi_ibarrier(mpicomm);
1553  while (!common::mpi_test(req)) {
1554  c.poll();
1555  }
1556  }
1557 
1558  c.free_coll(p);
1559 }
1560 
1561 }