template <block_size_t BlockSize, typename Fn>
void for_each_block(void* addr, std::size_t size, Fn fn) {
  // ... (blk_addr_b: addr rounded down to a block boundary)
  std::byte* blk_addr_e = reinterpret_cast<std::byte*>(addr) + size;

  for (std::byte* blk_addr = blk_addr_b; blk_addr < blk_addr_e; blk_addr += BlockSize) {
    std::byte* req_addr_b = std::max(reinterpret_cast<std::byte*>(addr), blk_addr);
    std::byte* req_addr_e = std::min(reinterpret_cast<std::byte*>(addr) + size, blk_addr + BlockSize);
    fn(blk_addr, req_addr_b, req_addr_e);
  }
}
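
// --- Illustration (added; not part of the original listing) -----------------
// A minimal standalone sketch of the decomposition performed by for_each_block
// above, assuming a hypothetical 64-byte block size: every block overlapping
// [addr, addr + size) is visited once, and the callback receives the block
// start plus the requested range clamped to that block.
//
//   #include <algorithm>
//   #include <cstddef>
//   #include <cstdint>
//   #include <cstdio>
//
//   constexpr std::size_t kBlockSize = 64;  // hypothetical block size
//
//   template <typename Fn>
//   void for_each_block_demo(std::byte* addr, std::size_t size, Fn fn) {
//     auto misalign = reinterpret_cast<std::uintptr_t>(addr) % kBlockSize;
//     std::byte* blk_addr_b = addr - misalign;   // round down to a block boundary
//     std::byte* blk_addr_e = addr + size;
//     for (std::byte* blk = blk_addr_b; blk < blk_addr_e; blk += kBlockSize) {
//       std::byte* req_b = std::max(addr, blk);
//       std::byte* req_e = std::min(addr + size, blk + kBlockSize);
//       fn(blk, req_b, req_e);                   // e.g. fetch/copy only [req_b, req_e)
//     }
//   }
//
//   int main() {
//     alignas(64) std::byte buf[256];
//     // A 100-byte request starting 10 bytes into the buffer touches blocks 0 and 1:
//     //   block@0: [10, 64)   block@64: [64, 110)
//     for_each_block_demo(buf + 10, 100, [&](std::byte* blk, std::byte* b, std::byte* e) {
//       std::printf("block@%td: [%td, %td)\n", blk - buf, b - buf, e - buf);
//     });
//   }
// -----------------------------------------------------------------------------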
template <block_size_t BlockSize, typename HomeSegFn, typename CacheBlkFn>
void for_each_seg_blk(const coll_mem& cm, void* addr, std::size_t size,
                      HomeSegFn home_seg_fn, CacheBlkFn cache_blk_fn) {
  for_each_mem_segment(cm, addr, size, [&](const auto& seg) {
    std::byte*  seg_addr = reinterpret_cast<std::byte*>(cm.vm().addr()) + seg.offset_b;
    std::size_t seg_size = seg.offset_e - seg.offset_b;

    // ... (branch elided: each segment is either handed to home_seg_fn as a whole
    //      or decomposed into cache blocks for cache_blk_fn)
    home_seg_fn(seg_addr, seg_size, seg.pm_offset);

    // ...
    std::byte* addr_b = std::max(seg_addr, reinterpret_cast<std::byte*>(addr));
    std::byte* addr_e = std::min(seg_addr + seg_size, reinterpret_cast<std::byte*>(addr) + size);
    for_each_block<BlockSize>(addr_b, addr_e - addr_b, [&](std::byte* blk_addr,
                                                           std::byte* req_addr_b,
                                                           std::byte* req_addr_e) {
      std::size_t pm_offset = seg.pm_offset + (blk_addr - seg_addr);
      cache_blk_fn(blk_addr, req_addr_b, req_addr_e, seg.owner, pm_offset);
    });
  });
}
template <block_size_t BlockSize>
class core_default {
public:
  core_default(std::size_t cache_size, std::size_t sub_block_size)
    : // ...
      home_manager_(calc_home_mmap_limit(cache_size / BlockSize)),
      cache_manager_(cache_size, sub_block_size) {}

  static constexpr block_size_t block_size = BlockSize;

  void* malloc_coll(std::size_t size) { /* ... */ }
  template <template <block_size_t> typename MemMapper, typename... MemMapperArgs>
  void* malloc_coll(std::size_t size, MemMapperArgs&&... mmargs) {
    // ...
    ITYR_CHECK_MESSAGE(/* size is identical on all workers */,
                       "The size passed to malloc_coll() is different among workers");

    auto mmapper = std::make_unique<MemMapper<BlockSize>>(
        size, /* ... */
        std::forward<MemMapperArgs>(mmargs)...);
    coll_mem& cm = cm_manager_.create(size, std::move(mmapper));

    void* addr = cm.vm().addr();

    common::verbose("Allocate collective memory [%p, %p) (%ld bytes) (win=%p)",
                    addr, reinterpret_cast<std::byte*>(addr) + size, size, &cm.win());

    return addr;
  }
  void* malloc(std::size_t size) {
    // ...
    void* addr = noncoll_mem_.allocate(size);

    common::verbose<2>("Allocate noncollective memory [%p, %p) (%ld bytes)",
                       addr, reinterpret_cast<std::byte*>(addr) + size, size);

    return addr;
  }
108 "The address passed to free_coll() is different among workers");
111 cache_manager_.ensure_all_cache_clean();
117 for (std::size_t o = 0; o < cm.
effective_size(); o += BlockSize) {
118 std::byte* addr =
reinterpret_cast<std::byte*
>(cm.
vm().
addr()) + o;
119 home_manager_.ensure_evicted(addr);
120 cache_manager_.ensure_evicted(addr);
123 home_manager_.clear_tlb();
124 cache_manager_.clear_tlb();
126 common::verbose(
"Deallocate collective memory [%p, %p) (%ld bytes) (win=%p)",
127 addr,
reinterpret_cast<std::byte*
>(addr) + cm.
size(), cm.
size(), &cm.
win());
  void free(void* addr, std::size_t size) {
    // ...
    auto target_rank = noncoll_mem_.get_owner(addr);

    if (target_rank == common::topology::my_rank()) {
      common::verbose<2>("Deallocate noncollective memory [%p, %p) (%ld bytes) locally",
                         addr, reinterpret_cast<std::byte*>(addr) + size, size);

      noncoll_mem_.local_deallocate(addr, size);
    } else {
      // Cached copies of remotely owned memory are discarded rather than written back.
      for_each_block<BlockSize>(addr, size, [&](std::byte* blk_addr,
                                                std::byte* req_addr_b,
                                                std::byte* req_addr_e) {
        cache_manager_.discard_dirty(blk_addr, req_addr_b, req_addr_e);
      });

      noncoll_mem_.remote_deallocate(addr, size, target_rank);

      common::verbose<2>("Deallocate noncollective memory [%p, %p) (%ld bytes) remotely (rank=%d)",
                         addr, reinterpret_cast<std::byte*>(addr) + size, size, target_rank);
    }
  }
  void get(const void* from_addr, void* to_addr, std::size_t size) {
    // ...
    std::byte* from_addr_ = reinterpret_cast<std::byte*>(const_cast<void*>(from_addr));

    if constexpr (enable_vm_map) {
      if (!checkout_impl_nb<mode::read_t, false>(from_addr_, size)) {
        checkout_complete_impl();
      }
      get_copy_impl(from_addr_, reinterpret_cast<std::byte*>(to_addr), size);
    } else {
      if (!checkout_impl_nb<mode::read_t, true>(from_addr_, size)) {
        checkout_complete_impl();
      }
      get_copy_impl(from_addr_, reinterpret_cast<std::byte*>(to_addr), size);
      checkin_impl<mode::read_t, true>(from_addr_, size);
    }
  }
  void put(const void* from_addr, void* to_addr, std::size_t size) {
    // ...
    std::byte* to_addr_ = reinterpret_cast<std::byte*>(to_addr);

    if constexpr (enable_vm_map) {
      if (!checkout_impl_nb<mode::write_t, false>(to_addr_, size)) {
        checkout_complete_impl();
      }
      put_copy_impl(reinterpret_cast<const std::byte*>(from_addr), to_addr_, size);
      checkin_impl<mode::write_t, false>(to_addr_, size);
    } else {
      if (!checkout_impl_nb<mode::write_t, true>(to_addr_, size)) {
        checkout_complete_impl();
      }
      put_copy_impl(reinterpret_cast<const std::byte*>(from_addr), to_addr_, size);
      checkin_impl<mode::write_t, true>(to_addr_, size);
    }
  }
  template <typename Mode>
  bool checkout_nb(void* addr, std::size_t size, Mode) {
    if constexpr (!enable_vm_map) {
      common::die("ITYR_ORI_ENABLE_VM_MAP must be true for core::checkout/checkin");
    }
    // ...
    common::verbose<2>("Checkout request (mode: %s) for [%p, %p) (%ld bytes)",
                       str(Mode{}).c_str(), addr, reinterpret_cast<std::byte*>(addr) + size, size);
    // ...
    return checkout_impl_nb<Mode, true>(reinterpret_cast<std::byte*>(addr), size);
  }

  template <typename Mode>
  void checkout(void* addr, std::size_t size, Mode mode) {
    if (!checkout_nb(addr, size, mode)) {
      checkout_complete();
    }
  }

  void checkout_complete() {
    // ...
    checkout_complete_impl();
  }

  template <typename Mode>
  void checkin(void* addr, std::size_t size, Mode) {
    if constexpr (!enable_vm_map) {
      common::die("ITYR_ORI_ENABLE_VM_MAP must be true for core::checkout/checkin");
    }
    // ...
    common::verbose<2>("Checkin request (mode: %s) for [%p, %p) (%ld bytes)",
                       str(Mode{}).c_str(), addr, reinterpret_cast<std::byte*>(addr) + size, size);
    // ...
    checkin_impl<Mode, true>(reinterpret_cast<std::byte*>(addr), size);
  }
  void release() {
    // ...
    cache_manager_.release();
  }

  using release_handler = typename cache_manager<BlockSize>::release_handler;

  release_handler release_lazy() {
    // ...
    common::verbose<2>("Lazy release handler is created");

    return cache_manager_.release_lazy();
  }

  void acquire() {
    // ...
    cache_manager_.acquire();
  }

  void acquire(release_handler rh) {
    // ...
    cache_manager_.acquire(rh);
  }

  void set_readonly_coll(void* addr, std::size_t size) {
    // ...
    cache_manager_.set_readonly(addr, size);
  }

  void unset_readonly_coll(void* addr, std::size_t size) {
    // ...
    cache_manager_.unset_readonly(addr, size);
  }

  void poll() {
    cache_manager_.poll();
  }

  void collect_deallocated() {
    noncoll_mem_.collect_deallocated();
  }

  void cache_prof_begin() {
    home_manager_.home_prof_begin();
    cache_manager_.cache_prof_begin();
  }

  void cache_prof_end() {
    home_manager_.home_prof_end();
    cache_manager_.cache_prof_end();
  }

  void cache_prof_print() const {
    home_manager_.home_prof_print();
    cache_manager_.cache_prof_print();
  }

  void* get_local_mem(void* addr) {
    // ...
  }

private:
  std::size_t calc_home_mmap_limit(std::size_t n_cache_blocks) const {
    std::size_t sys_limit = common::sys_mmap_entry_limit();
    std::size_t margin = 1000;
    ITYR_CHECK(sys_limit > 2 * n_cache_blocks + margin);

    std::size_t candidate = (sys_limit - 2 * n_cache_blocks - margin) / 2;
    std::size_t max_val = 1024 * 1024;

    return std::min(max_val, candidate);
  }
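
  // --- Illustration (added; not part of the original listing) ---------------
  // A standalone sketch of the budget computed by calc_home_mmap_limit above,
  // assuming a hypothetical system limit of 65530 mmap entries (a common Linux
  // default for vm.max_map_count) and 4096 cache blocks: two entries are
  // reserved per cache block plus a safety margin, half of the remainder goes
  // to home mappings, capped at 1Mi entries.
  //
  //   #include <algorithm>
  //   #include <cstddef>
  //   #include <cstdio>
  //
  //   std::size_t calc_home_mmap_limit_demo(std::size_t sys_limit, std::size_t n_cache_blocks) {
  //     std::size_t margin    = 1000;
  //     std::size_t candidate = (sys_limit - 2 * n_cache_blocks - margin) / 2;
  //     return std::min<std::size_t>(1024 * 1024, candidate);
  //   }
  //
  //   int main() {
  //     // (65530 - 2 * 4096 - 1000) / 2 = 28169 home mmap entries
  //     std::printf("%zu\n", calc_home_mmap_limit_demo(65530, 4096));
  //   }
  // ---------------------------------------------------------------------------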
  template <typename Mode, bool IncrementRef>
  bool checkout_impl_nb(std::byte* addr, std::size_t size) {
    constexpr bool skip_fetch = std::is_same_v<Mode, mode::write_t>;
    if (noncoll_mem_.has(addr)) {
      return checkout_noncoll_nb<skip_fetch, IncrementRef>(addr, size);
    }
    return checkout_coll_nb<skip_fetch, IncrementRef>(addr, size);
  }
  template <bool SkipFetch, bool IncrementRef>
  bool checkout_coll_nb(std::byte* addr, std::size_t size) {
    if (home_manager_.template checkout_fast<IncrementRef>(addr, size)) {
      auto [entry_found, fetch_completed] =
          cache_manager_.template checkout_fast<SkipFetch, IncrementRef>(addr, size);
      if (entry_found) {
        return fetch_completed;
      }
    }

    coll_mem& cm = cm_manager_.get(addr);

    bool checkout_completed = true;

    for_each_seg_blk<BlockSize>(cm, addr, size,
        [&](std::byte* seg_addr, std::size_t seg_size, std::size_t pm_offset) {
          checkout_completed &=
              home_manager_.template checkout_seg<IncrementRef>(
                  seg_addr, seg_size, addr, size,
                  cm.home_pm(), pm_offset, cm.home_all_mapped());
        },
        [&](std::byte* blk_addr, std::byte* req_addr_b, std::byte* req_addr_e,
            /* owner, pm_offset */) {
          checkout_completed &=
              cache_manager_.template checkout_blk<SkipFetch, IncrementRef>(
                  blk_addr, req_addr_b, req_addr_e,
                  /* ... */);
        });

    return checkout_completed;
  }
  template <bool SkipFetch, bool IncrementRef>
  bool checkout_noncoll_nb(std::byte* addr, std::size_t size) {
    // ...
    auto target_rank = noncoll_mem_.get_owner(addr);

    // ...
    home_manager_.on_checkout_noncoll(size);

    // ...
    auto [entry_found, fetch_completed] =
        cache_manager_.template checkout_fast<SkipFetch, IncrementRef>(addr, size);
    if (entry_found) {
      return fetch_completed;
    }

    bool checkout_completed = true;

    for_each_block<BlockSize>(addr, size, [&](std::byte* blk_addr,
                                              std::byte* req_addr_b,
                                              std::byte* req_addr_e) {
      checkout_completed &=
          cache_manager_.template checkout_blk<SkipFetch, IncrementRef>(
              blk_addr, req_addr_b, req_addr_e,
              /* ... */);
    });

    return checkout_completed;
  }
  template <typename Mode, bool DecrementRef>
  void checkin_impl(std::byte* addr, std::size_t size) {
    constexpr bool register_dirty = !std::is_same_v<Mode, mode::read_t>;
    if (noncoll_mem_.has(addr)) {
      checkin_noncoll<register_dirty, DecrementRef>(addr, size);
    } else {
      checkin_coll<register_dirty, DecrementRef>(addr, size);
    }
  }
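
  // --- Illustration (added; not part of the original listing) ---------------
  // A standalone sketch of the compile-time mode dispatch used by
  // checkout_impl_nb and checkin_impl above: a write-only checkout can skip
  // fetching the old block contents, and any mode other than read-only must
  // register the block as dirty on checkin. The mode_demo tags stand in for
  // the library's mode::read_t / write_t / read_write_t tag types.
  //
  //   #include <cstdio>
  //   #include <type_traits>
  //
  //   namespace mode_demo {
  //   struct read_t {};
  //   struct write_t {};
  //   struct read_write_t {};
  //   }
  //
  //   template <typename Mode>
  //   void report() {
  //     constexpr bool skip_fetch     = std::is_same_v<Mode, mode_demo::write_t>;
  //     constexpr bool register_dirty = !std::is_same_v<Mode, mode_demo::read_t>;
  //     std::printf("skip_fetch=%d register_dirty=%d\n", skip_fetch, register_dirty);
  //   }
  //
  //   int main() {
  //     report<mode_demo::read_t>();        // skip_fetch=0 register_dirty=0
  //     report<mode_demo::write_t>();       // skip_fetch=1 register_dirty=1
  //     report<mode_demo::read_write_t>();  // skip_fetch=0 register_dirty=1
  //   }
  // ---------------------------------------------------------------------------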
  void checkout_complete_impl() {
    home_manager_.checkout_complete();
    cache_manager_.checkout_complete();
  }
  template <bool RegisterDirty, bool DecrementRef>
  void checkin_coll(std::byte* addr, std::size_t size) {
    if (home_manager_.template checkin_fast<DecrementRef>(addr, size)) {
      if (cache_manager_.template checkin_fast<RegisterDirty, DecrementRef>(addr, size)) {
        return;
      }
    }

    coll_mem& cm = cm_manager_.get(addr);

    for_each_seg_blk<BlockSize>(cm, addr, size,
        [&](std::byte* seg_addr, std::size_t, std::size_t) {
          home_manager_.template checkin_seg<DecrementRef>(seg_addr, cm.home_all_mapped());
        },
        [&](std::byte* blk_addr, std::byte* req_addr_b, std::byte* req_addr_e,
            /* owner, pm_offset */) {
          cache_manager_.template checkin_blk<RegisterDirty, DecrementRef>(
              blk_addr, req_addr_b, req_addr_e);
        });
  }
  template <bool RegisterDirty, bool DecrementRef>
  void checkin_noncoll(std::byte* addr, std::size_t size) {
    // ...
    auto target_rank = noncoll_mem_.get_owner(addr);

    // ...
    if (cache_manager_.template checkin_fast<RegisterDirty, DecrementRef>(addr, size)) {
      return;
    }

    for_each_block<BlockSize>(addr, size, [&](std::byte* blk_addr,
                                              std::byte* req_addr_b,
                                              std::byte* req_addr_e) {
      cache_manager_.template checkin_blk<RegisterDirty, DecrementRef>(
          blk_addr, req_addr_b, req_addr_e);
    });
  }
  void get_copy_impl(std::byte* from_addr, std::byte* to_addr, std::size_t size) {
    if constexpr (enable_vm_map) {
      std::memcpy(to_addr, from_addr, size);
    } else if (noncoll_mem_.has(from_addr)) {
      get_copy_noncoll(from_addr, to_addr, size);
    } else {
      get_copy_coll(from_addr, to_addr, size);
    }
  }
  void get_copy_coll(std::byte* from_addr, std::byte* to_addr, std::size_t size) {
    // ...
    coll_mem& cm = cm_manager_.get(from_addr);

    for_each_seg_blk<BlockSize>(cm, from_addr, size,
        [&](std::byte* seg_addr, std::size_t seg_size, std::size_t pm_offset) {
          const common::virtual_mem& vm = cm.home_vm();
          std::byte* seg_addr_b = std::max(from_addr, seg_addr);
          std::byte* seg_addr_e = std::min(seg_addr + seg_size, from_addr + size);
          std::size_t seg_offset = seg_addr_b - seg_addr;
          std::byte* from_addr_ = reinterpret_cast<std::byte*>(vm.addr()) + pm_offset + seg_offset;
          std::byte* to_addr_ = to_addr + (seg_addr_b - from_addr);
          std::memcpy(to_addr_, from_addr_, seg_addr_e - seg_addr_b);
        },
        [&](std::byte* blk_addr, std::byte* req_addr_b, std::byte* req_addr_e,
            /* owner, pm_offset */) {
          cache_manager_.get_copy_blk(blk_addr, req_addr_b, req_addr_e, to_addr + (req_addr_b - from_addr));
        });
  }
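
  // --- Illustration (added; not part of the original listing) ---------------
  // A standalone sketch of the address arithmetic in the home-segment lambda of
  // get_copy_coll above: the requested range is clamped to one segment, and the
  // clamped piece is copied from the segment's home mapping into the matching
  // offset of the destination buffer. All names and sizes here are hypothetical.
  //
  //   #include <algorithm>
  //   #include <cstddef>
  //   #include <cstdio>
  //   #include <cstring>
  //
  //   int main() {
  //     std::byte home[64];                       // stands in for vm.addr() + pm_offset
  //     std::byte space[256];                     // stands in for the mapped virtual address space
  //     std::byte dst[256];
  //     for (int i = 0; i < 64; i++) home[i] = std::byte(i);
  //
  //     std::byte* seg_addr  = space + 128;       // this segment covers [128, 192) of the space
  //     std::size_t seg_size = 64;
  //     std::byte* from_addr = space + 100;       // request: [100, 230) of the space
  //     std::size_t size     = 130;
  //
  //     std::byte* seg_addr_b  = std::max(from_addr, seg_addr);                  // space + 128
  //     std::byte* seg_addr_e  = std::min(seg_addr + seg_size, from_addr + size); // space + 192
  //     std::size_t seg_offset = seg_addr_b - seg_addr;                          // 0
  //     std::memcpy(dst + (seg_addr_b - from_addr),                              // dst + 28
  //                 home + seg_offset,
  //                 seg_addr_e - seg_addr_b);                                    // 64 bytes
  //     std::printf("copied [%td, %td) of the request\n",
  //                 seg_addr_b - from_addr, seg_addr_e - from_addr);             // [28, 92)
  //   }
  // ---------------------------------------------------------------------------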
  void get_copy_noncoll(std::byte* from_addr, std::byte* to_addr, std::size_t size) {
    // ...
    auto target_rank = noncoll_mem_.get_owner(from_addr);

    // ...
    if (common::topology::is_locally_accessible(target_rank)) {
      std::memcpy(to_addr, from_addr, size);
      return;
    }

    for_each_block<BlockSize>(from_addr, size, [&](std::byte* blk_addr,
                                                   std::byte* req_addr_b,
                                                   std::byte* req_addr_e) {
      cache_manager_.get_copy_blk(blk_addr, req_addr_b, req_addr_e, to_addr + (req_addr_b - from_addr));
    });
  }
  void put_copy_impl(const std::byte* from_addr, std::byte* to_addr, std::size_t size) {
    if constexpr (enable_vm_map) {
      std::memcpy(to_addr, from_addr, size);
    } else if (noncoll_mem_.has(to_addr)) {
      put_copy_noncoll(from_addr, to_addr, size);
    } else {
      put_copy_coll(from_addr, to_addr, size);
    }
  }
  void put_copy_coll(const std::byte* from_addr, std::byte* to_addr, std::size_t size) {
    // ...
    coll_mem& cm = cm_manager_.get(to_addr);

    for_each_seg_blk<BlockSize>(cm, to_addr, size,
        [&](std::byte* seg_addr, std::size_t seg_size, std::size_t pm_offset) {
          const common::virtual_mem& vm = cm.home_vm();
          std::byte* seg_addr_b = std::max(to_addr, seg_addr);
          std::byte* seg_addr_e = std::min(seg_addr + seg_size, to_addr + size);
          std::size_t seg_offset = seg_addr_b - seg_addr;
          const std::byte* from_addr_ = from_addr + (seg_addr_b - to_addr);
          std::byte* to_addr_ = reinterpret_cast<std::byte*>(vm.addr()) + pm_offset + seg_offset;
          std::memcpy(to_addr_, from_addr_, seg_addr_e - seg_addr_b);
        },
        [&](std::byte* blk_addr, std::byte* req_addr_b, std::byte* req_addr_e,
            /* owner, pm_offset */) {
          cache_manager_.put_copy_blk(blk_addr, req_addr_b, req_addr_e, from_addr + (req_addr_b - to_addr));
        });
  }
  void put_copy_noncoll(const std::byte* from_addr, std::byte* to_addr, std::size_t size) {
    // ...
    auto target_rank = noncoll_mem_.get_owner(to_addr);

    // ...
    if (common::topology::is_locally_accessible(target_rank)) {
      std::memcpy(to_addr, from_addr, size);
      return;
    }

    for_each_block<BlockSize>(to_addr, size, [&](std::byte* blk_addr,
                                                 std::byte* req_addr_b,
                                                 std::byte* req_addr_e) {
      cache_manager_.put_copy_blk(blk_addr, req_addr_b, req_addr_e, from_addr + (req_addr_b - to_addr));
    });
  }
  template <block_size_t BS>
  using default_mem_mapper = mem_mapper::ITYR_ORI_DEFAULT_MEM_MAPPER<BS>;

  coll_mem_manager         cm_manager_;
  noncoll_mem              noncoll_mem_;
  home_manager<BlockSize>  home_manager_;
  cache_manager<BlockSize> cache_manager_;
};
template <block_size_t BlockSize>
class core_nocache {
public:
  core_nocache(std::size_t, std::size_t) {}

  static constexpr block_size_t block_size = BlockSize;

  void* malloc_coll(std::size_t size) { /* ... */ }

  template <template <block_size_t> typename MemMapper, typename... MemMapperArgs>
  void* malloc_coll(std::size_t size, MemMapperArgs&&... mmargs) {
    if (size == 0) {
      common::die("Memory allocation size cannot be 0");
    }

    auto mmapper = std::make_unique<MemMapper<BlockSize>>(
        size, /* ... */
        std::forward<MemMapperArgs>(mmargs)...);
    coll_mem& cm = cm_manager_.create(size, std::move(mmapper));

    void* addr = cm.vm().addr();

    common::verbose("Allocate collective memory [%p, %p) (%ld bytes) (win=%p)",
                    addr, reinterpret_cast<std::byte*>(addr) + size, size, &cm.win());

    return addr;
  }

  void* malloc(std::size_t size) {
    // ...
    void* addr = noncoll_mem_.allocate(size);

    common::verbose<2>("Allocate noncollective memory [%p, %p) (%ld bytes)",
                       addr, reinterpret_cast<std::byte*>(addr) + size, size);

    return addr;
  }

  void free_coll(void* addr) {
    if (!addr) {
      common::die("Null pointer was passed to free_coll()");
    }
    // ...
    coll_mem& cm = cm_manager_.get(addr);

    common::verbose("Deallocate collective memory [%p, %p) (%ld bytes) (win=%p)",
                    addr, reinterpret_cast<std::byte*>(addr) + cm.size(), cm.size(), &cm.win());

    cm_manager_.destroy(cm);
  }

  void free(void* addr, std::size_t size) {
    // ...
    auto target_rank = noncoll_mem_.get_owner(addr);

    if (target_rank == common::topology::my_rank()) {
      common::verbose<2>("Deallocate noncollective memory [%p, %p) (%ld bytes) locally",
                         addr, reinterpret_cast<std::byte*>(addr) + size, size);

      noncoll_mem_.local_deallocate(addr, size);
    } else {
      noncoll_mem_.remote_deallocate(addr, size, target_rank);

      common::verbose<2>("Deallocate noncollective memory [%p, %p) (%ld bytes) remotely (rank=%d)",
                         addr, reinterpret_cast<std::byte*>(addr) + size, size, target_rank);
    }
  }

  void get(const void* from_addr, void* to_addr, std::size_t size) {
    // ...
    std::byte* from_addr_ = reinterpret_cast<std::byte*>(const_cast<void*>(from_addr));
    get_impl(from_addr_, reinterpret_cast<std::byte*>(to_addr), size);
  }

  void put(const void* from_addr, void* to_addr, std::size_t size) {
    // ...
    std::byte* to_addr_ = reinterpret_cast<std::byte*>(to_addr);
    put_impl(reinterpret_cast<const std::byte*>(from_addr), to_addr_, size);
  }

  template <typename Mode>
  bool checkout_nb(void*, std::size_t, Mode) {
    common::die("core::checkout/checkin is disabled");
  }

  template <typename Mode>
  void checkout(void*, std::size_t, Mode) {
    common::die("core::checkout/checkin is disabled");
  }

  void checkout_complete() {
    common::die("core::checkout/checkin is disabled");
  }

  template <typename Mode>
  void checkin(void*, std::size_t, Mode) {
    common::die("core::checkout/checkin is disabled");
  }

  // ... (release/acquire, readonly, poll, and profiling members elided)

private:
  void get_impl(std::byte* from_addr, std::byte* to_addr, std::size_t size) {
    if (noncoll_mem_.has(from_addr)) {
      get_noncoll(from_addr, to_addr, size);
    } else {
      get_coll(from_addr, to_addr, size);
    }
  }

  void get_coll(std::byte* from_addr, std::byte* to_addr, std::size_t size) {
    coll_mem& cm = cm_manager_.get(from_addr);

    bool fetching = false;

    for_each_seg_blk<BlockSize>(cm, from_addr, size,
        [&](std::byte* seg_addr, std::size_t seg_size, std::size_t pm_offset) {
          const common::virtual_mem& vm = cm.home_vm();
          std::byte* seg_addr_b = std::max(from_addr, seg_addr);
          std::byte* seg_addr_e = std::min(seg_addr + seg_size, from_addr + size);
          std::size_t seg_offset = seg_addr_b - seg_addr;
          std::byte* from_addr_ = reinterpret_cast<std::byte*>(vm.addr()) + pm_offset + seg_offset;
          std::byte* to_addr_ = to_addr + (seg_addr_b - from_addr);
          std::memcpy(to_addr_, from_addr_, seg_addr_e - seg_addr_b);
        },
        [&](std::byte* blk_addr, std::byte* req_addr_b, std::byte* req_addr_e,
            /* owner, pm_offset */) {
          // ...
          common::rma::get_nb(/* ... */
                              pm_offset + (req_addr_b - blk_addr));
          fetching = true;
        });

    // ...
    if (fetching) {
      common::rma::flush(cm.win());
    }
  }

  void get_noncoll(std::byte* from_addr, std::byte* to_addr, std::size_t size) {
    // ...
    auto target_rank = noncoll_mem_.get_owner(from_addr);

    // ...
    if (common::topology::is_locally_accessible(target_rank)) {
      std::memcpy(to_addr, from_addr, size);
      return;
    }

    for_each_block<BlockSize>(from_addr, size, [&](std::byte* blk_addr,
                                                   std::byte* req_addr_b,
                                                   std::byte* req_addr_e) {
      // ...
      common::rma::get_nb(/* ... */
                          noncoll_mem_.win(), target_rank,
                          noncoll_mem_.get_disp(blk_addr) + (req_addr_b - blk_addr));
    });

    // ...
    common::rma::flush(noncoll_mem_.win());
  }

  void put_impl(const std::byte* from_addr, std::byte* to_addr, std::size_t size) {
    if (noncoll_mem_.has(to_addr)) {
      put_noncoll(from_addr, to_addr, size);
    } else {
      put_coll(from_addr, to_addr, size);
    }
  }

  void put_coll(const std::byte* from_addr, std::byte* to_addr, std::size_t size) {
    coll_mem& cm = cm_manager_.get(to_addr);

    bool putting = false;

    for_each_seg_blk<BlockSize>(cm, to_addr, size,
        [&](std::byte* seg_addr, std::size_t seg_size, std::size_t pm_offset) {
          const common::virtual_mem& vm = cm.home_vm();
          std::byte* seg_addr_b = std::max(to_addr, seg_addr);
          std::byte* seg_addr_e = std::min(seg_addr + seg_size, to_addr + size);
          std::size_t seg_offset = seg_addr_b - seg_addr;
          const std::byte* from_addr_ = from_addr + (seg_addr_b - to_addr);
          std::byte* to_addr_ = reinterpret_cast<std::byte*>(vm.addr()) + pm_offset + seg_offset;
          std::memcpy(to_addr_, from_addr_, seg_addr_e - seg_addr_b);
        },
        [&](std::byte* blk_addr, std::byte* req_addr_b, std::byte* req_addr_e,
            /* owner, pm_offset */) {
          // ...
          common::rma::put_nb(/* ... */
                              pm_offset + (req_addr_b - blk_addr));
          putting = true;
        });

    // ...
    if (putting) {
      common::rma::flush(cm.win());
    }
  }

  void put_noncoll(const std::byte* from_addr, std::byte* to_addr, std::size_t size) {
    // ...
    auto target_rank = noncoll_mem_.get_owner(to_addr);

    // ...
    if (common::topology::is_locally_accessible(target_rank)) {
      std::memcpy(to_addr, from_addr, size);
      return;
    }

    for_each_block<BlockSize>(to_addr, size, [&](std::byte* blk_addr,
                                                 std::byte* req_addr_b,
                                                 std::byte* req_addr_e) {
      // ...
      common::rma::put_nb(/* ... */
                          noncoll_mem_.win(), target_rank,
                          noncoll_mem_.get_disp(blk_addr) + (req_addr_b - blk_addr));
    });

    // ...
    common::rma::flush(noncoll_mem_.win());
  }

  template <block_size_t BS>
  using default_mem_mapper = mem_mapper::ITYR_ORI_DEFAULT_MEM_MAPPER<BS>;

  coll_mem_manager cm_manager_;
  noncoll_mem      noncoll_mem_;
};
template <block_size_t BlockSize>
class core_serial {
public:
  core_serial(std::size_t, std::size_t) {}

  // ...

  template <template <block_size_t> typename MemMapper, typename... MemMapperArgs>
  void* malloc_coll(std::size_t size, MemMapperArgs&&...) { /* ... */ }

  // ...

  void free(void* addr, std::size_t) { /* ... */ }

  void get(const void* from_addr, void* to_addr, std::size_t size) {
    std::memcpy(to_addr, from_addr, size);
  }

  void put(const void* from_addr, void* to_addr, std::size_t size) {
    std::memcpy(to_addr, from_addr, size);
  }

  template <typename Mode>
  bool checkout_nb(void*, std::size_t, Mode) { /* ... */ }

  template <typename Mode>
  void checkout(void*, std::size_t, Mode) { /* ... */ }

  // ...

  template <typename Mode>
  void checkin(void*, std::size_t, Mode) { /* ... */ }

  // ... (remaining members elided)
};

template <block_size_t BlockSize>
using core = ITYR_CONCAT(core_, ITYR_ORI_CORE)<BlockSize>;
ITYR_TEST_CASE("[ityr::ori::core] malloc/free with block policy") {
  // ...
  for (int i = 1; i < n; i++) {
    // ...
  }
  // ...
  for (int i = 1; i < n; i++) {
    // ...
  }
  for (int i = 1; i < n; i++) {
    c.free_coll(ptrs[i]);
  }
  // ...
}

ITYR_TEST_CASE("[ityr::ori::core] malloc/free with cyclic policy") {
  // ...
  core<bs> c(16 * bs, bs / 4);
  // ...
  for (int i = 1; i < n; i++) {
    // ...
  }
  // ...
  for (int i = 1; i < n; i++) {
    ptrs[i] = c.malloc_coll<mem_mapper::cyclic>(i * 27438, bs * i);
  }
  for (int i = 1; i < n; i++) {
    c.free_coll(ptrs[i]);
  }
  // ...
}

ITYR_TEST_CASE("[ityr::ori::core] malloc and free (noncollective)") {
  common::runtime_options common_opts;
  runtime_options opts;
  common::singleton_initializer<common::topology::instance> topo;
  common::singleton_initializer<common::rma::instance> rma;
  // ...
  core<bs> c(16 * bs, bs / 4);

  constexpr int n = 10;
  // ...
  for (int i = 0; i < n; i++) {
    void* p = c.malloc(std::size_t(1) << i);
    c.free(p, std::size_t(1) << i);
  }
  // ...
  for (int i = 0; i < n; i++) {
    ptrs[i] = c.malloc(std::size_t(1) << i);
  }
  // ...
  for (int i = 0; i < n; i++) {
    c.free(ptrs[i], std::size_t(1) << i);
  }
  // ...
  for (int i = 0; i < n; i++) {
    ptrs_send[i] = c.malloc(std::size_t(1) << i);
  }
  // ...
  for (int i = 0; i < n; i++) {
    c.free(ptrs_recv[i], std::size_t(1) << i);
  }
  // ...
}

ITYR_TEST_CASE("[ityr::ori::core] get/put") {
  common::runtime_options common_opts;
  runtime_options opts;
  common::singleton_initializer<common::topology::instance> topo;
  common::singleton_initializer<common::rma::instance> rma;
  // ...
  core<bs> c(n_cb * bs, bs / 4);
  // ...
  std::size_t n = n_cb * bs / sizeof(std::size_t);
  // ...
  ps[0] = reinterpret_cast<std::size_t*>(c.malloc_coll<mem_mapper::block >(n * sizeof(std::size_t)));
  ps[1] = reinterpret_cast<std::size_t*>(c.malloc_coll<mem_mapper::cyclic>(n * sizeof(std::size_t)));

  std::size_t* buf = new std::size_t[n + 2];
  // ...
  for (std::size_t i = 0; i < n; i++) {
    // ...
  }
  // ...
  c.put(buf, p, n * sizeof(std::size_t));
  // ...
  std::size_t special = 417;
  buf[0] = buf[n + 1] = special;
  // ...
  c.get(p, buf + 1, n * sizeof(std::size_t));
  // ...
  for (std::size_t i = 0; i < n; i++) {
    // ...
  }
  // ...
  std::size_t ib = n / 5 * 2;
  std::size_t ie = n / 5 * 4;
  std::size_t s = ie - ib;

  std::size_t special = 417;
  buf[0] = buf[s + 1] = special;
  // ...
  c.get(p + ib, buf + 1, s * sizeof(std::size_t));
  // ...
  for (std::size_t i = 0; i < s; i++) {
    // ...
  }
  // ...
  for (std::size_t i = 0; i < n; i++) {
    std::size_t special = 417;
    buf[0] = buf[2] = special;
    // ...
    c.get(p + i, &buf[1], sizeof(std::size_t));
    // ...
  }
  // ...
}

ITYR_TEST_CASE("[ityr::ori::core] checkout/checkin (small, aligned)") {
  common::runtime_options common_opts;
  runtime_options opts;
  common::singleton_initializer<common::topology::instance> topo;
  common::singleton_initializer<common::rma::instance> rma;
  // ...
  core<bs> c(n_cb * bs, bs / 4);
  // ...
  ps[0] = reinterpret_cast<uint8_t*>(c.malloc_coll<mem_mapper::block >(n));
  ps[1] = reinterpret_cast<uint8_t*>(c.malloc_coll<mem_mapper::cyclic>(n));
  // ...
  uint8_t* home_ptr = reinterpret_cast<uint8_t*>(c.get_local_mem(p));
  for (std::size_t i = 0; i < bs; i++) {
    // ...
  }
  // ...
  for (int i = 0; i < n; i++) {
    // ...
  }
  // ...
  for (int iter = 0; iter < n_ranks; iter++) {
    // ...
    for (int i = 0; i < n; i++) {
      // ...
    }
    // ...
  }
  // ...
  for (int i = 0; i < n; i++) {
    // ...
  }
  // ...
  for (int i = 0; i < s; i++) {
    // ...
  }
  // ...
}

ITYR_TEST_CASE("[ityr::ori::core] checkout/checkin (large, not aligned)") {
  common::runtime_options common_opts;
  runtime_options opts;
  common::singleton_initializer<common::topology::instance> topo;
  common::singleton_initializer<common::rma::instance> rma;
  // ...
  core<bs> c(n_cb * bs, bs / 4);
  // ...
  std::size_t n = 10 * n_cb * bs / sizeof(std::size_t);
  // ...
  ps[0] = reinterpret_cast<std::size_t*>(c.malloc_coll<mem_mapper::block >(n * sizeof(std::size_t)));
  ps[1] = reinterpret_cast<std::size_t*>(c.malloc_coll<mem_mapper::cyclic>(n * sizeof(std::size_t)));

  std::size_t max_checkout_size = (16 - 2) * bs / sizeof(std::size_t);
  // ...
  for (std::size_t i = 0; i < n; i += max_checkout_size) {
    std::size_t m = std::min(max_checkout_size, n - i);
    c.checkout(p + i, m * sizeof(std::size_t), mode::write);
    for (std::size_t j = i; j < i + m; j++) {
      // ...
    }
    c.checkin(p + i, m * sizeof(std::size_t), mode::write);
  }
  // ...
  for (std::size_t i = 0; i < n; i += max_checkout_size) {
    std::size_t m = std::min(max_checkout_size, n - i);
    c.checkout(p + i, m * sizeof(std::size_t), mode::read);
    for (std::size_t j = i; j < i + m; j++) {
      // ...
    }
    c.checkin(p + i, m * sizeof(std::size_t), mode::read);
  }
  // ...
  std::size_t ib = n / 5 * 2;
  std::size_t ie = n / 5 * 4;
  std::size_t s = ie - ib;
  // ...
  for (std::size_t i = 0; i < s; i += max_checkout_size) {
    std::size_t m = std::min(max_checkout_size, s - i);
    c.checkout(p + ib + i, m * sizeof(std::size_t), mode::read);
    for (std::size_t j = ib + i; j < ib + i + m; j++) {
      // ...
    }
    c.checkin(p + ib + i, m * sizeof(std::size_t), mode::read);
  }
  // ...
  std::size_t stride = 48;
  // ...
  for (std::size_t i = my_rank * stride; i < n; i += n_ranks * stride) {
    std::size_t s = std::min(stride, n - i);
    // ...
    for (std::size_t j = i; j < i + s; j++) {
      // ...
    }
    // ...
  }
  // ...
  for (std::size_t i = 0; i < n; i += max_checkout_size) {
    std::size_t m = std::min(max_checkout_size, n - i);
    c.checkout(p + i, m * sizeof(std::size_t), mode::read);
    for (std::size_t j = i; j < i + m; j++) {
      // ...
    }
    c.checkin(p + i, m * sizeof(std::size_t), mode::read);
  }
  // ...
}

ITYR_TEST_CASE("[ityr::ori::core] checkout/checkin (noncontig)") {
  common::runtime_options common_opts;
  runtime_options opts;
  common::singleton_initializer<common::topology::instance> topo;
  common::singleton_initializer<common::rma::instance> rma;
  // ...
  core<bs> c(n_cb * bs, bs / 4);
  // ...
  std::size_t n = 2 * n_cb * bs / sizeof(std::size_t);
  // ...
  ps[0] = reinterpret_cast<std::size_t*>(c.malloc_coll<mem_mapper::block >(n * sizeof(std::size_t)));
  ps[1] = reinterpret_cast<std::size_t*>(c.malloc_coll<mem_mapper::cyclic>(n * sizeof(std::size_t)));
  // ...
  c.checkout(p + i, sizeof(std::size_t), mode::write);
  // ...
  c.checkin(p + i, sizeof(std::size_t), mode::write);
  // ...
  c.checkout(p + i, sizeof(std::size_t), mode::write);
  // ...
  c.checkin(p + i, sizeof(std::size_t), mode::write);
  // ...
  c.checkout(p + i, sizeof(std::size_t), mode::read);
  // ...
  c.checkin(p + i, sizeof(std::size_t), mode::read);
  // ...
  c.checkout(p + i, sizeof(std::size_t), mode::read);
  // ...
  c.checkin(p + i, sizeof(std::size_t), mode::read);
  // ...
}

ITYR_TEST_CASE("[ityr::ori::core] checkout/checkin (noncollective)") {
  common::runtime_options common_opts;
  runtime_options opts;
  common::singleton_initializer<common::topology::instance> topo;
  common::singleton_initializer<common::rma::instance> rma;
  // ...
  core<bs> c(n_cb * bs, bs / 4);
  // ...
  int n_alloc_iter = 10;
  // ...
  root_node->next = nullptr;
  // ...
  node_t* node = root_node;
  for (int i = 0; i < niter; i++) {
    for (int j = 0; j < n_alloc_iter; j++) {
      // ...
      c.checkout(&node->next, sizeof(node->next), mode::write);
      node->next = new_node;
      c.checkin(&node->next, sizeof(node->next), mode::write);
      // ...
      c.checkout(&node->value, sizeof(node->value), mode::read);
      int val = node->value;
      c.checkin(&node->value, sizeof(node->value), mode::read);
      // ...
      new_node->next = nullptr;
      new_node->value = val + 1;
      // ...
    }
    // ...
  }
  // ...
  while (node != nullptr) {
    // ...
    node_t* prev_node = node;
    // ...
    std::destroy_at(prev_node);
    c.free(prev_node, sizeof(node_t));
    // ...
  }
  // ...
}

ITYR_TEST_CASE("[ityr::ori::core] release/acquire fence") {
  common::runtime_options common_opts;
  runtime_options opts;
  common::singleton_initializer<common::topology::instance> topo;
  common::singleton_initializer<common::rma::instance> rma;
  // ...
  core<bs> c(n_cb * bs, bs / 4);
  // ...
  int* p = reinterpret_cast<int*>(c.malloc_coll(sizeof(int)));
  // ...
  for (int i = 0; i < n; i++) {
    // ...
    rh = c.release_lazy();
    // ...
  }
  // ...
}