9 #if ITYR_ALLOCATOR_USE_BOOST 
   10 #include <boost/container/pmr/memory_resource.hpp> 
   11 #include <boost/container/pmr/unsynchronized_pool_resource.hpp> 
   12 #include <boost/container/pmr/pool_options.hpp> 
   13 namespace ityr::common { 
namespace pmr = boost::container::pmr; }
 
   15 #include <memory_resource> 
   40       freelist_(reinterpret_cast<uintptr_t>(base_addr), max_size) {}
 
   42   void* 
do_allocate(std::size_t bytes, std::size_t alignment)
 override {
 
   44       die(
"[ityr::common::allocator] Requests for mpi_win_resource must be page-aligned");
 
   50     auto s = freelist_.
get(real_bytes, alignment);
 
   52       die(
"[ityr::common::allocator] Could not allocate memory for malloc_local()");
 
   55     void* ret = 
reinterpret_cast<void*
>(*s);
 
   58       MPI_Win_attach(win_, ret, real_bytes);
 
   64   void do_deallocate(
void* p, std::size_t bytes, std::size_t alignment)
 override {
 
   66       die(
"[ityr::common::allocator] Requests for mpi_win_resource must be page-aligned");
 
   73       MPI_Win_detach(win_, p);
 
   75       if (madvise(p, real_bytes, MADV_REMOVE) == -1) {
 
   77         die(
"[ityr::common::allocator] madvise() failed");
 
   81     freelist_.
add(
reinterpret_cast<uintptr_t
>(p), real_bytes);
 
   84   bool do_is_equal(
const pmr::memory_resource& other) 
const noexcept 
override {
 
   85     return this == &other;
 
   97     : upstream_mr_(upstream_mr),
 
  102   void* 
do_allocate(std::size_t bytes, std::size_t alignment)
 override {
 
  103     if (bytes >= block_size_) {
 
  104       return upstream_mr_->allocate(bytes, 
std::max(alignment, block_size_));
 
  107     auto s = freelist_.
get(bytes, alignment);
 
  108     if (!s.has_value()) {
 
  109       void* new_block = upstream_mr_->allocate(block_size_, block_size_);
 
  110       freelist_.
add(
reinterpret_cast<uintptr_t
>(new_block), block_size_);
 
  111       s = freelist_.
get(bytes, alignment);
 
  115     return reinterpret_cast<void*
>(*s);
 
  118   void do_deallocate(
void* p, std::size_t bytes, std::size_t alignment)
 override {
 
  119     if (bytes >= block_size_) {
 
  120       upstream_mr_->deallocate(p, bytes, 
std::max(alignment, block_size_));
 
  124     freelist_.
add(
reinterpret_cast<uintptr_t
>(p), bytes);
 
  129   bool do_is_equal(
const pmr::memory_resource& other) 
const noexcept 
override {
 
  130     return this == &other;
 
  134   pmr::memory_resource* upstream_mr_;
 
  135   std::size_t           block_size_;
 
  142     : local_max_size_(calc_local_max_size(local_max_size)),
 
  143       global_max_size_(local_max_size_ * topology::
n_ranks()),
 
  146       local_base_addr_(reinterpret_cast<std::byte*>(vm_.addr()) + local_max_size_ * topology::
my_rank()),
 
  148       win_mr_(local_base_addr_, local_max_size_, 
win()),
 
  150       std_pool_mr_(my_std_pool_options(), &block_mr_),
 
  153       collect_threshold_(std::size_t(16) * 1024),
 
  154       collect_threshold_max_(local_max_size_ * 8 / 10) {}
 
  156   MPI_Win 
win()
 const { 
return win_.
win(); }
 
  158   bool has(
const void* p)
 const {
 
  159     return vm_.
addr() <= p && p < reinterpret_cast<std::byte*>(vm_.
addr()) + global_max_size_;
 
  163     return (
reinterpret_cast<uintptr_t
>(p) - 
reinterpret_cast<uintptr_t
>(vm_.
addr())) / local_max_size_;
 
  168       return reinterpret_cast<uintptr_t
>(p);
 
  170       return (
reinterpret_cast<uintptr_t
>(p) - 
reinterpret_cast<uintptr_t
>(vm_.
addr())) % local_max_size_;
 
  174   void* 
do_allocate(std::size_t bytes, std::size_t alignment = 
alignof(max_align_t))
 override {
 
  177     std::size_t pad_bytes = 
round_up_pow2(
sizeof(header), alignment);
 
  178     std::size_t real_bytes = bytes + pad_bytes;
 
  180     if (allocated_size_ >= collect_threshold_) {
 
  182       collect_threshold_ = allocated_size_ * 2;
 
  183       if (collect_threshold_ > collect_threshold_max_) {
 
  184         collect_threshold_ = (collect_threshold_max_ + allocated_size_) / 2;
 
  188     std::byte* p = 
reinterpret_cast<std::byte*
>(std_pool_mr_.allocate(real_bytes, alignment));
 
  189     std::byte* ret = p + pad_bytes;
 
  194     header* h = 
new (p) header {
 
  195       .prev = allocated_list_end_, .next = 
nullptr,
 
  196       .size = real_bytes, .alignment = alignment, .freed = 0};
 
  197     ITYR_CHECK(allocated_list_end_->next == 
nullptr);
 
  198     allocated_list_end_->next = h;
 
  199     allocated_list_end_ = h;
 
  201     allocated_size_ += real_bytes;
 
  206   void do_deallocate(
void* p, std::size_t bytes, std::size_t alignment = 
alignof(max_align_t))
 override {
 
  215   bool do_is_equal(
const pmr::memory_resource& other) 
const noexcept 
override {
 
  216     return this == &other;
 
  219   void local_deallocate(
void* p, std::size_t bytes, std::size_t alignment = 
alignof(max_align_t)) {
 
  224     std::size_t pad_bytes = 
round_up_pow2(
sizeof(header), alignment);
 
  225     std::size_t real_bytes = bytes + pad_bytes;
 
  227     header* h = 
reinterpret_cast<header*
>(
reinterpret_cast<std::byte*
>(p) - pad_bytes);
 
  232     local_deallocate_impl(h, real_bytes, alignment);
 
  235   void remote_deallocate(
void* p, std::size_t bytes [[maybe_unused]], 
int target_rank, std::size_t alignment = 
alignof(max_align_t)) {
 
  241     static constexpr 
int one = 1;
 
  245     static int count = 0;
 
  247     if (count >= max_unflushed_free_objs_) {
 
  256     header *h = allocated_list_.next;
 
  258       if (h->freed.load(std::memory_order_acquire)) {
 
  259         header* h_next = h->next;
 
  260         local_deallocate_impl(h, h->size, h->alignment);
 
  275     std::size_t pad_bytes = 
round_up_pow2(
sizeof(header), alignment);
 
  276     header* h = 
reinterpret_cast<header*
>(
reinterpret_cast<std::byte*
>(p) - pad_bytes);
 
  278     if (h->freed.load(std::memory_order_acquire)) {
 
  279       local_deallocate_impl(h, h->size, h->alignment);
 
  287     return allocated_list_.next == 
nullptr;
 
  291   static std::string allocator_shmem_name(
int inter_rank) {
 
  292     static int count = 0;
 
  293     std::stringstream ss;
 
  294     ss << 
"/ityr_allocator_" << count++ << 
"_" << 
inter_rank;
 
  298   std::size_t calc_local_max_size(std::size_t param)
 const {
 
  307   physical_mem init_pm()
 const {
 
  324       auto offset = local_max_size_ * target_rank;
 
  325       void* begin_addr = 
reinterpret_cast<std::byte*
>(vm_.
addr()) + offset;
 
  326       pm.map_to_vm(begin_addr, local_max_size_, offset);
 
  332   mpi_win_manager<std::byte> create_win()
 const {
 
  336       auto local_base_addr = 
reinterpret_cast<std::byte*
>(vm_.
addr()) + local_max_size_ * 
topology::my_rank();
 
  343   pmr::pool_options my_std_pool_options()
 const {
 
  344     pmr::pool_options opts;
 
  345     opts.max_blocks_per_chunk = std::size_t(16) * 1024 * 1024 * 1024;
 
  350     header*          prev      = 
nullptr;
 
  351     header*          next      = 
nullptr;
 
  352     std::size_t      
size      = 0;
 
  353     std::size_t      alignment = 0;
 
  354     std::atomic<int> freed     = 0;
 
  357   void remove_header_from_list(header* h) {
 
  359     h->prev->next = h->next;
 
  362       h->next->prev = h->prev;
 
  365       allocated_list_end_ = h->prev;
 
  369   std::size_t get_header_disp(
const void* p, std::size_t alignment)
 const {
 
  370     std::size_t pad_bytes = 
round_up_pow2(
sizeof(header), alignment);
 
  371     auto h = 
reinterpret_cast<const header*
>(
reinterpret_cast<const std::byte*
>(p) - pad_bytes);
 
  372     const void* flag_addr = &h->freed;
 
  377   void local_deallocate_impl(header* h, std::size_t 
size, std::size_t alignment) {
 
  378     remove_header_from_list(h);
 
  380     std_pool_mr_.deallocate(h, 
size, alignment);
 
  383     allocated_size_ -= 
size;
 
  386   std::size_t                       local_max_size_;
 
  387   std::size_t                       global_max_size_;
 
  390   void*                             local_base_addr_;
 
  391   mpi_win_manager<std::byte>        win_;
 
  392   mpi_win_resource                  win_mr_;
 
  393   block_resource                    block_mr_;
 
  394   pmr::unsynchronized_pool_resource std_pool_mr_;
 
  395   int                               max_unflushed_free_objs_;
 
  396   header                            allocated_list_;
 
  397   header*                           allocated_list_end_ = &allocated_list_;
 
  398   std::size_t                       allocated_size_;
 
  399   std::size_t                       collect_threshold_;
 
  400   std::size_t                       collect_threshold_max_;
 
  403 template <
typename T>
 
  406     std::memcpy(origin_p, target_p, 
size * 
sizeof(T));
 
  408     auto target_rank = rmr.
get_owner(target_p);
 
  413 template <
typename T>
 
  418     auto target_rank = rmr.
get_owner(target_p);
 
  419     return mpi_get_value<T>(target_rank, rmr.
get_disp(target_p), rmr.
win());
 
  423 template <
typename T>
 
  426     std::memcpy(target_p, origin_p, 
size * 
sizeof(T));
 
  428     auto target_rank = rmr.
get_owner(target_p);
 
  433 template <
typename T>
 
  438     auto target_rank = rmr.
get_owner(target_p);
 
  443 template <
typename T>
 
  445   auto target_rank = rmr.
get_owner(target_p);
 
  452 ITYR_TEST_CASE(
"[ityr::common::allocator] basic test") {
 
  453   runtime_options opts;
 
  454   singleton_initializer<topology::instance> topo;
 
  456   remotable_resource allocator(std::size_t(16) * 1024 * 1024);
 
  459     std::vector<std::size_t> sizes = {1, 2, 4, 8, 16, 32, 100, 200, 1000, 100000, 1000000};
 
  460     constexpr 
int N = 10;
 
  461     for (
auto size : sizes) {
 
  463       for (
int i = 0; i < N; i++) {
 
  464         ptrs[i] = allocator.allocate(
size);
 
  465         for (std::size_t j = 0; j < 
size; j += 128) {
 
  466           reinterpret_cast<char*
>(ptrs[i])[j] = 0;
 
  469       for (
int i = 0; i < N; i++) {
 
  470         allocator.deallocate(ptrs[i], 
size);
 
  476     std::size_t 
size = 128;
 
  477     void* p = allocator.allocate(
size);
 
  479     for (std::size_t i = 0; i < 
size; i++) {
 
  490         std::vector<uint8_t> buf(
size);
 
  491         mpi_get_nb(buf.data(), 
size, target_rank, allocator.get_disp(addrs[target_rank]), allocator.win());
 
  494         for (std::size_t i = 0; i < 
size; i++) {
 
  502     std::vector<uint8_t> buf(
size);
 
  503     for (std::size_t i = 0; i < 
size; i++) {
 
  508     mpi_put_nb(buf.data(), 
size, target_rank, allocator.get_disp(addrs[target_rank]), allocator.win());
 
  513     for (std::size_t i = 0; i < 
size; i++) {
 
  518       allocator.deallocate(p, 
size);
 
  528         allocator.remote_deallocate(addrs[target_rank], 
size, target_rank);
 
  533         allocator.collect_deallocated();
 
Definition: allocator.hpp:93
 
void * do_allocate(std::size_t bytes, std::size_t alignment) override
Definition: allocator.hpp:102
 
void do_deallocate(void *p, std::size_t bytes, std::size_t alignment) override
Definition: allocator.hpp:118
 
block_resource(pmr::memory_resource *upstream_mr, std::size_t block_size)
Definition: allocator.hpp:95
 
bool do_is_equal(const pmr::memory_resource &other) const noexcept override
Definition: allocator.hpp:129
 
Definition: freelist.hpp:13
 
void add(uintptr_t addr, std::size_t size)
Definition: freelist.hpp:68
 
std::optional< uintptr_t > get(std::size_t size)
Definition: freelist.hpp:18
 
MPI_Win win() const
Definition: mpi_rma.hpp:409
 
Definition: allocator.hpp:34
 
void do_deallocate(void *p, std::size_t bytes, std::size_t alignment) override
Definition: allocator.hpp:64
 
bool do_is_equal(const pmr::memory_resource &other) const noexcept override
Definition: allocator.hpp:84
 
void * do_allocate(std::size_t bytes, std::size_t alignment) override
Definition: allocator.hpp:42
 
mpi_win_resource(void *base_addr, std::size_t max_size, MPI_Win win)
Definition: allocator.hpp:36
 
Definition: allocator.hpp:139
 
bool has(const void *p) const
Definition: allocator.hpp:158
 
void collect_deallocated()
Definition: allocator.hpp:253
 
void local_deallocate(void *p, std::size_t bytes, std::size_t alignment=alignof(max_align_t))
Definition: allocator.hpp:219
 
void remote_deallocate(void *p, std::size_t bytes[[maybe_unused]], int target_rank, std::size_t alignment=alignof(max_align_t))
Definition: allocator.hpp:235
 
void * do_allocate(std::size_t bytes, std::size_t alignment=alignof(max_align_t)) override
Definition: allocator.hpp:174
 
bool is_remotely_freed(void *p, std::size_t alignment=alignof(max_align_t))
Definition: allocator.hpp:272
 
MPI_Win win() const
Definition: allocator.hpp:156
 
bool empty()
Definition: allocator.hpp:286
 
void do_deallocate(void *p, std::size_t bytes, std::size_t alignment=alignof(max_align_t)) override
Definition: allocator.hpp:206
 
topology::rank_t get_owner(const void *p) const
Definition: allocator.hpp:162
 
bool do_is_equal(const pmr::memory_resource &other) const noexcept override
Definition: allocator.hpp:215
 
bool is_locally_accessible(const void *p) const
Definition: allocator.hpp:268
 
remotable_resource(std::size_t local_max_size)
Definition: allocator.hpp:141
 
std::size_t get_disp(const void *p) const
Definition: allocator.hpp:166
 
void * addr() const
Definition: virtual_mem.hpp:46
 
std::size_t size() const
Definition: virtual_mem.hpp:47
 
#define ITYR_ALLOCATOR_USE_DYNAMIC_WIN
 
#define ITYR_SUBCASE(name)
Definition: util.hpp:41
 
#define ITYR_CHECK(cond)
Definition: util.hpp:48
 
ITYR_RMA_IMPL::win win
Definition: rma.hpp:13
 
rank_t inter_my_rank()
Definition: topology.hpp:215
 
rank_t n_ranks()
Definition: topology.hpp:208
 
int rank_t
Definition: topology.hpp:12
 
rank_t inter_rank(rank_t global_rank)
Definition: topology.hpp:219
 
MPI_Comm mpicomm()
Definition: topology.hpp:206
 
rank_t intra_my_rank()
Definition: topology.hpp:211
 
bool is_locally_accessible(rank_t target_global_rank)
Definition: topology.hpp:224
 
MPI_Comm intra_mpicomm()
Definition: topology.hpp:210
 
rank_t intra_n_ranks()
Definition: topology.hpp:212
 
rank_t my_rank()
Definition: topology.hpp:207
 
rank_t intra2global_rank(rank_t intra_rank)
Definition: topology.hpp:221
 
Definition: allocator.hpp:16
 
T round_up_pow2(T x, T alignment)
Definition: util.hpp:142
 
void remote_get(const remotable_resource &rmr, T *origin_p, const T *target_p, std::size_t size)
Definition: allocator.hpp:404
 
void remote_put_value(const remotable_resource &rmr, const T &val, T *target_p)
Definition: allocator.hpp:434
 
bool is_pow2(T x)
Definition: util.hpp:125
 
T remote_faa_value(const remotable_resource &rmr, const T &val, T *target_p)
Definition: allocator.hpp:444
 
T mpi_bcast_value(const T &value, int root_rank, MPI_Comm comm)
Definition: mpi_util.hpp:145
 
void mpi_win_flush_all(MPI_Win win)
Definition: mpi_rma.hpp:31
 
void mpi_get(T *origin, std::size_t count, int target_rank, std::size_t target_disp, MPI_Win win)
Definition: mpi_rma.hpp:69
 
std::size_t get_page_size()
Definition: util.hpp:170
 
void mpi_get_nb(T *origin, std::size_t count, int target_rank, std::size_t target_disp, MPI_Win win)
Definition: mpi_rma.hpp:48
 
void remote_put(const remotable_resource &rmr, const T *origin_p, T *target_p, std::size_t size)
Definition: allocator.hpp:424
 
void mpi_put_nb(const T *origin, std::size_t count, int target_rank, std::size_t target_disp, MPI_Win win)
Definition: mpi_rma.hpp:111
 
T remote_get_value(const remotable_resource &rmr, const T *target_p)
Definition: allocator.hpp:414
 
virtual_mem reserve_same_vm_coll(std::size_t size, std::size_t alignment=alignof(max_align_t))
Definition: virtual_mem.hpp:170
 
void mpi_put(const T *origin, std::size_t count, int target_rank, std::size_t target_disp, MPI_Win win)
Definition: mpi_rma.hpp:132
 
void mpi_win_flush(int target_rank, MPI_Win win)
Definition: mpi_rma.hpp:15
 
constexpr auto size(const span< T > &s) noexcept
Definition: span.hpp:61
 
void mpi_put_value(const T &value, int target_rank, std::size_t target_disp, MPI_Win win)
Definition: mpi_rma.hpp:165
 
void mpi_atomic_put_nb(const T *origin, T *result, int target_rank, std::size_t target_disp, MPI_Win win)
Definition: mpi_rma.hpp:263
 
void mpi_barrier(MPI_Comm comm)
Definition: mpi_util.hpp:42
 
uint64_t next_pow2(uint64_t x)
Definition: util.hpp:102
 
constexpr bool use_dynamic_win
Definition: allocator.hpp:32
 
T mpi_atomic_faa_value(const T &value, int target_rank, std::size_t target_disp, MPI_Win win)
Definition: mpi_rma.hpp:193
 
constexpr block_size_t block_size
Definition: ori.hpp:19
 
monoid< T, max_functor<>, lowest< T > > max
Definition: reducer.hpp:104
 
rank_t my_rank()
Return the rank of the process running the current thread.
Definition: ityr.hpp:99
 
rank_t n_ranks()
Return the total number of processes.
Definition: ityr.hpp:107
 
#define ITYR_PROFILER_RECORD(event,...)
Definition: profiler.hpp:319
 
Definition: options.hpp:141
 
Definition: options.hpp:147
 
Definition: prof_events.hpp:104
 
Definition: prof_events.hpp:119
 
Definition: prof_events.hpp:109
 
Definition: prof_events.hpp:114