class root_resource final : public common::pmr::memory_resource {
public:
  root_resource(void* addr, std::size_t size)
    : addr_(addr),
      size_(size),
      freelist_(reinterpret_cast<uintptr_t>(addr_), size_) {}
  void* do_allocate(std::size_t bytes, std::size_t alignment) override {
    auto s = freelist_.get(bytes, alignment);
    if (!s.has_value()) {
      throw std::bad_alloc();
    }
    return reinterpret_cast<void*>(*s);
  }
  void do_deallocate(void* p, std::size_t bytes, std::size_t alignment [[maybe_unused]]) override {
    freelist_.add(reinterpret_cast<uintptr_t>(p), bytes);
  }
  bool do_is_equal(const common::pmr::memory_resource& other) const noexcept override {
    return this == &other;
  }

private:
  void*            addr_;
  std::size_t      size_;
  common::freelist freelist_;
};
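// Note: root_resource is the upstream arena for the pooled resource below.
// Assuming common::pmr mirrors std::pmr, the relationship is (sketch only):
//
//   root_resource root(base_addr, size);                       // raw RMA segment
//   common::pmr::unsynchronized_pool_resource pool({}, &root); // pooling layer
//   void* p = pool.allocate(64);  // pool refills itself from `root`
//
// `base_addr`/`size` are placeholders; noncoll_mem wires this up with
// root_mr_ and std_pool_mr_ in its constructor.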
class noncoll_mem final : public common::pmr::memory_resource {
public:
  noncoll_mem(std::size_t local_max_size, std::size_t alignment)
    : local_max_size_(common::round_up_pow2(local_max_size, alignment)),
      global_max_size_(local_max_size_ * common::topology::n_ranks()),
      // assumed: the VM range is reserved collectively so that every rank
      // sees all segments at identical virtual addresses
      vm_(common::reserve_same_vm_coll(global_max_size_, local_max_size_)),
      local_base_addr_(reinterpret_cast<std::byte*>(vm_.addr()) + local_max_size_ * common::topology::my_rank()),
      win_(common::rma::create_win(local_base_addr_, local_max_size_)),
      root_mr_(local_base_addr_, local_max_size_ - sizeof(int)),
      std_pool_mr_(my_std_pool_options(), &root_mr_),
      max_unflushed_free_objs_(common::allocator_max_unflushed_free_objs_option::value()),
      allocated_size_(0),
      collect_threshold_(std::size_t(16) * 1024),
      collect_threshold_max_(local_max_size_ * 8 / 10) {
    // Write the remote-free sentinel into the last word of the local segment;
    // this word is excluded from root_mr_ (note the `- sizeof(int)` above).
    *reinterpret_cast<int*>(
        reinterpret_cast<std::byte*>(local_base_addr_) + local_max_size_ - sizeof(int)) = remote_free_flag_value;
  }
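  // Layout note: the reserved VM spans n_ranks() segments of local_max_size_
  // bytes at identical virtual addresses on every rank:
  //
  //   vm_.addr()       +local_max_size_    +2*local_max_size_
  //   |  rank 0 seg    |   rank 1 seg     | ...
  //
  // The final int of each local segment permanently holds
  // remote_free_flag_value and doubles as the registered source buffer for
  // the one-sided puts issued by remote_deallocate().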
  const common::rma::win& win() const { return *win_; }

  bool has(const void* p) const {
    return vm_.addr() <= p && p < reinterpret_cast<std::byte*>(vm_.addr()) + global_max_size_;
  }
  common::topology::rank_t get_owner(const void* p) const {
    return (reinterpret_cast<uintptr_t>(p) -
            reinterpret_cast<uintptr_t>(vm_.addr())) / local_max_size_;
  }
  std::size_t get_disp(const void* p) const {
    return (reinterpret_cast<uintptr_t>(p) -
            reinterpret_cast<uintptr_t>(vm_.addr())) % local_max_size_;
  }
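  // Example with the arithmetic above (assumed numbers): for
  // local_max_size_ = 0x100000 (1 MiB) and vm_.addr() = 0x10000000, a pointer
  // p = 0x10250000 gives get_owner(p) = 0x250000 / 0x100000 = 2 and
  // get_disp(p) = 0x250000 % 0x100000 = 0x50000.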
  void* do_allocate(std::size_t bytes, std::size_t alignment = alignof(max_align_t)) override {
    std::size_t pad_bytes  = common::round_up_pow2(sizeof(header), alignment);
    std::size_t real_bytes = bytes + pad_bytes;

    if (allocated_size_ >= collect_threshold_) {
      collect_deallocated();
    }

    std::byte* p;
    try {
      p = reinterpret_cast<std::byte*>(std_pool_mr_.allocate(real_bytes, alignment));
    } catch (std::bad_alloc& e) {
      // The pool is exhausted: reclaim remotely freed objects and retry once.
      collect_deallocated();
      try {
        p = reinterpret_cast<std::byte*>(std_pool_mr_.allocate(real_bytes, alignment));
      } catch (std::bad_alloc& e) {
        common::die("[ityr::ori::noncoll_mem] Could not allocate memory for malloc_local()");
      }
    }

    std::byte* ret = p + pad_bytes;

    // Place the bookkeeping header in the padding and append it to the
    // doubly linked list of live allocations.
    header* h = new (p) header{
      .prev = allocated_list_end_, .next = nullptr,
      .size = real_bytes, .alignment = alignment, .freed = 0};
    ITYR_CHECK(allocated_list_end_->next == nullptr);
    allocated_list_end_->next = h;
    allocated_list_end_ = h;

    allocated_size_ += real_bytes;

    return ret;
  }
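  // Caller-side sketch (hypothetical; assuming common::pmr mirrors the
  // std::pmr::memory_resource interface, whose nonvirtual allocate()/
  // deallocate() forward to do_allocate()/do_deallocate()):
  //
  //   void* obj = noncoll_mr.allocate(bytes, alignment);
  //   ...
  //   noncoll_mr.deallocate(obj, bytes, alignment);  // may be a remote free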
  void do_deallocate(void* p, std::size_t bytes, std::size_t alignment = alignof(max_align_t)) override {
    // Dispatch on ownership (assumed body): locally owned memory is freed
    // directly; memory owned by another rank goes through the remote-free path.
    auto target_rank = get_owner(p);
    if (target_rank == common::topology::my_rank()) {
      local_deallocate(p, bytes, alignment);
    } else {
      remote_deallocate(p, bytes, target_rank, alignment);
    }
  }
  bool do_is_equal(const common::pmr::memory_resource& other) const noexcept override {
    return this == &other;
  }
  void local_deallocate(void* p, std::size_t bytes, std::size_t alignment = alignof(max_align_t)) {
    std::size_t pad_bytes  = common::round_up_pow2(sizeof(header), alignment);
    std::size_t real_bytes = bytes + pad_bytes;

    // Recover the header that do_allocate() placed in front of the payload.
    header* h = reinterpret_cast<header*>(reinterpret_cast<std::byte*>(p) - pad_bytes);

    local_deallocate_impl(h, real_bytes, alignment);
  }
  void remote_deallocate(void* p, std::size_t bytes [[maybe_unused]],
                         int target_rank, std::size_t alignment = alignof(max_align_t)) {
    // The local sentinel word is used as the registered source buffer for a
    // nonblocking one-sided put into the owner's header (assumed call shape).
    int* flag_val_p = reinterpret_cast<int*>(
        reinterpret_cast<std::byte*>(local_base_addr_) + local_max_size_ - sizeof(int));
    common::rma::put_nb(*win_, flag_val_p, 1, *win_, target_rank, get_header_disp(p, alignment));

    // Flush outstanding puts only after enough frees have accumulated.
    static int count = 0;
    count++;
    if (count >= max_unflushed_free_objs_) {
      common::rma::flush(*win_);
      count = 0;
    }
  }
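  // Remote-free protocol: instead of sending a message, the freeing rank
  // writes remote_free_flag_value directly into the `freed` field of the
  // object's header on the owner with a nonblocking one-sided put. The owner
  // observes the flag lazily (collect_deallocated() / is_remotely_freed())
  // and reclaims the object locally. Because flushes are batched, up to
  // max_unflushed_free_objs_ puts may still be in flight at any time.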
  void collect_deallocated() {
    // Reclaim every allocation whose `freed` flag was set by a remote rank.
    header* h = allocated_list_.next;
    while (h) {
      int flag = h->freed.load(std::memory_order_acquire);
      if (flag == remote_free_flag_value) {
        header* h_next = h->next;
        local_deallocate_impl(h, h->size, h->alignment);
        h = h_next;
      } else {
        h = h->next;
      }
    }

    // Adapt the collection threshold to current usage, capped so that
    // collection always triggers before the local arena fills up.
    collect_threshold_ = allocated_size_ * 2;
    if (collect_threshold_ > collect_threshold_max_) {
      collect_threshold_ = (collect_threshold_max_ + allocated_size_) / 2;
    }
  }
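  // Threshold arithmetic example: with local_max_size_ = 1 MiB,
  // collect_threshold_max_ = 0.8 MiB. If allocated_size_ = 0.6 MiB after a
  // collection, doubling would give 1.2 MiB > 0.8 MiB, so the threshold is
  // instead set to (0.8 MiB + 0.6 MiB) / 2 = 0.7 MiB.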
  bool is_locally_accessible(const void* p) const {
    return common::topology::is_locally_accessible(get_owner(p));
  }

  bool is_remotely_freed(void* p, std::size_t alignment = alignof(max_align_t)) {
    std::size_t pad_bytes = common::round_up_pow2(sizeof(header), alignment);
    header* h = reinterpret_cast<header*>(reinterpret_cast<std::byte*>(p) - pad_bytes);
    if (h->freed.load(std::memory_order_acquire)) {
      local_deallocate_impl(h, h->size, h->alignment);
      return true;
    }
    return false;
  }

  bool empty() {
    return allocated_list_.next == nullptr;
  }
private:
  static std::string allocator_shmem_name(int inter_rank) {
    static int count = 0;
    std::stringstream ss;
    ss << "/ityr_noncoll_" << count++ << "_" << inter_rank;
    return ss.str();
  }
  // Physical-memory setup (hypothetical helper name; loop reconstructed and
  // shared-memory segment creation not shown). Each rank's slice of the
  // node-wide physical region is mapped into the reserved VM at its fixed
  // per-rank offset.
  void map_segments_to_vm(common::physical_mem& pm) const {
    for (common::topology::rank_t r = 0; r < common::topology::intra_n_ranks(); r++) {
      auto target_rank = common::topology::intra2global_rank(r);
      auto offset = local_max_size_ * target_rank;
      void* begin_addr = reinterpret_cast<std::byte*>(vm_.addr()) + offset;
      pm.map_to_vm(begin_addr, local_max_size_, offset);
    }
  }
  common::pmr::pool_options my_std_pool_options() const {
    common::pmr::pool_options opts;
    opts.max_blocks_per_chunk = local_max_size_ / 10;
    return opts;
  }
  struct header {
    header*          prev      = nullptr;
    header*          next      = nullptr;
    std::size_t      size      = 0;
    std::size_t      alignment = 0;
    std::atomic<int> freed     = 0;
  };
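  // Padding arithmetic (assumed LP64 sizes): sizeof(header) == 40, so for
  // alignment == alignof(max_align_t) == 16, pad_bytes =
  // round_up_pow2(40, 16) == 48; the payload at p + 48 keeps the requested
  // alignment while leaving the header addressable at p.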
  void remove_header_from_list(header* h) {
    h->prev->next = h->next;
    if (h->next) {
      h->next->prev = h->prev;
    } else {
      // h was the tail; move the list end pointer back.
      allocated_list_end_ = h->prev;
    }
  }
  std::size_t get_header_disp(const void* p, std::size_t alignment) const {
    std::size_t pad_bytes = common::round_up_pow2(sizeof(header), alignment);
    auto h = reinterpret_cast<const header*>(reinterpret_cast<const std::byte*>(p) - pad_bytes);
    // The RMA displacement of the `freed` flag within the owner's window.
    const void* flag_addr = &h->freed;
    return get_disp(flag_addr);
  }
  void local_deallocate_impl(header* h, std::size_t size, std::size_t alignment) {
    remove_header_from_list(h);
    std_pool_mr_.deallocate(h, size, alignment);
    allocated_size_ -= size;
  }
  static constexpr int remote_free_flag_value = 417;

  std::size_t                                local_max_size_;
  std::size_t                                global_max_size_;
  common::virtual_mem                        vm_;
  void*                                      local_base_addr_;
  common::physical_mem                       pm_;
  std::unique_ptr<common::rma::win>          win_;
  root_resource                              root_mr_;
  common::pmr::unsynchronized_pool_resource  std_pool_mr_;
  int                                        max_unflushed_free_objs_;
  header                                     allocated_list_;
  header*                                    allocated_list_end_ = &allocated_list_;
  std::size_t                                allocated_size_;
  std::size_t                                collect_threshold_;
  std::size_t                                collect_threshold_max_;
};
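// End-to-end sketch (hypothetical two-rank run): rank 0 allocates an object
// via do_allocate(); rank 1, holding a pointer into rank 0's segment, frees
// it with remote_deallocate(p, bytes, /*target_rank=*/0). Rank 0 reclaims the
// object the next time collect_deallocated() observes the `freed` flag set by
// the one-sided put.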