template <block_size_t BlockSize>
class cache_manager {
public:
  cache_manager(std::size_t cache_size, std::size_t sub_block_size)
    : cache_size_(cache_size),
      sub_block_size_(sub_block_size),
      vm_(cache_size_, BlockSize),
      cs_(cache_size / BlockSize, cache_block(this)),
      cache_win_(common::rma::create_win(reinterpret_cast<std::byte*>(vm_.addr()), vm_.size())),
      cache_tlb_(nullptr, nullptr),
      cprof_(cs_.num_entries()) {
    // ... (constructor body elided in this listing)
  }
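  // NOTE (explanatory comment, not part of the original source): the software
  // cache consists of cache_size / BlockSize fixed-size entries. Their backing
  // memory lives in the virtual region vm_, and cache_win_ exposes that region
  // as an RMA window so remote blocks can be fetched into, and written back
  // from, the cache with one-sided communication.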
  template <bool SkipFetch, bool IncrementRef>
  std::pair<bool, bool> checkout_fast(std::byte* addr, std::size_t size) {
    // Fast path: valid only if the requested range fits in a single cache
    // block that is already registered in the cache TLB.
    std::byte* blk_addr = /* ... */; // addr rounded down to its BlockSize boundary (elided)
    if (blk_addr + BlockSize < addr + size) {
      return {false, false};
    }

    auto cb_p = cache_tlb_.get(blk_addr);
    if (!cb_p) return {false, false};

    cache_block& cb = *cb_p;

    bool fetch_completed = true;

    block_region br = {addr - blk_addr, addr + size - blk_addr};
    if constexpr (SkipFetch) {
      // Write-only checkout: no fetch is needed; just mark the region valid.
      cprof_.record_writeonly(cb.entry_idx, br, cb.valid_regions);
      cb.valid_regions.add(br);
    } else {
      if (fetch_begin(cb, br)) {
        add_fetching_win(*cb.win);
        fetch_completed = false;
      }
    }

    if constexpr (IncrementRef) {
      cb.ref_count++;
    }

    return {true, fetch_completed};
  }
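  // NOTE (explanatory comment, not part of the original source): checkout_fast()
  // is the TLB-backed fast path. Its first return value reports whether the fast
  // path could be taken at all, and the second whether the checkout completed
  // without a pending fetch. On a {false, ...} result the caller is expected to
  // fall back to the general per-block checkout_blk() path below.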
  template <bool SkipFetch, bool IncrementRef>
  bool checkout_blk(std::byte*               blk_addr,
                    std::byte*               req_addr_b,
                    std::byte*               req_addr_e,
                    const common::rma::win&  win,
                    common::topology::rank_t owner,
                    std::size_t              pm_offset) {
    ITYR_CHECK(req_addr_b <= blk_addr + BlockSize);
    ITYR_CHECK(req_addr_e <= blk_addr + BlockSize);

    cache_block& cb = get_entry(blk_addr);

    bool checkout_completed = true;

    if (blk_addr != cb.mapped_addr) {
      // This entry is (re)assigned to a new home block.
      // ... (record blk_addr, win, and owner in cb; elided in this listing)
      cb.pm_offset = pm_offset;
      if constexpr (enable_vm_map) {
        // Defer the virtual-memory mapping until checkout_complete().
        cache_blocks_to_map_.push_back(&cb);
        checkout_completed = false;
      } else {
        cb.mapped_addr = blk_addr;
      }
    }

    block_region br = {req_addr_b - blk_addr, req_addr_e - blk_addr};
    if constexpr (SkipFetch) {
      cprof_.record_writeonly(cb.entry_idx, br, cb.valid_regions);
      cb.valid_regions.add(br);
    } else {
      if (fetch_begin(cb, br)) {
        add_fetching_win(win);
        checkout_completed = false;
      }
    }

    if constexpr (IncrementRef) {
      cb.ref_count++;
    }

    cache_tlb_.add(blk_addr, &cb);

    return checkout_completed;
  }
  void checkout_complete() {
    if constexpr (enable_vm_map) {
      if (!cache_blocks_to_map_.empty()) {
        for (cache_block* cb : cache_blocks_to_map_) {
          update_mapping(*cb);
        }
        cache_blocks_to_map_.clear();
      }
    }
    // ... (remaining completion steps elided in this listing)
  }
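  // NOTE (explanatory comment, not part of the original source): when
  // checkout_blk() defers the virtual-memory mapping (enable_vm_map), it returns
  // false and records the block in cache_blocks_to_map_; checkout_complete()
  // later applies those mappings in one batch.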
  template <bool RegisterDirty, bool DecrementRef>
  bool checkin_fast(std::byte* addr, std::size_t size) {
    // Fast path: valid only if the range fits in a single block found in the TLB.
    std::byte* blk_addr = /* ... */; // addr rounded down to its BlockSize boundary (elided)
    if (blk_addr + BlockSize < addr + size) {
      return false;
    }

    auto cb_p = cache_tlb_.get(blk_addr);
    if (!cb_p) return false;

    cache_block& cb = *cb_p;

    if constexpr (RegisterDirty) {
      block_region br = {addr - blk_addr, addr + size - blk_addr};
      add_dirty_region(cb, br);
    }

    if constexpr (DecrementRef) {
      cb.ref_count--;
    }

    return true;
  }
  template <bool RegisterDirty, bool DecrementRef>
  void checkin_blk(std::byte* blk_addr,
                   std::byte* req_addr_b,
                   std::byte* req_addr_e) {
    ITYR_CHECK(req_addr_b <= blk_addr + BlockSize);
    ITYR_CHECK(req_addr_e <= blk_addr + BlockSize);

    cache_block& cb = get_entry<false>(blk_addr);

    if constexpr (RegisterDirty) {
      block_region br = {req_addr_b - blk_addr, req_addr_e - blk_addr};
      add_dirty_region(cb, br);
    }

    if constexpr (DecrementRef) {
      cb.ref_count--;
    }
  }
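  // NOTE (explanatory comment, not part of the original source): checkin mirrors
  // checkout: RegisterDirty records the written byte range of the block as dirty
  // (scheduling it for writeback), and DecrementRef releases the reference taken
  // at checkout time so the block becomes evictable again once its reference
  // count reaches zero (see cache_block::is_evictable()).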
  using release_handler =
      std::conditional_t<enable_lazy_release, release_manager::release_handler, void*>;

  auto release_lazy() {
    if constexpr (enable_lazy_release) {
      if (has_dirty_cache_) {
        // ... (hand out a real release handler only when dirty cache exists; elided)
      }
      // ...
    }
    // ...
  }
  template <typename ReleaseHandler>
  void acquire(ReleaseHandler rh) {
    // ...
    if constexpr (enable_lazy_release) {
      // ... (ensure the release associated with rh has completed; elided)
    }
    // ...
  }
  void set_readonly(void* addr, std::size_t size) {
    readonly_regions_.add({reinterpret_cast<uintptr_t>(addr),
                           reinterpret_cast<uintptr_t>(addr) + size});
  }

  void unset_readonly(void* addr, std::size_t size) {
    readonly_regions_.remove({reinterpret_cast<uintptr_t>(addr),
                              reinterpret_cast<uintptr_t>(addr) + size});
  }
  void poll() {
    if constexpr (enable_lazy_release) {
      // ... (serve a pending release request, if any; elided)
    }
  }

  void ensure_all_cache_clean() {
    writeback_begin();
    writeback_complete();
    // ...
  }
  void discard_dirty(std::byte* blk_addr,
                     std::byte* req_addr_b,
                     std::byte* req_addr_e) {
    ITYR_CHECK(req_addr_b <= blk_addr + BlockSize);
    ITYR_CHECK(req_addr_e <= blk_addr + BlockSize);

    if (is_cached(blk_addr)) {
      cache_block& cb = get_entry(blk_addr);

      if (cb.is_writing_back()) {
        writeback_complete();
      }

      block_region br = {req_addr_b - blk_addr, req_addr_e - blk_addr};
      cb.dirty_regions.remove(br);
    }
  }
  void get_copy_blk(std::byte* blk_addr,
                    std::byte* req_addr_b,
                    std::byte* req_addr_e,
                    std::byte* to_addr) {
    ITYR_CHECK(req_addr_b <= blk_addr + BlockSize);
    ITYR_CHECK(req_addr_e <= blk_addr + BlockSize);

    cache_block& cb = get_entry<false>(blk_addr);

    std::size_t blk_offset = req_addr_b - blk_addr;
    std::byte* from_addr = reinterpret_cast<std::byte*>(vm_.addr())
                           + cb.entry_idx * BlockSize + blk_offset;
    std::memcpy(to_addr, from_addr, req_addr_e - req_addr_b);
  }
  void put_copy_blk(std::byte* blk_addr,
                    std::byte* req_addr_b,
                    std::byte* req_addr_e,
                    const std::byte* from_addr) {
    ITYR_CHECK(req_addr_b <= blk_addr + BlockSize);
    ITYR_CHECK(req_addr_e <= blk_addr + BlockSize);

    cache_block& cb = get_entry<false>(blk_addr);

    std::size_t blk_offset = req_addr_b - blk_addr;
    std::byte* to_addr = reinterpret_cast<std::byte*>(vm_.addr())
                         + cb.entry_idx * BlockSize + blk_offset;
    std::memcpy(to_addr, from_addr, req_addr_e - req_addr_b);
  }
  using writeback_epoch_t = uint64_t;

  struct cache_block {
    // Other members elided from this listing: entry_idx, win, owner, ref_count,
    // valid_regions, dirty_regions, and a back pointer `outer` to the cache_manager.
    std::byte*        addr        = nullptr; // home address of the cached block
    std::byte*        mapped_addr = nullptr; // block address currently mapped to this entry
    std::size_t       pm_offset       = 0;   // RMA displacement in the owner's memory
    writeback_epoch_t writeback_epoch = 0;

    explicit cache_block(cache_manager* outer_p) : outer(outer_p) {}

    bool is_writing_back() const {
      return writeback_epoch == outer->writeback_epoch_;
    }

    void invalidate() {
      outer->cprof_.invalidate(entry_idx, valid_regions);
      // ...
      valid_regions.clear();
      // ...
      common::verbose<3>("Cache block %ld for [%p, %p) invalidated",
                         entry_idx, addr, addr + BlockSize);
    }

    bool is_evictable() const {
      return ref_count == 0 &&
             dirty_regions.empty() /* && ... (remaining conditions elided) */;
    }

    void on_evict() {
      // ...
      outer->cache_tlb_.clear();
    }
  };
  static std::string cache_shmem_name(int global_rank) {
    std::stringstream ss;
    ss << "/ityr_ori_cache_" << global_rank;
    return ss.str();
  }

  common::physical_mem init_cache_pm() {
    common::physical_mem pm(/* ... */); // backed by the shared-memory segment named above (elided)
    pm.map_to_vm(vm_.addr(), vm_.size(), 0);
    return pm;
  }
  using cache_key_t = uintptr_t;

  cache_key_t cache_key(void* addr) const {
    // ...
    ITYR_CHECK(reinterpret_cast<uintptr_t>(addr) % BlockSize == 0);
    return reinterpret_cast<uintptr_t>(addr) / BlockSize;
  }

  template <bool UpdateLRU = true>
  cache_block& get_entry(void* addr) {
    try {
      return cs_.template ensure_cached<UpdateLRU>(cache_key(addr));
    } catch (cache_full_exception& e) {
      // ... (write back dirty blocks so that some entries become evictable, then retry)
      try {
        return cs_.template ensure_cached<UpdateLRU>(cache_key(addr));
      } catch (cache_full_exception& e) {
        common::die("cache is exhausted (too much checked-out memory)");
      }
    }
  }
  void update_mapping(cache_block& cb) {
    // ...
    if (cb.mapped_addr) {
      common::verbose<3>("Unmap cache block %d from [%p, %p) (size=%ld)",
                         cb.entry_idx, cb.mapped_addr, cb.mapped_addr + BlockSize, BlockSize);
      // ... (unmap the previously mapped block; elided)
    }
    // ...
    common::verbose<3>("Map cache block %d to [%p, %p) (size=%ld)",
                       cb.entry_idx, cb.addr, cb.addr + BlockSize, BlockSize);
    pm_.map_to_vm(cb.addr, BlockSize, cb.entry_idx * BlockSize);
    cb.mapped_addr = cb.addr;
  }
  bool fetch_begin(cache_block& cb, block_region br) {
    if (cb.valid_regions.include(br)) {
      // The requested region is already valid in the cache; no fetch needed.
      cprof_.record(cb.entry_idx, br, {});
      return false;
    }

    std::byte* cache_begin = reinterpret_cast<std::byte*>(vm_.addr());

    // br_pad: br padded out to the sub-block granularity (sub_block_size_); only
    // the parts of br_pad not yet valid in the cache are actually fetched.
    block_region     br_pad        = /* ... */;
    block_region_set fetch_regions = cb.valid_regions.complement(br_pad);

    for (auto [blk_offset_b, blk_offset_e] : fetch_regions) {
      std::byte*  addr      = cache_begin + cb.entry_idx * BlockSize + blk_offset_b;
      std::size_t size      = blk_offset_e - blk_offset_b;
      std::size_t pm_offset = cb.pm_offset + blk_offset_b;

      common::verbose<3>("Fetching [%p, %p) (%ld bytes) to cache block %d from rank %d (win=%p, disp=%ld)",
                         cb.addr + blk_offset_b, cb.addr + blk_offset_e, size,
                         cb.entry_idx, cb.owner, cb.win, pm_offset);

      common::rma::get_nb(*cache_win_, addr, size, *cb.win, cb.owner, pm_offset);
    }

    cb.valid_regions.add(br_pad);

    cprof_.record(cb.entry_idx, br, fetch_regions);
    return true;
  }
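  // NOTE (explanatory comment, not part of the original source): fetches are
  // issued at sub-block granularity: the requested region is padded to
  // sub_block_size_ and only the parts not already in cb.valid_regions are
  // transferred, so repeated checkouts of neighbouring ranges avoid refetching
  // data that is already valid in the cache.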
  void fetch_complete() {
    if (!fetching_wins_.empty()) {
      for (const common::rma::win* win : fetching_wins_) {
        common::rma::flush(*win); // wait for the nonblocking gets issued on this window
        common::verbose<3>("Fetch complete (win=%p)", win);
      }
      fetching_wins_.clear();
    }
  }

  void add_fetching_win(const common::rma::win& win) {
    if (fetching_wins_.empty() || fetching_wins_.back() != &win) {
      // Remember each window at most once in a row so fetch_complete() flushes it once.
      fetching_wins_.push_back(&win);
    }
  }
  void add_dirty_region(cache_block& cb, block_region br) {
    bool is_new_dirty_block = cb.dirty_regions.empty();

    cb.dirty_regions.add(br);

    if (is_new_dirty_block) {
      dirty_cache_blocks_.push_back(&cb);
      has_dirty_cache_ = true;

      if constexpr (enable_write_through) {
        // ... (write back immediately in write-through mode; elided)
      } else if (dirty_cache_blocks_.size() >= max_dirty_cache_blocks_) {
        writeback_begin();
      }
    }
  }
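  // NOTE (explanatory comment, not part of the original source): dirty data is
  // normally buffered and written back in batches; ITYR_ORI_ENABLE_WRITE_THROUGH
  // switches to immediate writeback, while in the default mode a writeback round
  // is started once the number of dirty blocks reaches max_dirty_cache_blocks_.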
  void writeback_begin() {
    for (auto& cb : dirty_cache_blocks_) {
      if (!cb->dirty_regions.empty()) {
        writeback_begin(*cb);
      }
    }
    dirty_cache_blocks_.clear();
  }
  void writeback_begin(cache_block& cb) {
    if (cb.writeback_epoch == writeback_epoch_) {
      // This block is already being written back in the current epoch; wait for
      // the previous writeback to complete before issuing a new one.
      writeback_complete();
      ITYR_CHECK(cb.writeback_epoch < writeback_epoch_);
    }

    std::byte* cache_begin = reinterpret_cast<std::byte*>(vm_.addr());

    for (auto [blk_offset_b, blk_offset_e] : cb.dirty_regions) {
      std::byte*  addr      = cache_begin + cb.entry_idx * BlockSize + blk_offset_b;
      std::size_t size      = blk_offset_e - blk_offset_b;
      std::size_t pm_offset = cb.pm_offset + blk_offset_b;

      common::verbose<3>("Writing back [%p, %p) (%ld bytes) to rank %d (win=%p, disp=%ld)",
                         cb.addr + blk_offset_b, cb.addr + blk_offset_e, size,
                         cb.owner, cb.win, pm_offset);

      common::rma::put_nb(*cache_win_, addr, size, *cb.win, cb.owner, pm_offset);
    }

    cb.dirty_regions.clear();

    cb.writeback_epoch = writeback_epoch_;

    writing_back_wins_.push_back(cb.win);
  }
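  // NOTE (explanatory comment, not part of the original source): writeback_epoch
  // is how a block remembers whether its dirty data is still in flight: a block
  // whose writeback_epoch equals the manager's writeback_epoch_ is currently
  // being written back (see cache_block::is_writing_back()), so issuing a new
  // writeback for it first waits for the previous round to finish.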
  void writeback_complete() {
    if (!writing_back_wins_.empty()) {
      // Deduplicate the windows before flushing them.
      std::sort(writing_back_wins_.begin(), writing_back_wins_.end());
      writing_back_wins_.erase(std::unique(writing_back_wins_.begin(), writing_back_wins_.end()),
                               writing_back_wins_.end());

      for (const common::rma::win* win : writing_back_wins_) {
        common::rma::flush(*win); // wait for the nonblocking puts issued on this window
        common::verbose<3>("Writing back complete (win=%p)", win);
      }
      writing_back_wins_.clear();
      // ... (advance writeback_epoch_ so completed blocks are no longer "writing back")
    }

    if (dirty_cache_blocks_.empty() && has_dirty_cache_) {
      has_dirty_cache_ = false;
      // ...
    }
  }
  bool is_cached(void* addr) const {
    return cs_.is_cached(cache_key(addr));
  }
  void invalidate_all() {
    if (readonly_regions_.empty()) {
      // ... (fast path: invalidate every cached block and return; elided)
    }

    // Otherwise, keep the read-only portion of each block valid and invalidate
    // only the rest.
    cs_.for_each_entry([&](cache_block& cb) {
      if (cb.valid_regions.empty()) return;

      uintptr_t blk_addr = reinterpret_cast<uintptr_t>(cb.addr);
      region<uintptr_t> blk_addr_range = {blk_addr, blk_addr + BlockSize};

      region_set<uintptr_t> blk_readonly_ranges = get_intersection(readonly_regions_, blk_addr_range);
      if (blk_readonly_ranges.empty()) {
        // No read-only part in this block: invalidate it entirely.
        cb.invalidate();
      } else if (*blk_readonly_ranges.begin() != blk_addr_range) {
        // Only part of the block is read-only: translate the read-only ranges
        // into block-relative regions and keep just those valid.
        block_region_set brs_ro;
        auto it = brs_ro.before_begin();
        for (const auto& r : blk_readonly_ranges) {
          it = brs_ro.add({r.begin - blk_addr, r.end - blk_addr}, it);
        }
        // ... (restrict cb.valid_regions to brs_ro; elided)
      }
    });
  }
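  // NOTE (explanatory comment, not part of the original source): regions
  // registered through set_readonly() survive invalidation: invalidate_all()
  // keeps the read-only portion of each block valid and discards only the rest,
  // so read-only data does not have to be refetched from its owner afterwards.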
  using cache_tlb = tlb<std::byte*, cache_block*, ITYR_ORI_CACHE_TLB_SIZE>;

  std::size_t cache_size_;
  std::size_t sub_block_size_;

  common::virtual_mem  vm_;
  common::physical_mem pm_;

  cache_system<cache_key_t, cache_block> cs_;

  std::unique_ptr<common::rma::win> cache_win_;

  cache_tlb cache_tlb_;

  std::vector<const common::rma::win*> fetching_wins_;       // windows with outstanding fetches
  std::vector<cache_block*>            cache_blocks_to_map_; // blocks whose VM mapping is deferred

  std::vector<cache_block*> dirty_cache_blocks_;
  std::size_t               max_dirty_cache_blocks_;

  writeback_epoch_t                    writeback_epoch_ = 1;
  std::vector<const common::rma::win*> writing_back_wins_;

  bool has_dirty_cache_ = false;

  region_set<uintptr_t> readonly_regions_;

  cache_profiler cprof_;

  // ... (remaining members elided in this listing)
};