64       stack_base_(reinterpret_cast<context_frame*>(stack_.bottom()) - 1),
 
   69   template <typename T, typename SchedLoopCallback, typename Fn, typename... Args>
 
   71     common::profiler::switch_phase<prof_phase_spmd, prof_phase_sched_fork>();
 
   75     auto prev_sched_cf = sched_cf_;
 
   77     suspend([&](context_frame* cf) {
 
   79       root_on_stack([&, ts, fn = std::forward<Fn>(fn),
 
   80                      args_tuple = std::make_tuple(std::forward<Args>(args)...)]() mutable {
 
   86         tls_->dag_prof.increment_thread_count();

   87         tls_->dag_prof.increment_strand_count();
 
   89         common::profiler::switch_phase<prof_phase_sched_fork, prof_phase_thread>();
 
   91         T&& ret = invoke_fn<T>(std::forward<decltype(fn)>(fn), std::forward<decltype(args_tuple)>(args_tuple));
 
   93         common::profiler::switch_phase<prof_phase_thread, prof_phase_sched_die>();
 
  104     common::profiler::switch_phase<prof_phase_sched_loop, prof_phase_sched_join>();
 
  110     if (dag_prof_enabled_) {
 
  115         dag_prof_result_.merge_serial(retval.dag_prof);
 
  119     sched_cf_ = prev_sched_cf;
 
  121     common::profiler::switch_phase<prof_phase_sched_join, prof_phase_spmd>();
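
A note on the pattern above: the capture at lines 79-80 moves the user callable and its arguments into a lambda so they survive the switch onto the scheduler-managed stack, and invoke_fn (line 91) applies the stored tuple; fork repeats the same trick at lines 137-138. A minimal, self-contained illustration of that pattern (capture_and_run is a hypothetical name, not part of randws.hpp):

    #include <tuple>
    #include <utility>

    template <typename T, typename Fn, typename... Args>
    T capture_and_run(Fn&& fn, Args&&... args) {
      auto task = [fn = std::forward<Fn>(fn),
                   args_tuple = std::make_tuple(std::forward<Args>(args)...)]() mutable -> T {
        return std::apply(std::forward<Fn>(fn), std::move(args_tuple));  // invoke with the stored arguments
      };
      return task();  // root_exec instead runs this body on a separate call stack via root_on_stack
    }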
 
  126   template <typename T, typename OnDriftForkCallback, typename OnDriftDieCallback,

  127             typename WorkHint, typename Fn, typename... Args>
 
  129             OnDriftForkCallback on_drift_fork_cb, OnDriftDieCallback on_drift_die_cb,
 
  130             WorkHint, WorkHint, Fn&& fn, Args&&... args) {
 
  131     common::profiler::switch_phase<prof_phase_thread, prof_phase_sched_fork>();
 
  137     suspend([&, ts, fn = std::forward<Fn>(fn),
 
  138              args_tuple = std::make_tuple(std::forward<Args>(args)...)](context_frame* cf) mutable {
 
  139       common::verbose<2>("push context frame [%p, %p) into task queue", cf, cf->parent_frame);
 
  143       std::size_t cf_size = reinterpret_cast<uintptr_t>(cf->parent_frame) - reinterpret_cast<uintptr_t>(cf);
 
  144       wsq_.push(wsqueue_entry{cf, cf_size});
 
  147       tls_->dag_prof.increment_thread_count();

  148       tls_->dag_prof.increment_strand_count();
 
  150       common::verbose<2>("Starting new thread %p", ts);
 
  151       common::profiler::switch_phase<prof_phase_sched_fork, prof_phase_thread>();
 
  153       T&& ret = invoke_fn<T>(std::forward<decltype(fn)>(fn), std::forward<decltype(args_tuple)>(args_tuple));
 
  155       common::profiler::switch_phase<prof_phase_thread, prof_phase_sched_die>();
 
  156       common::verbose<2>("Thread %p is completed", ts);
 
  159       on_die(ts, std::move(ret), on_drift_die_cb);
 
  161       common::verbose<2>("Thread %p is serialized (fast path)", ts);
 
  170       common::verbose<2>("Resume parent context frame [%p, %p) (fast path)", cf, cf->parent_frame);
 
  172       common::profiler::switch_phase<prof_phase_sched_die, prof_phase_sched_resume_popped>();
 
  176       common::profiler::switch_phase<prof_phase_sched_resume_popped, prof_phase_thread>();
 
  186     tls_->dag_prof.increment_strand_count();
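
Lines 139-176 are the fork fast path: the parent's context frame is pushed onto the work-stealing queue before the child body runs, and on_die (lines 424-425) later pops that queue; a non-empty pop means no other worker stole the continuation, so the parent is resumed in place. A small sketch of that protocol under hypothetical types (frame_desc, Deque), not taken from randws.hpp:

    #include <cstddef>

    struct frame_desc { void* base; std::size_t size; };

    // Deque is any work-stealing queue whose pop() returns an optional-like handle.
    template <typename Deque, typename Child>
    bool run_child_first(Deque& wsq, frame_desc parent, Child&& child) {
      wsq.push(parent);              // expose the parent's continuation to thieves
      child();                       // run the child task immediately on the same stack
      return wsq.pop().has_value();  // non-empty pop: not stolen, parent resumes inline (fast path)
    }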
 
  189   template <typename T>
 
  191     common::profiler::switch_phase<prof_phase_thread, prof_phase_sched_join>();
 
  195       common::verbose<2>("Skip join for serialized thread (fast path)");
 
  209           retval = get_retval_remote(ts);
 
  213         bool migrated = true;
 
  214         suspend([&](context_frame* cf) {
 
  221             common::verbose("Win the join race for thread %p (joining thread)", ts);
 
  222             common::profiler::switch_phase<prof_phase_sched_join, prof_phase_sched_loop>();
 
  225             common::verbose("Lose the join race for thread %p (joining thread)", ts);
 
  234           common::profiler::switch_phase<prof_phase_sched_resume_join, prof_phase_sched_join>();
 
  238           retval = get_retval_remote(ts);
 
  253     common::profiler::switch_phase<prof_phase_sched_join, prof_phase_thread>();
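
The "win/lose the join race" messages (lines 221-225 here, 441-445 in on_die) describe a race between the joining parent and the finishing child over who suspends and who resumes the other. The exact mechanism is not visible in this excerpt; one common arbitration, shown purely as an assumed sketch, is an atomic counter on the shared thread state:

    #include <atomic>

    struct join_state {
      std::atomic<int> arrived{0};
    };

    // The first arriver (parent or child) "wins" and goes back to scheduling; the second arriver
    // "loses" and must resume the winner's suspended continuation (or hand over the return value).
    inline bool arrive_first(join_state& js) {
      return js.arrived.fetch_add(1, std::memory_order_acq_rel) == 0;
    }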
 
  257   template <typename SchedLoopCallback>
 
  261     while (!should_exit_sched_loop()) {
 
  262       auto mte = migration_mailbox_.pop();
 
  263       if (mte.has_value()) {
 
  264         execute_migrated_task(*mte);
 
  270       if constexpr (!std::is_null_pointer_v<std::remove_reference_t<SchedLoopCallback>>) {
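
Line 270 compiles the scheduler-loop callback away when nullptr is passed for it. The same idiom in isolation (call_if_not_null is a hypothetical name, not part of randws.hpp):

    #include <type_traits>
    #include <utility>

    template <typename Cb, typename... Args>
    void call_if_not_null(Cb&& cb, Args&&... args) {
      if constexpr (!std::is_null_pointer_v<std::remove_reference_t<Cb>>) {
        std::forward<Cb>(cb)(std::forward<Args>(args)...);
      }
    }
    // call_if_not_null(nullptr);             // the call disappears at compile time
    // call_if_not_null([] { /* work */ });   // invoked normally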
 
  278   template <typename PreSuspendCallback, typename PostSuspendCallback>
 
  279   void poll(PreSuspendCallback&&, PostSuspendCallback&&) {}
 
  281   template <typename PreSuspendCallback, typename PostSuspendCallback>
 
  283                   PreSuspendCallback&&     pre_suspend_cb,
 
  284                   PostSuspendCallback&&    post_suspend_cb) {
 
  293         std::forward<PreSuspendCallback>(pre_suspend_cb));
 
  295     suspend([&](context_frame* cf) {
 
  298       common::verbose("Migrate continuation of the root thread to process %d",
 
  301       migration_mailbox_.put(ss, target_rank);
 
  303       common::profiler::switch_phase<prof_phase_sched_migrate, prof_phase_sched_loop>();
 
  310         std::forward<PostSuspendCallback>(post_suspend_cb), cb_ret);
 
  313   template <typename Fn>
 
  315     common::profiler::switch_phase<prof_phase_thread, prof_phase_spmd>();
 
  322     size_t task_size = sizeof(callable_task_t);
 
  323     void* task_ptr = suspended_thread_allocator_.allocate(task_size);
 
  325     auto t = new (task_ptr) callable_task_t(fn);
 
  328     execute_coll_task(t, ct);
 
  330     suspended_thread_allocator_.deallocate(t, task_size);
 
  333     tls_->dag_prof.increment_strand_count();
 
  335     common::profiler::switch_phase<prof_phase_spmd, prof_phase_thread>();
 
  339     return cf_top_ && cf_top_ == stack_base_;
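
Note on line 339: is_executing_root holds only when the innermost context frame sits exactly at stack_base_, the slot one context_frame below the stack bottom reserved in the constructor (line 64) and installed for the root task by root_on_stack (line 578).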
 
  342   template <typename T>
 
  357     tls_->dag_prof.increment_strand_count();
 
  360   template <typename PreSuspendCallback, typename PostSuspendCallback>
 
  373     tls_->dag_prof.increment_strand_count();
 
  377     dag_prof_enabled_ = true;
 
  378     dag_prof_result_.clear();
 
  382       tls_->dag_prof.increment_thread_count();
 
  387     dag_prof_enabled_ = false;
 
  402       dag_prof_result_.print();
 
  409     std::size_t              task_size;
 
  422   template <typename T, typename OnDriftDieCallback>
 
  423   void on_die(thread_state<T>* ts, T&& ret, OnDriftDieCallback on_drift_die_cb) {
 
  424     auto qe = wsq_.pop();
 
  425     bool serialized = qe.has_value();
 
  432                           prof_phase_cb_drift_die,
 
  433                           prof_phase_sched_die>(on_drift_die_cb);
 
  441       common::verbose("Win the join race for thread %p (joined thread)", ts);
 
  442       common::profiler::switch_phase<prof_phase_sched_die, prof_phase_sched_loop>();
 
  445       common::verbose("Lose the join race for thread %p (joined thread)", ts);
 
  446       common::profiler::switch_phase<prof_phase_sched_die, prof_phase_sched_resume_join>();
 
  447       suspended_state ss = remote_get_value(thread_state_allocator_, &ts->suspended);
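
Note on lines 423-447: on_die first pops the work-stealing queue (line 424); a non-empty pop means the parent's continuation was never stolen, so the thread counts as serialized (line 425) and fork's fast path (lines 161-176) resumes the parent directly. Otherwise the continuation drifted to another worker, the on-drift-die callback runs wrapped in its profiler phases (lines 432-433), and the join race at lines 441-447 decides whether this worker must fetch the parent's suspended state remotely and resume it.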
 
  452   template <typename T>
 
  453   void on_root_die(thread_state<T>* ts, T&& ret) {
 
  459     exit_request_mailbox_.put(0);
 
  461     common::profiler::switch_phase<prof_phase_sched_die, prof_phase_sched_loop>();
 
  468     auto ibd = common::profiler::interval_begin<prof_event_sched_steal>(target_rank);
 
  470     if (wsq_.empty(target_rank)) {
 
  471       common::profiler::interval_end<prof_event_sched_steal>(ibd, false);
 
  476       common::profiler::interval_end<prof_event_sched_steal>(ibd, false);
 
  481     if (!we.has_value()) {
 
  483       common::profiler::interval_end<prof_event_sched_steal>(ibd, false);
 
  488                     we->frame_base, reinterpret_cast<std::byte*>(we->frame_base) + we->frame_size, target_rank);
 
  494     common::profiler::interval_end<prof_event_sched_steal>(ibd, true);
 
  496     common::profiler::switch_phase<prof_phase_sched_loop, prof_phase_sched_resume_stolen>();
 
  498     context_frame* next_cf = reinterpret_cast<context_frame*>(we->frame_base);
 
  499     suspend([&](context_frame* cf) {
 
  501       context::clear_parent_frame(next_cf);
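
Note on lines 468-501: a steal attempt first checks the victim's queue remotely (line 470) and records a failed-steal interval on each early exit (lines 471, 476, 483, e.g. when the stolen entry turns out to be empty, line 481). On success, the entry names the frame range [frame_base, frame_base + frame_size) on the victim (line 488), which is apparently copied into the same addresses on the thief's identically laid-out stack; the thief then suspends its scheduler context, severs the stolen frame's parent link (line 501), and resumes it.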
 
  506   template <typename Fn>
 
  507   void suspend(Fn&& fn) {
 
  508     context_frame*        prev_cf_top = cf_top_;
 
  509     thread_local_storage* prev_tls    = tls_;
 
  511     context::save_context_with_call(prev_cf_top,
 
  512         [](context_frame* cf, void* cf_top_p, void* fn_p) {
 
  513       context_frame*& cf_top = *reinterpret_cast<context_frame**>(cf_top_p);
 
  514       Fn              fn     = std::forward<Fn>(*reinterpret_cast<Fn*>(fn_p));
 
  517     }, &cf_top_, &fn, prev_tls);
 
  519     cf_top_ = prev_cf_top;
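
Lines 511-517 pass a C++ callable through a C-style context-switch callback by smuggling typed pointers as void*. The same pattern in isolation (call_with_context is a hypothetical stand-in, not the actual context::save_context_with_call API):

    #include <type_traits>
    #include <utility>

    using raw_callback = void (*)(void* arg);

    inline void call_with_context(raw_callback cb, void* arg) { cb(arg); }  // stand-in for the C-style API

    template <typename Fn>
    void invoke_via_void_ptr(Fn&& fn) {
      using F = std::remove_reference_t<Fn>;
      call_with_context([](void* fn_p) {
        F& f = *static_cast<F*>(fn_p);  // recover the typed callable from the void* argument
        std::forward<Fn>(f)();          // invoke it
      }, &fn);
    }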
 
  523   void resume(context_frame* cf) {
 
  524     common::verbose("Resume context frame [%p, %p) in the stack", cf, cf->parent_frame);
 
  528   void resume(suspended_state ss) {
 
  530                     ss.frame_base, ss.frame_size, ss.evacuation_ptr);
 
  534     context::jump_to_stack(ss.frame_base, [](void* allocator_, void* evacuation_ptr, void* frame_base, void* frame_size_) {
 
  535       common::remotable_resource& allocator  = *reinterpret_cast<common::remotable_resource*>(allocator_);
 
  536       std::size_t                 frame_size = reinterpret_cast<std::size_t>(frame_size_);
 
  537       common::remote_get(allocator,
 
  538                          reinterpret_cast<std::byte*>(frame_base),
 
  539                          reinterpret_cast<std::byte*>(evacuation_ptr),
 
  541       allocator.deallocate(evacuation_ptr, frame_size);
 
  543       context_frame* cf = reinterpret_cast<context_frame*>(frame_base);
 
  544       context::clear_parent_frame(cf);
 
  546     }, &suspended_thread_allocator_, ss.evacuation_ptr, ss.frame_base, reinterpret_cast<void*>(ss.frame_size));
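
Note on lines 528-546: resuming an evacuated thread jumps onto the target stack region first, copies the saved frame image back from the suspended-thread allocator into its original address range (lines 537-539), releases the evacuation buffer (line 541), clears the restored frame's parent link (line 544), and only then switches into the frame.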
 
  549   void resume_sched() {
 
  551     context::resume(sched_cf_);
 
  554   void execute_migrated_task(const suspended_state& ss) {
 
  557     common::profiler::switch_phase<prof_phase_sched_loop, prof_phase_sched_resume_migrate>();
 
  559     suspend([&](context_frame* cf) {
 
  565   suspended_state evacuate(context_frame* cf) {
 
  566     std::size_t cf_size = reinterpret_cast<uintptr_t>(cf->parent_frame) - reinterpret_cast<uintptr_t>(cf);
 
  567     void* evacuation_ptr = suspended_thread_allocator_.allocate(cf_size);
 
  568     std::memcpy(evacuation_ptr, cf, cf_size);
 
  571                     cf, cf->parent_frame, evacuation_ptr);
 
  573     return {evacuation_ptr, cf, cf_size};
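
Lines 565-573 evacuate a live frame off the stack: its size is the byte distance to its parent frame, the bytes are copied into a separately allocated buffer, and the returned suspended_state records where they must be restored. A heap-based sketch of the same bookkeeping (hypothetical names; the real code uses suspended_thread_allocator_ so the copy stays remotely accessible):

    #include <cstddef>
    #include <cstdint>
    #include <cstdlib>
    #include <cstring>

    struct frame { frame* parent_frame; /* saved registers, locals, ... */ };

    struct evacuated {
      void*       copy;      // off-stack copy of the frame image
      frame*      original;  // address range to restore into before resuming
      std::size_t size;
    };

    inline evacuated evacuate_frame(frame* cf) {
      std::size_t size = reinterpret_cast<std::uintptr_t>(cf->parent_frame) -
                         reinterpret_cast<std::uintptr_t>(cf);  // the frame occupies [cf, cf->parent_frame)
      void* copy = std::malloc(size);
      std::memcpy(copy, cf, size);
      return {copy, cf, size};
    }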
 
  576   template <typename Fn>
 
  577   void root_on_stack(Fn&& fn) {
 
  578     cf_top_ = stack_base_;
 
  579     std::size_t stack_size_bytes = reinterpret_cast<std::byte*>(stack_base_) -

  580                                    reinterpret_cast<std::byte*>(stack_.top());
 
  581     context::call_on_stack(stack_.top(), stack_size_bytes,
 
  582                            [](void* fn_, void*, void*, void*) {
 
  583       Fn fn = std::forward<Fn>(*reinterpret_cast<Fn*>(fn_)); 
 
  585     }, &fn, nullptr, nullptr, nullptr);
 
  588   void execute_coll_task(task_general* t, coll_task ct) {
 
  590     coll_task ct_ {t, ct.task_size, ct.master_rank};
 
  597       if (my_rank_shifted % i == 0) {
 
  598         auto target_rank_shifted = my_rank_shifted + i / 2;
 
  599         if (target_rank_shifted < n_ranks) {
 
  600           auto target_rank = (target_rank_shifted + ct.master_rank) % n_ranks;
 
  601           coll_task_mailbox_.put(ct_, target_rank);
 
  606     auto prev_stack_base = stack_base_;
 
  607     if (my_rank == ct.master_rank) {
 
  609       stack_base_ = cf_top_ - (cf_top_ - reinterpret_cast<context_frame*>(stack_.top())) / 2;
 
  622     stack_base_ = prev_stack_base;
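
Lines 597-601 forward the collective task along a binomial tree rooted at ct.master_rank: ranks are renumbered relative to the master and, for each span i, the rank owning a block notifies the rank i/2 positions ahead. A standalone sketch of which ranks each process notifies, assuming the loop (not shown in this excerpt) iterates i over decreasing powers of two starting from the smallest power of two >= n_ranks:

    #include <vector>

    std::vector<int> broadcast_targets(int my_rank, int master_rank, int n_ranks) {
      std::vector<int> targets;
      int shifted = (my_rank - master_rank + n_ranks) % n_ranks;  // renumber so the master is rank 0
      int span = 1;
      while (span < n_ranks) span *= 2;                           // smallest power of two >= n_ranks
      for (int i = span; i > 1; i /= 2) {
        if (shifted % i == 0) {
          int target_shifted = shifted + i / 2;
          if (target_shifted < n_ranks) {
            targets.push_back((target_shifted + master_rank) % n_ranks);
          }
        }
      }
      return targets;
    }
    // e.g. n_ranks = 5, master_rank = 0: rank 0 notifies {4, 2, 1}, rank 2 notifies {3},
    // ranks 1, 3, 4 notify nobody -- every rank receives the task exactly once.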
 
  628   void execute_coll_task_if_arrived() {
 
  629     auto ct = coll_task_mailbox_.pop();
 
  630     if (ct.has_value()) {
 
  631       task_general* t = reinterpret_cast<task_general*>(
 
  632           suspended_thread_allocator_.allocate(ct->task_size));
 
  635                          reinterpret_cast<std::byte*>(t),
 
  636                          reinterpret_cast<std::byte*>(ct->task_ptr),
 
  639       common::profiler::switch_phase<prof_phase_sched_loop, prof_phase_spmd>();
 
  641       execute_coll_task(t, *ct);
 
  643       common::profiler::switch_phase<prof_phase_spmd, prof_phase_sched_loop>();
 
  645       suspended_thread_allocator_.deallocate(t, ct->task_size);
 
  649   bool should_exit_sched_loop() {
 
  654     execute_coll_task_if_arrived();
 
  656     if (exit_request_mailbox_.pop()) {
 
  661           auto target_rank = my_rank + i / 2;
 
  663             exit_request_mailbox_.put(target_rank);
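
Note on lines 649-663: should_exit_sched_loop also drains the collective-task mailbox (line 654). The exit request that on_root_die posts to rank 0 (line 459) is forwarded here along a similar tree-shaped pattern (line 661 picks the next target as my_rank + i / 2), so every process eventually observes the request and leaves sched_loop.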
 
  673   template <typename T>
 
  674   thread_retval<T> get_retval_remote(thread_state<T>* ts) {
 
  675     if constexpr (std::is_trivially_copyable_v<T>) {
 
  679       thread_retval<T> retval;
 
  680       remote_get(thread_state_allocator_, reinterpret_cast<std::byte*>(&retval), reinterpret_cast<std::byte*>(&ts->retval), sizeof(thread_retval<T>));
 
  685   template <typename T>
 
  686   void put_retval_remote(thread_state<T>* ts, thread_retval<T>&& retval) {
 
  687     if constexpr (std::is_trivially_copyable_v<T>) {
 
  691       std::byte* retvalp = reinterpret_cast<std::byte*>(new (alloca(sizeof(thread_retval<T>))) thread_retval<T>{std::move(retval)});
 
  692       remote_put(thread_state_allocator_, retvalp, reinterpret_cast<std::byte*>(&ts->retval), sizeof(thread_retval<T>));
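
Note on lines 673-692: return values travel between processes as raw bytes through the thread-state allocator. put_retval_remote materializes the value into an alloca'd buffer with placement new (line 691) so a byte-wise remote_put can ship it (line 692), and get_retval_remote performs the matching remote_get into a local thread_retval<T> (line 680); both functions branch on std::is_trivially_copyable_v<T> (lines 675, 687), with only one branch visible in this excerpt.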
 
  696   struct wsqueue_entry {
 
  698     std::size_t frame_size;
 
  702   context_frame*                   stack_base_;
 
  703   oneslot_mailbox<void>            exit_request_mailbox_;
 
  704   oneslot_mailbox<coll_task>       coll_task_mailbox_;
 
  705   oneslot_mailbox<suspended_state> migration_mailbox_;
 
  706   wsqueue<wsqueue_entry>           wsq_;
 
  707   common::remotable_resource       thread_state_allocator_;
 
  708   common::remotable_resource       suspended_thread_allocator_;
 
  709   context_frame*                   cf_top_           = nullptr;

  710   context_frame*                   sched_cf_         = nullptr;

  711   thread_local_storage*            tls_              = nullptr;

  712   bool                             dag_prof_enabled_ = false;
 