From e0bfed06192860de9cf6015717cfddf627aa8441 Mon Sep 17 00:00:00 2001
From: Carlos Segarra
Date: Wed, 8 May 2024 16:08:59 +0000
Subject: [PATCH] threads: handle zero-sized requests and preload a la mpi

---
 src/batch-scheduler/BinPackScheduler.cpp   |  7 +++
 src/batch-scheduler/SchedulingDecision.cpp |  2 +-
 src/planner/Planner.cpp                    | 73 ++++++++++++++--------
 src/planner/PlannerClient.cpp              | 13 ++--
 src/proto/faabric.proto                    |  4 ++
 src/scheduler/Scheduler.cpp                |  4 ++
 6 files changed, 70 insertions(+), 33 deletions(-)

diff --git a/src/batch-scheduler/BinPackScheduler.cpp b/src/batch-scheduler/BinPackScheduler.cpp
index 452435ab7..ef5733964 100644
--- a/src/batch-scheduler/BinPackScheduler.cpp
+++ b/src/batch-scheduler/BinPackScheduler.cpp
@@ -307,6 +307,13 @@ std::shared_ptr<SchedulingDecision> BinPackScheduler::makeSchedulingDecision(
     auto decisionType = getDecisionType(inFlightReqs, req);
     auto sortedHosts = getSortedHosts(hostMap, inFlightReqs, req, decisionType);
 
+    // For an OpenMP request with the single host hint, we only consider
+    // scheduling in one VM
+    bool isOmp = req->messages_size() > 0 && req->messages(0).isomp();
+    if (req->singlehosthint() && isOmp) {
+        sortedHosts.erase(sortedHosts.begin() + 1, sortedHosts.end());
+    }
+
     // Assign slots from the list (i.e. bin-pack)
     auto it = sortedHosts.begin();
     int numLeftToSchedule = req->messages_size();
diff --git a/src/batch-scheduler/SchedulingDecision.cpp b/src/batch-scheduler/SchedulingDecision.cpp
index 31bd97351..9c953c605 100644
--- a/src/batch-scheduler/SchedulingDecision.cpp
+++ b/src/batch-scheduler/SchedulingDecision.cpp
@@ -15,7 +15,7 @@ bool SchedulingDecision::isSingleHost() const
     std::string thisHost = conf.endpointHost;
 
     std::set<std::string> hostSet(hosts.begin(), hosts.end());
-    return hostSet.size() == 1;
+    return hostSet.size() <= 1;
 }
 
 void SchedulingDecision::addMessage(const std::string& host,
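
With zero-sized thread requests now allowed, a decision can end up holding an empty host list, and the relaxed "<= 1" check above keeps isSingleHost() true in that case. A minimal standalone sketch of just that predicate (standard library only, outside faabric; the real method also consults the system config):

    #include <cassert>
    #include <set>
    #include <string>
    #include <vector>

    // Mirrors the predicate in SchedulingDecision::isSingleHost(): a decision
    // whose messages all land on one host, or that has no messages at all,
    // counts as single-host.
    bool isSingleHost(const std::vector<std::string>& hosts)
    {
        std::set<std::string> hostSet(hosts.begin(), hosts.end());
        return hostSet.size() <= 1;
    }

    int main()
    {
        assert(isSingleHost({}));                  // zero-sized request
        assert(isSingleHost({ "vm-a", "vm-a" }));  // one host, many messages
        assert(!isSingleHost({ "vm-a", "vm-b" })); // spread over two hosts
        return 0;
    }
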
diff --git a/src/planner/Planner.cpp b/src/planner/Planner.cpp
index c5b106cec..bc4017422 100644
--- a/src/planner/Planner.cpp
+++ b/src/planner/Planner.cpp
@@ -19,7 +19,7 @@
 
 // Special group ID magic to indicate MPI decisions that we have preemptively
 // scheduled
-#define MPI_PRELOADED_DECISION_GROUPID -99
+#define FIXED_SIZE_PRELOADED_DECISION_GROUPID -99
 
 namespace faabric::planner {
 
@@ -793,7 +793,7 @@ Planner::callBatch(std::shared_ptr<faabric::BatchExecuteRequest> req)
 
         int numAvail = availableSlots(state.hostMap.at(mainHost));
         int numRequested = req->messages_size();
-        int lastMsgIdx = req->messages(numRequested - 1).groupidx();
+        int lastMsgIdx = numRequested == 0 ? 0 : req->messages(numRequested - 1).groupidx();
         for (int itr = 0; itr < (numAvail - numRequested); itr++) {
             // Differentiate between the position in the message array (itr)
             // and the new group index. Usually, in a fork, they would be
@@ -802,9 +802,25 @@
             SPDLOG_DEBUG("Adding elastically scaled up msg idx {} (app: {})",
                          msgIdx,
                          appId);
             // To add a new message, copy from the last, and update the indexes
-            *req->add_messages() = req->messages(numRequested - 1);
-            req->mutable_messages(numRequested + itr)->set_appidx(msgIdx);
-            req->mutable_messages(numRequested + itr)->set_groupidx(msgIdx);
+            if (numRequested == 0) {
+                // This is a special case where we scale up from zero
+                // parallelism (i.e. 1 OpenMP thread) that requires special
+                // care
+                auto* newMsg = req->add_messages();
+                *newMsg = state.inFlightReqs.at(appId).first->messages(0);
+                newMsg->set_mainhost(mainHost);
+                newMsg->set_appidx(msgIdx);
+                newMsg->set_groupidx(msgIdx);
+
+                // For requests that elastically scale from 1 (i.e. zero-
+                // parallelism) we make use of the group id field to pass the
+                // actual function pointer as a hack
+                newMsg->set_funcptr(req->groupid());
+            } else {
+                *req->add_messages() = req->messages(numRequested - 1);
+                req->mutable_messages(numRequested + itr)->set_appidx(msgIdx);
+                req->mutable_messages(numRequested + itr)->set_groupidx(msgIdx);
+            }
 
             // Also update the message id to make sure we can wait-for and
             // clean-up the resources we use
@@ -831,13 +847,14 @@
         }
     }
 
-    // For a NEW decision of an MPI application, we know that it will be
-    // followed-up by a SCALE_CHANGE one, and that the mpi_world_size parameter
-    // must be set. Thus, we can schedule slots for all the MPI ranks, and
-    // consume them later as a preloaded scheduling decision
+    // For a NEW decision of an MPI/OpenMP application, we know that it will be
+    // followed-up by a SCALE_CHANGE one, and that the size parameter
+    // must be set. Thus, we can schedule slots for all the MPI ranks/OMP
+    // threads, and consume them later as a preloaded scheduling decision
     bool isNew = decisionType == faabric::batch_scheduler::DecisionType::NEW;
-    bool isMpi = req->messages(0).ismpi();
-    std::shared_ptr<faabric::BatchExecuteRequest> mpiReq = nullptr;
+    bool isMpi = req->messages_size() > 0 && req->messages(0).ismpi();
+    bool isOmp = req->messages_size() > 0 && req->messages(0).isomp();
+    std::shared_ptr<faabric::BatchExecuteRequest> knownSizeReq = nullptr;
 
     // Check if there exists a pre-loaded scheduling decision for this app
     // (e.g. if we want to force a migration). Note that we don't want to check
@@ -849,25 +866,27 @@
         // In general, after a scale change decision (that has been preloaded)
         // it is safe to remove it
         if (isScaleChange) {
+            SPDLOG_DEBUG("Removing pre-loaded scheduling decision for app {}", appId);
             state.preloadedSchedulingDecisions.erase(appId);
         }
-    } else if (isNew && isMpi) {
-        mpiReq = std::make_shared<faabric::BatchExecuteRequest>();
-        *mpiReq = *req;
+    } else if (isNew && (isMpi || isOmp)) {
+        knownSizeReq = std::make_shared<faabric::BatchExecuteRequest>();
+        *knownSizeReq = *req;
 
         // Deep-copy as many messages we can from the original BER, and mock
        // the rest
-        for (int i = req->messages_size(); i < req->messages(0).mpiworldsize();
-             i++) {
-            auto* newMpiMsg = mpiReq->add_messages();
+        size_t reqSize = isMpi ? req->messages(0).mpiworldsize() : req->messages(0).ompnumthreads();
+        assert(reqSize > 0);
+        for (int i = req->messages_size(); i < reqSize; i++) {
+            auto* newMpiMsg = knownSizeReq->add_messages();
             newMpiMsg->set_appid(req->appid());
             newMpiMsg->set_groupidx(i);
         }
-        assert(mpiReq->messages_size() == req->messages(0).mpiworldsize());
+        assert(knownSizeReq->messages_size() == reqSize);
 
         decision = batchScheduler->makeSchedulingDecision(
-          hostMapCopy, state.inFlightReqs, mpiReq);
+          hostMapCopy, state.inFlightReqs, knownSizeReq);
     } else {
         decision = batchScheduler->makeSchedulingDecision(
           hostMapCopy, state.inFlightReqs, req);
     }
@@ -953,7 +972,7 @@
     }
 
     // Skip claiming slots and ports if we have preemptively allocated them
-    bool skipClaim = decision->groupId == MPI_PRELOADED_DECISION_GROUPID;
+    bool skipClaim = decision->groupId == FIXED_SIZE_PRELOADED_DECISION_GROUPID;
 
     // A scheduling decision will create a new PTP mapping and, as a
     // consequence, a new group ID
@@ -990,17 +1009,17 @@
 
         // For a NEW MPI decision that was not preloaded we have
         // preemptively scheduled all MPI messages but now we just need to
        // return the first one, and preload the rest
-        if (isMpi && mpiReq != nullptr) {
-            auto mpiDecision = std::make_shared<
+        if ((isMpi || isOmp) && knownSizeReq != nullptr) {
+            auto knownSizeDecision = std::make_shared<
               faabric::batch_scheduler::SchedulingDecision>(req->appid(),
                                                             req->groupid());
-            *mpiDecision = *decision;
-            mpiDecision->groupId = MPI_PRELOADED_DECISION_GROUPID;
-            state.preloadedSchedulingDecisions[appId] = mpiDecision;
+            *knownSizeDecision = *decision;
+            knownSizeDecision->groupId = FIXED_SIZE_PRELOADED_DECISION_GROUPID;
+            state.preloadedSchedulingDecisions[appId] = knownSizeDecision;
 
             // Remove all messages that we do not have to dispatch now
-            for (int i = 1; i < mpiDecision->messageIds.size(); i++) {
-                decision->removeMessage(mpiDecision->messageIds.at(i));
+            for (int i = 1; i < knownSizeDecision->messageIds.size(); i++) {
+                decision->removeMessage(knownSizeDecision->messageIds.at(i));
             }
         }
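
The planner can pre-schedule every MPI rank or OpenMP thread because the first message of a NEW request already carries the final size (mpi_world_size or ompNumThreads). A hedged sketch of how the mocked request gets padded out, based on the preload path above; padKnownSizeRequest is a hypothetical helper, and the accessors are the protobuf-generated ones for the fields used in this patch:

    #include <cassert>
    #include <memory>

    #include <faabric/proto/faabric.pb.h>

    // Hypothetical helper mirroring the preload path in Planner::callBatch:
    // copy the original BER and append empty messages until it reaches the
    // known size (MPI world size, or OpenMP thread count).
    std::shared_ptr<faabric::BatchExecuteRequest> padKnownSizeRequest(
      std::shared_ptr<faabric::BatchExecuteRequest> req)
    {
        assert(req->messages_size() > 0);
        bool isMpi = req->messages(0).ismpi();

        auto padded = std::make_shared<faabric::BatchExecuteRequest>();
        *padded = *req;

        int reqSize = isMpi ? req->messages(0).mpiworldsize()
                            : req->messages(0).ompnumthreads();
        assert(reqSize > 0);

        for (int i = req->messages_size(); i < reqSize; i++) {
            auto* newMsg = padded->add_messages();
            newMsg->set_appid(req->appid());
            newMsg->set_groupidx(i);
        }

        assert(padded->messages_size() == reqSize);
        return padded;
    }
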
diff --git a/src/planner/PlannerClient.cpp b/src/planner/PlannerClient.cpp
index 0dc3a7e97..237e278d7 100644
--- a/src/planner/PlannerClient.cpp
+++ b/src/planner/PlannerClient.cpp
@@ -307,13 +307,16 @@ faabric::batch_scheduler::SchedulingDecision PlannerClient::callFunctions(
     // to other hosts. Given that we don't support nested threading, if we
     // have a THREADS request here it means that we are being called from the
     // main thread (which holds the main snapshot)
-    const std::string funcStr =
-      faabric::util::funcToString(req->messages(0), false);
+    std::string snapshotKey;
     auto& reg = faabric::snapshot::getSnapshotRegistry();
 
-    std::string snapshotKey;
-    const auto firstMsg = req->messages(0);
-    if (isThreads) {
+    // Note that with threads we may have 0-sized BERs
+    if (isThreads && req->messages_size() > 0) {
+        const std::string funcStr =
+          faabric::util::funcToString(req->messages(0), false);
+
+        const auto firstMsg = req->messages(0);
+
         if (!firstMsg.snapshotkey().empty()) {
             SPDLOG_ERROR("{} should not provide snapshot key for {} threads",
                          funcStr,
diff --git a/src/proto/faabric.proto b/src/proto/faabric.proto
index 56abf85dd..a875305a2 100644
--- a/src/proto/faabric.proto
+++ b/src/proto/faabric.proto
@@ -145,6 +145,10 @@ message Message {
     repeated int32 chainedMsgIds = 36;
     map<string, int32> intExecGraphDetails = 37;
    map<string, string> execGraphDetails = 38;
+
+    // OpenMP
+    bool isOmp = 39;
+    int32 ompNumThreads = 40;
 }
 
 // ---------------------------------------------
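
The two new Message fields let callers tag a request as OpenMP and carry the target thread count, which is what the bin-pack scheduler and planner key off together with the existing single host hint. A small usage sketch, assuming the standard protobuf-generated setters for the fields above (set_isomp, set_ompnumthreads, set_singlehosthint); user/function details are omitted:

    #include <memory>

    #include <faabric/proto/faabric.pb.h>

    // Build a THREADS request whose first message advertises the OpenMP
    // parallelism, so the planner can preload a decision for all threads.
    std::shared_ptr<faabric::BatchExecuteRequest> buildOmpRequest(int nThreads)
    {
        auto req = std::make_shared<faabric::BatchExecuteRequest>();
        req->set_type(faabric::BatchExecuteRequest::THREADS);
        req->set_singlehosthint(true);

        auto* msg = req->add_messages();
        msg->set_isomp(true);
        msg->set_ompnumthreads(nThreads);

        return req;
    }
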
diff --git a/src/scheduler/Scheduler.cpp b/src/scheduler/Scheduler.cpp
index 5c16c6666..224710711 100644
--- a/src/scheduler/Scheduler.cpp
+++ b/src/scheduler/Scheduler.cpp
@@ -249,6 +249,10 @@ long Scheduler::getFunctionExecutorCount(const faabric::Message& msg)
 
 void Scheduler::executeBatch(std::shared_ptr<faabric::BatchExecuteRequest> req)
 {
+    if (req->messages_size() == 0) {
+        return;
+    }
+
     faabric::util::FullLock lock(mx);
 
     bool isThreads = req->type() == faabric::BatchExecuteRequest::THREADS;
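
With the early return above, a zero-sized THREADS request becomes a legal no-op on the executor side, and it is the planner that later grows the request elastically. A minimal illustration of the same guard in isolation, assuming only the generated protobuf types; shouldDispatch is a hypothetical helper, not part of the patch:

    #include <memory>

    #include <faabric/proto/faabric.pb.h>

    // Mirrors the new guard in Scheduler::executeBatch: an empty BER is
    // simply ignored rather than dispatched to executors.
    bool shouldDispatch(std::shared_ptr<faabric::BatchExecuteRequest> req)
    {
        return req != nullptr && req->messages_size() > 0;
    }

    int main()
    {
        auto req = std::make_shared<faabric::BatchExecuteRequest>();
        req->set_type(faabric::BatchExecuteRequest::THREADS);

        // Zero messages: the scheduler-side guard skips execution entirely
        if (!shouldDispatch(req)) {
            return 0;
        }

        // ... dispatch path would run here ...
        return 0;
    }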