From 2fae4cc034c6bcb23dc6b7d9eb7a5fd1135cdb58 Mon Sep 17 00:00:00 2001
From: Sergio Afonso <safonsof@amd.com>
Date: Thu, 7 Nov 2024 10:54:52 +0000
Subject: [PATCH] [Flang][OpenMP] Add host_eval clause lowering support (#180)

This patch updates Flang lowering to use the `host_eval` clause in `omp.target`
operations to pass host information into the applicable clauses inside of the
target region, instead of the previous approach where these clauses were
attached to the `omp.target` operation itself.
---
 flang/include/flang/Lower/OpenMP/Utils.h      |   9 -
 flang/lib/Lower/OpenMP/OpenMP.cpp             | 220 +++++++++---------
 flang/lib/Lower/OpenMP/Utils.cpp              |  78 -------
 .../OpenMP/DoConcurrentConversion.cpp         | 210 ++++++-----------
 .../Optimizer/OpenMP/MapInfoFinalization.cpp  |  55 +++--
 .../OpenMP/FIR/mismatched-bound-types.f90     |  14 +-
 .../test/Lower/OpenMP/eval-outside-target.f90 |  85 ++-----
 flang/test/Lower/OpenMP/target-spmd.f90       |  28 +--
 .../Transforms/DoConcurrent/basic_device.f90  |  22 +-
 .../multiple_iteration_ranges.f90             |  77 ++++--
 10 files changed, 332 insertions(+), 466 deletions(-)

diff --git a/flang/include/flang/Lower/OpenMP/Utils.h b/flang/include/flang/Lower/OpenMP/Utils.h
index 7a622e1cb74ee2..f342481781fdd3 100644
--- a/flang/include/flang/Lower/OpenMP/Utils.h
+++ b/flang/include/flang/Lower/OpenMP/Utils.h
@@ -174,15 +174,6 @@ void genObjectList(const ObjectList &objects,
 void lastprivateModifierNotSupported(const omp::clause::Lastprivate &lastp,
                                      mlir::Location loc);
 
-// TODO: consider moving this to the `omp.loop_nest` op. Would be something like
-// this:
-//
-// ```
-// mlir::Value LoopNestOp::calculateTripCount(mlir::OpBuilder &builder,
-// mlir::OpBuilder::InsertPoint ip)
-// ```
-mlir::Value calculateTripCount(fir::FirOpBuilder &builder, mlir::Location loc,
-                               const mlir::omp::LoopRelatedClauseOps &ops);
 } // namespace omp
 } // namespace lower
 } // namespace Fortran
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 418ddc709664ff..ba7d1ecfe38dcb 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -46,6 +46,25 @@ using namespace Fortran::lower::omp;
 // Code generation helper functions
 //===----------------------------------------------------------------------===//
 
+/// Add \c hostVar, which must be defined outside of the target region, as a
+/// host_eval argument of the given target operation.
+///
+/// \return the entry block argument that represents \c hostVar inside of the
+///         target region.
+static mlir::Value addHostEvalVar(mlir::omp::TargetOp targetOp,
+                                  mlir::Value hostVar) {
+  assert(!targetOp.getRegion().isAncestor(hostVar.getParentRegion()) &&
+         "variable must be defined outside of the target region");
+
+  auto argIface = llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(*targetOp);
+  unsigned insertIndex =
+      argIface.getHostEvalBlockArgsStart() + argIface.numHostEvalBlockArgs();
+
+  targetOp.getHostEvalVarsMutable().append(hostVar);
+  return targetOp.getRegion().insertArgument(insertIndex, hostVar.getType(),
+                                             hostVar.getLoc());
+}
+
 namespace {
 /// Structure holding the information needed to create and bind entry block
 /// arguments associated to a single clause.
@@ -64,6 +83,7 @@ struct EntryBlockArgsEntry {
 /// Structure holding the information needed to create and bind entry block
 /// arguments associated to all clauses that can define them.
 struct EntryBlockArgs {
+  EntryBlockArgsEntry hostEval;
   EntryBlockArgsEntry inReduction;
   EntryBlockArgsEntry map;
   EntryBlockArgsEntry priv;
@@ -73,8 +93,8 @@ struct EntryBlockArgs {
   EntryBlockArgsEntry useDevicePtr;
 
   bool isValid() const {
-    return inReduction.isValid() && map.isValid() && priv.isValid() &&
-           reduction.isValid() && taskReduction.isValid() &&
+    return hostEval.isValid() && inReduction.isValid() && map.isValid() &&
+           priv.isValid() && reduction.isValid() && taskReduction.isValid() &&
            useDeviceAddr.isValid() && useDevicePtr.isValid();
   }
 
@@ -177,6 +197,18 @@ static bool evalHasSiblings(const lower::pft::Evaluation &eval) {
       }});
 }
 
+/// Check whether the given omp.target operation is non-null and its parent
+/// module is being compiled for the host device.
+static bool isHostTarget(mlir::omp::TargetOp targetOp) {
+  if (!targetOp)
+    return false;
+
+  auto offloadModOp = llvm::cast<mlir::omp::OffloadModuleInterface>(
+      *targetOp->getParentOfType<mlir::ModuleOp>());
+
+  return !offloadModOp.getIsTargetDevice();
+}
+
 /// Check whether a given evaluation points to an OpenMP loop construct that
 /// represents a target SPMD kernel. For this to be true, it must be a `target
 /// teams distribute parallel do [simd]` or equivalent construct.
@@ -184,7 +216,7 @@ static bool evalHasSiblings(const lower::pft::Evaluation &eval) {
 /// Currently, this is limited to cases where all relevant OpenMP constructs are
 /// either combined or directly nested within the same function. Also, the
 /// composite `distribute parallel do` is not identified if split into two
-/// explicit nested loops (a `distribute` loop and a `parallel do` loop).
+/// explicit nested loops (i.e. a `distribute` loop and a `parallel do` loop).
 static bool isTargetSPMDLoop(const lower::pft::Evaluation &eval) {
   using namespace llvm::omp;
 
@@ -391,6 +423,8 @@ static void bindEntryBlockArgs(lower::AbstractConverter &converter,
   };
 
   // Process in clause name alphabetical order to match block arguments order.
+  bindPrivateLike(args.hostEval.syms, args.hostEval.vars,
+                  op.getHostEvalBlockArgs());
   bindPrivateLike(args.inReduction.syms, args.inReduction.vars,
                   op.getInReductionBlockArgs());
   bindMapLike(args.map.syms, op.getMapBlockArgs());
@@ -455,22 +489,26 @@ static void genNestedEvaluations(lower::AbstractConverter &converter,
     converter.genEval(e);
 }
 
-static bool
-mustEvalTeamsThreadsOutsideTarget(const lower::pft::Evaluation &eval,
-                                  mlir::omp::TargetOp targetOp) {
-  if (!targetOp)
-    return false;
-
-  auto offloadModOp = llvm::cast<mlir::omp::OffloadModuleInterface>(
-      *targetOp->getParentOfType<mlir::ModuleOp>());
-  if (offloadModOp.getIsTargetDevice())
+static bool mustEvalTeamsOutsideTarget(const lower::pft::Evaluation &eval,
+                                       mlir::omp::TargetOp targetOp) {
+  if (!isHostTarget(targetOp))
     return false;
 
   llvm::omp::Directive dir =
       extractOmpDirective(eval.get<parser::OpenMPConstruct>());
+
+  assert(llvm::omp::allTeamsSet.test(dir) && "expected a teams construct");
   return llvm::omp::allTargetSet.test(dir) || !evalHasSiblings(eval);
 }
 
+static bool mustEvalTargetSPMDOutsideTarget(const lower::pft::Evaluation &eval,
+                                            mlir::omp::TargetOp targetOp) {
+  if (!isHostTarget(targetOp))
+    return false;
+
+  return isTargetSPMDLoop(eval);
+}
+
 //===----------------------------------------------------------------------===//
 // HostClausesInsertionGuard
 //===----------------------------------------------------------------------===//
@@ -497,6 +535,8 @@ class HostClausesInsertionGuard {
     }
   }
 
+  mlir::omp::TargetOp getTargetOp() const { return targetOp; }
+
 private:
   mlir::OpBuilder &builder;
   mlir::OpBuilder::InsertPoint ip;
@@ -1012,11 +1052,11 @@ static mlir::Block *genEntryBlock(lower::AbstractConverter &converter,
 
   llvm::SmallVector<mlir::Type> types;
   llvm::SmallVector<mlir::Location> locs;
-  unsigned numVars = args.inReduction.vars.size() + args.map.vars.size() +
-                     args.priv.vars.size() + args.reduction.vars.size() +
-                     args.taskReduction.vars.size() +
-                     args.useDeviceAddr.vars.size() +
-                     args.useDevicePtr.vars.size();
+  unsigned numVars =
+      args.hostEval.vars.size() + args.inReduction.vars.size() +
+      args.map.vars.size() + args.priv.vars.size() +
+      args.reduction.vars.size() + args.taskReduction.vars.size() +
+      args.useDeviceAddr.vars.size() + args.useDevicePtr.vars.size();
   types.reserve(numVars);
   locs.reserve(numVars);
 
@@ -1029,6 +1069,7 @@ static mlir::Block *genEntryBlock(lower::AbstractConverter &converter,
 
   // Populate block arguments in clause name alphabetical order to match
   // expected order by the BlockArgOpenMPOpInterface.
+  extractTypeLoc(args.hostEval.vars);
   extractTypeLoc(args.inReduction.vars);
   extractTypeLoc(args.map.vars);
   extractTypeLoc(args.priv.vars);
@@ -1551,10 +1592,29 @@ static void
 genLoopNestClauses(lower::AbstractConverter &converter,
                    semantics::SemanticsContext &semaCtx,
                    lower::pft::Evaluation &eval, const List<Clause> &clauses,
-                   mlir::Location loc, mlir::omp::LoopNestOperands &clauseOps,
+                   mlir::Location loc, bool evalOutsideTarget,
+                   mlir::omp::LoopNestOperands &clauseOps,
                    llvm::SmallVectorImpl<const semantics::Symbol *> &iv) {
   ClauseProcessor cp(converter, semaCtx, clauses);
-  cp.processCollapse(loc, eval, clauseOps, iv);
+
+  // Evaluate loop bounds and steps on the host device, if the operation is
+  // defining part of a target SPMD kernel.
+  if (evalOutsideTarget) {
+    HostClausesInsertionGuard guard(converter.getFirOpBuilder());
+    cp.processCollapse(loc, eval, clauseOps, iv);
+
+    for (unsigned i = 0; i < clauseOps.loopLowerBounds.size(); ++i) {
+      clauseOps.loopLowerBounds[i] =
+          addHostEvalVar(guard.getTargetOp(), clauseOps.loopLowerBounds[i]);
+      clauseOps.loopUpperBounds[i] =
+          addHostEvalVar(guard.getTargetOp(), clauseOps.loopUpperBounds[i]);
+      clauseOps.loopSteps[i] =
+          addHostEvalVar(guard.getTargetOp(), clauseOps.loopSteps[i]);
+    }
+  } else {
+    cp.processCollapse(loc, eval, clauseOps, iv);
+  }
+
   clauseOps.loopInclusive = converter.getFirOpBuilder().getUnitAttr();
 }
 
@@ -1581,20 +1641,20 @@ static void genParallelClauses(
     lower::StatementContext &stmtCtx, const List<Clause> &clauses,
     mlir::Location loc, bool evalOutsideTarget,
     mlir::omp::ParallelOperands &clauseOps,
-    mlir::omp::NumThreadsClauseOps &numThreadsClauseOps,
     llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processAllocate(clauseOps);
   cp.processIf(llvm::omp::Directive::OMPD_parallel, clauseOps);
 
-  // Don't store num_threads clause operators into clauseOps because then they
-  // would always be added to the omp.parallel operation during its creation.
-  // We might need to attach them to the parent omp.target.
+  // Evaluate NUM_THREADS on the host device, if the operation is defining part
+  // of a target SPMD kernel.
   if (evalOutsideTarget) {
     HostClausesInsertionGuard guard(converter.getFirOpBuilder());
-    cp.processNumThreads(stmtCtx, numThreadsClauseOps);
+    if (cp.processNumThreads(stmtCtx, clauseOps))
+      clauseOps.numThreads =
+          addHostEvalVar(guard.getTargetOp(), clauseOps.numThreads);
   } else {
-    cp.processNumThreads(stmtCtx, numThreadsClauseOps);
+    cp.processNumThreads(stmtCtx, clauseOps);
   }
 
   cp.processProcBind(clauseOps);
@@ -1759,8 +1819,6 @@ static void genTeamsClauses(
     lower::StatementContext &stmtCtx, const List<Clause> &clauses,
     mlir::Location loc, bool evalOutsideTarget,
     mlir::omp::TeamsOperands &clauseOps,
-    mlir::omp::NumTeamsClauseOps &numTeamsClauseOps,
-    mlir::omp::ThreadLimitClauseOps &threadLimitClauseOps,
     llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSyms) {
   ClauseProcessor cp(converter, semaCtx, clauses);
   cp.processAllocate(clauseOps);
@@ -1768,16 +1826,18 @@ static void genTeamsClauses(
 
   // Evaluate NUM_TEAMS and THREAD_LIMIT on the host device, if currently inside
   // of an omp.target operation.
-  // Don't store num_teams and thread_limit clause operators into clauseOps
-  // because then they would always be added to the omp.teams operation during
-  // its creation. We might need to attach them to the parent omp.target.
   if (evalOutsideTarget) {
     HostClausesInsertionGuard guard(converter.getFirOpBuilder());
-    cp.processNumTeams(stmtCtx, numTeamsClauseOps);
-    cp.processThreadLimit(stmtCtx, threadLimitClauseOps);
+    if (cp.processNumTeams(stmtCtx, clauseOps))
+      clauseOps.numTeamsUpper =
+          addHostEvalVar(guard.getTargetOp(), clauseOps.numTeamsUpper);
+
+    if (cp.processThreadLimit(stmtCtx, clauseOps))
+      clauseOps.threadLimit =
+          addHostEvalVar(guard.getTargetOp(), clauseOps.threadLimit);
   } else {
-    cp.processNumTeams(stmtCtx, numTeamsClauseOps);
-    cp.processThreadLimit(stmtCtx, threadLimitClauseOps);
+    cp.processNumTeams(stmtCtx, clauseOps);
+    cp.processThreadLimit(stmtCtx, clauseOps);
   }
   cp.processReduction(loc, clauseOps, reductionSyms);
 }
@@ -1864,7 +1924,6 @@ static mlir::omp::LoopNestOp genLoopNestOp(
         std::pair<mlir::omp::BlockArgOpenMPOpInterface, const EntryBlockArgs &>>
         wrapperArgs,
     llvm::omp::Directive directive, DataSharingProcessor &dsp) {
-  fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
 
   auto ivCallback = [&](mlir::Operation *op) {
     genLoopVars(op, converter, loc, iv, wrapperArgs);
@@ -1882,26 +1941,6 @@ static mlir::omp::LoopNestOp genLoopNestOp(
           .setGenRegionEntryCb(ivCallback),
       queue, item, clauseOps);
 
-  // Create trip_count if inside of omp.target and this is host compilation.
-  auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(
-      firOpBuilder.getModule().getOperation());
-  auto targetOp = loopNestOp->getParentOfType<mlir::omp::TargetOp>();
-
-  if (offloadMod && !offloadMod.getIsTargetDevice() && isTargetSPMDLoop(eval)) {
-    assert(targetOp && "must have omp.target parent");
-
-    // Lower loop bounds and step, and process collapsing again, putting lowered
-    // values outside of omp.target this time. This enables calculating and
-    // accessing the trip count in the host, which is needed when lowering to
-    // LLVM IR via the OMPIRBuilder.
-    HostClausesInsertionGuard guard(firOpBuilder);
-    mlir::omp::LoopRelatedClauseOps loopRelatedOps;
-    llvm::SmallVector<const semantics::Symbol *> iv;
-    ClauseProcessor cp(converter, semaCtx, item->clauses);
-    cp.processCollapse(loc, eval, loopRelatedOps, iv);
-    targetOp.getTripCountMutable().assign(
-        calculateTripCount(firOpBuilder, loc, loopRelatedOps));
-  }
   return loopNestOp;
 }
 
@@ -1960,7 +1999,6 @@ static mlir::omp::ParallelOp genParallelOp(
     semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
     mlir::Location loc, const ConstructQueue &queue,
     ConstructQueue::const_iterator item, mlir::omp::ParallelOperands &clauseOps,
-    mlir::omp::NumThreadsClauseOps &numThreadsClauseOps,
     const EntryBlockArgs &args, DataSharingProcessor *dsp,
     bool isComposite = false, mlir::omp::TargetOp parentTarget = nullptr) {
   auto genRegionEntryCB = [&](mlir::Operation *op) {
@@ -1983,13 +2021,6 @@ static mlir::omp::ParallelOp genParallelOp(
   auto parallelOp =
       genOpWithBody<mlir::omp::ParallelOp>(genInfo, queue, item, clauseOps);
   parallelOp.setComposite(isComposite);
-  if (numThreadsClauseOps.numThreads) {
-    if (parentTarget)
-      parentTarget.getNumThreadsMutable().assign(
-          numThreadsClauseOps.numThreads);
-    else
-      parallelOp.getNumThreadsMutable().assign(numThreadsClauseOps.numThreads);
-  }
   return parallelOp;
 }
 
@@ -2278,6 +2309,7 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
   extractMappedBaseValues(clauseOps.mapVars, mapBaseValues);
 
   EntryBlockArgs args;
+  // TODO: Fill hostEval in advance rather than adding to it later on.
   // TODO: Add in_reduction syms and vars.
   args.map.syms = mapSyms;
   args.map.vars = mapBaseValues;
@@ -2411,15 +2443,12 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
 
   mlir::omp::TargetOp targetOp =
       findParentTargetOp(converter.getFirOpBuilder());
-  bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp);
+  bool evalOutsideTarget = mustEvalTeamsOutsideTarget(eval, targetOp);
 
   mlir::omp::TeamsOperands clauseOps;
-  mlir::omp::NumTeamsClauseOps numTeamsClauseOps;
-  mlir::omp::ThreadLimitClauseOps threadLimitClauseOps;
   llvm::SmallVector<const semantics::Symbol *> reductionSyms;
   genTeamsClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
-                  evalOutsideTarget, clauseOps, numTeamsClauseOps,
-                  threadLimitClauseOps, reductionSyms);
+                  evalOutsideTarget, clauseOps, reductionSyms);
 
   EntryBlockArgs args;
   // TODO: Add private syms and vars.
@@ -2441,22 +2470,6 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
           .setGenRegionEntryCb(genRegionEntryCB),
       queue, item, clauseOps);
 
-  if (numTeamsClauseOps.numTeamsUpper) {
-    if (evalOutsideTarget)
-      targetOp.getNumTeamsUpperMutable().assign(
-          numTeamsClauseOps.numTeamsUpper);
-    else
-      teamsOp.getNumTeamsUpperMutable().assign(numTeamsClauseOps.numTeamsUpper);
-  }
-
-  if (threadLimitClauseOps.threadLimit) {
-    if (evalOutsideTarget)
-      targetOp.getTeamsThreadLimitMutable().assign(
-          threadLimitClauseOps.threadLimit);
-    else
-      teamsOp.getThreadLimitMutable().assign(threadLimitClauseOps.threadLimit);
-  }
-
   return teamsOp;
 }
 
@@ -2487,7 +2500,7 @@ static void genStandaloneDistribute(lower::AbstractConverter &converter,
   mlir::omp::LoopNestOperands loopNestClauseOps;
   llvm::SmallVector<const semantics::Symbol *> iv;
   genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc,
-                     loopNestClauseOps, iv);
+                     /*evalOutsideTarget=*/false, loopNestClauseOps, iv);
 
   EntryBlockArgs distributeArgs;
   distributeArgs.priv.syms = dsp.getDelayedPrivSymbols();
@@ -2522,7 +2535,7 @@ static void genStandaloneDo(lower::AbstractConverter &converter,
   mlir::omp::LoopNestOperands loopNestClauseOps;
   llvm::SmallVector<const semantics::Symbol *> iv;
   genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc,
-                     loopNestClauseOps, iv);
+                     /*evalOutsideTarget=*/false, loopNestClauseOps, iv);
 
   EntryBlockArgs wsloopArgs;
   // TODO: Add private syms and vars.
@@ -2545,15 +2558,10 @@ static void genStandaloneParallel(lower::AbstractConverter &converter,
                                   ConstructQueue::const_iterator item) {
   lower::StatementContext stmtCtx;
 
-  mlir::omp::TargetOp targetOp =
-      findParentTargetOp(converter.getFirOpBuilder());
-  bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp);
-
   mlir::omp::ParallelOperands parallelClauseOps;
-  mlir::omp::NumThreadsClauseOps numThreadsClauseOps;
   llvm::SmallVector<const semantics::Symbol *> parallelReductionSyms;
   genParallelClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
-                     evalOutsideTarget, parallelClauseOps, numThreadsClauseOps,
+                     /*evalOutsideTarget=*/false, parallelClauseOps,
                      parallelReductionSyms);
 
   std::optional<DataSharingProcessor> dsp;
@@ -2572,9 +2580,9 @@ static void genStandaloneParallel(lower::AbstractConverter &converter,
   parallelArgs.reduction.syms = parallelReductionSyms;
   parallelArgs.reduction.vars = parallelClauseOps.reductionVars;
   genParallelOp(converter, symTable, semaCtx, eval, loc, queue, item,
-                parallelClauseOps, numThreadsClauseOps, parallelArgs,
+                parallelClauseOps, parallelArgs,
                 enableDelayedPrivatization ? &dsp.value() : nullptr,
-                /*isComposite=*/false, evalOutsideTarget ? targetOp : nullptr);
+                /*isComposite=*/false);
 }
 
 static void genStandaloneSimd(lower::AbstractConverter &converter,
@@ -2598,7 +2606,7 @@ static void genStandaloneSimd(lower::AbstractConverter &converter,
   mlir::omp::LoopNestOperands loopNestClauseOps;
   llvm::SmallVector<const semantics::Symbol *> iv;
   genLoopNestClauses(converter, semaCtx, eval, item->clauses, loc,
-                     loopNestClauseOps, iv);
+                     /*evalOutsideTarget=*/false, loopNestClauseOps, iv);
 
   EntryBlockArgs simdArgs;
   // TODO: Add private syms and vars.
@@ -2640,14 +2648,13 @@ static void genCompositeDistributeParallelDo(
 
   mlir::omp::TargetOp targetOp =
       findParentTargetOp(converter.getFirOpBuilder());
-  bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp);
+  bool evalOutsideTarget = mustEvalTargetSPMDOutsideTarget(eval, targetOp);
 
   // Create parent omp.parallel first.
   mlir::omp::ParallelOperands parallelClauseOps;
-  mlir::omp::NumThreadsClauseOps numThreadsClauseOps;
   llvm::SmallVector<const semantics::Symbol *> parallelReductionSyms;
   genParallelClauses(converter, semaCtx, stmtCtx, parallelItem->clauses, loc,
-                     evalOutsideTarget, parallelClauseOps, numThreadsClauseOps,
+                     evalOutsideTarget, parallelClauseOps,
                      parallelReductionSyms);
 
   DataSharingProcessor dsp(converter, semaCtx, doItem->clauses, eval,
@@ -2662,7 +2669,7 @@ static void genCompositeDistributeParallelDo(
   parallelArgs.reduction.syms = parallelReductionSyms;
   parallelArgs.reduction.vars = parallelClauseOps.reductionVars;
   genParallelOp(converter, symTable, semaCtx, eval, loc, queue, parallelItem,
-                parallelClauseOps, numThreadsClauseOps, parallelArgs, &dsp,
+                parallelClauseOps, parallelArgs, &dsp,
                 /*isComposite=*/true, evalOutsideTarget ? targetOp : nullptr);
 
   // Clause processing.
@@ -2678,7 +2685,7 @@ static void genCompositeDistributeParallelDo(
   mlir::omp::LoopNestOperands loopNestClauseOps;
   llvm::SmallVector<const semantics::Symbol *> iv;
   genLoopNestClauses(converter, semaCtx, eval, doItem->clauses, loc,
-                     loopNestClauseOps, iv);
+                     evalOutsideTarget, loopNestClauseOps, iv);
 
   // Operation creation.
   EntryBlockArgs distributeArgs;
@@ -2716,14 +2723,13 @@ static void genCompositeDistributeParallelDoSimd(
 
   mlir::omp::TargetOp targetOp =
       findParentTargetOp(converter.getFirOpBuilder());
-  bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp);
+  bool evalOutsideTarget = mustEvalTargetSPMDOutsideTarget(eval, targetOp);
 
   // Create parent omp.parallel first.
   mlir::omp::ParallelOperands parallelClauseOps;
-  mlir::omp::NumThreadsClauseOps numThreadsClauseOps;
   llvm::SmallVector<const semantics::Symbol *> parallelReductionSyms;
   genParallelClauses(converter, semaCtx, stmtCtx, parallelItem->clauses, loc,
-                     evalOutsideTarget, parallelClauseOps, numThreadsClauseOps,
+                     evalOutsideTarget, parallelClauseOps,
                      parallelReductionSyms);
 
   DataSharingProcessor dsp(converter, semaCtx, simdItem->clauses, eval,
@@ -2738,7 +2744,7 @@ static void genCompositeDistributeParallelDoSimd(
   parallelArgs.reduction.syms = parallelReductionSyms;
   parallelArgs.reduction.vars = parallelClauseOps.reductionVars;
   genParallelOp(converter, symTable, semaCtx, eval, loc, queue, parallelItem,
-                parallelClauseOps, numThreadsClauseOps, parallelArgs, &dsp,
+                parallelClauseOps, parallelArgs, &dsp,
                 /*isComposite=*/true, evalOutsideTarget ? targetOp : nullptr);
 
   // Clause processing.
@@ -2759,7 +2765,7 @@ static void genCompositeDistributeParallelDoSimd(
   mlir::omp::LoopNestOperands loopNestClauseOps;
   llvm::SmallVector<const semantics::Symbol *> iv;
   genLoopNestClauses(converter, semaCtx, eval, simdItem->clauses, loc,
-                     loopNestClauseOps, iv);
+                     evalOutsideTarget, loopNestClauseOps, iv);
 
   // Operation creation.
   EntryBlockArgs distributeArgs;
@@ -2827,7 +2833,7 @@ static void genCompositeDistributeSimd(lower::AbstractConverter &converter,
   mlir::omp::LoopNestOperands loopNestClauseOps;
   llvm::SmallVector<const semantics::Symbol *> iv;
   genLoopNestClauses(converter, semaCtx, eval, simdItem->clauses, loc,
-                     loopNestClauseOps, iv);
+                     /*evalOutsideTarget=*/false, loopNestClauseOps, iv);
 
   // Operation creation.
   EntryBlockArgs distributeArgs;
@@ -2885,7 +2891,7 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter,
   mlir::omp::LoopNestOperands loopNestClauseOps;
   llvm::SmallVector<const semantics::Symbol *> iv;
   genLoopNestClauses(converter, semaCtx, eval, simdItem->clauses, loc,
-                     loopNestClauseOps, iv);
+                     /*evalOutsideTarget=*/false, loopNestClauseOps, iv);
 
   // Operation creation.
   EntryBlockArgs wsloopArgs;
diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp
index c705275e17ef96..c57cccfea738cb 100644
--- a/flang/lib/Lower/OpenMP/Utils.cpp
+++ b/flang/lib/Lower/OpenMP/Utils.cpp
@@ -538,84 +538,6 @@ void lastprivateModifierNotSupported(const omp::clause::Lastprivate &lastp,
   }
 }
 
-mlir::Value calculateTripCount(fir::FirOpBuilder &builder, mlir::Location loc,
-                               const mlir::omp::LoopRelatedClauseOps &ops) {
-  using namespace mlir::arith;
-  assert(ops.loopLowerBounds.size() == ops.loopUpperBounds.size() &&
-         ops.loopLowerBounds.size() == ops.loopSteps.size() &&
-         !ops.loopLowerBounds.empty() && "Invalid bounds or step");
-
-  // Get the bit width of an integer-like type.
-  auto widthOf = [](mlir::Type ty) -> unsigned {
-    if (mlir::isa<mlir::IndexType>(ty)) {
-      return mlir::IndexType::kInternalStorageBitWidth;
-    }
-    if (auto tyInt = mlir::dyn_cast<mlir::IntegerType>(ty)) {
-      return tyInt.getWidth();
-    }
-    llvm_unreachable("Unexpected type");
-  };
-
-  // For a type that is either IntegerType or IndexType, return the
-  // equivalent IntegerType. In the former case this is a no-op.
-  auto asIntTy = [&](mlir::Type ty) -> mlir::IntegerType {
-    if (ty.isIndex()) {
-      return mlir::IntegerType::get(ty.getContext(), widthOf(ty));
-    }
-    assert(ty.isIntOrIndex() && "Unexpected type");
-    return mlir::cast<mlir::IntegerType>(ty);
-  };
-
-  // For two given values, establish a common signless IntegerType
-  // that can represent any value of type of x and of type of y,
-  // and return the pair of x, y converted to the new type.
-  auto unifyToSignless =
-      [&](fir::FirOpBuilder &b, mlir::Value x,
-          mlir::Value y) -> std::pair<mlir::Value, mlir::Value> {
-    auto tyX = asIntTy(x.getType()), tyY = asIntTy(y.getType());
-    unsigned width = std::max(widthOf(tyX), widthOf(tyY));
-    auto wideTy = mlir::IntegerType::get(b.getContext(), width,
-                                         mlir::IntegerType::Signless);
-    return std::make_pair(b.createConvert(loc, wideTy, x),
-                          b.createConvert(loc, wideTy, y));
-  };
-
-  // Start with signless i32 by default.
-  auto tripCount = builder.createIntegerConstant(loc, builder.getI32Type(), 1);
-
-  for (auto [origLb, origUb, origStep] :
-       llvm::zip(ops.loopLowerBounds, ops.loopUpperBounds, ops.loopSteps)) {
-    auto tmpS0 = builder.createIntegerConstant(loc, origStep.getType(), 0);
-    auto [step, step0] = unifyToSignless(builder, origStep, tmpS0);
-    auto reverseCond =
-        builder.create<CmpIOp>(loc, CmpIPredicate::slt, step, step0);
-    auto negStep = builder.create<SubIOp>(loc, step0, step);
-    mlir::Value absStep =
-        builder.create<SelectOp>(loc, reverseCond, negStep, step);
-
-    auto [lb, ub] = unifyToSignless(builder, origLb, origUb);
-    auto start = builder.create<SelectOp>(loc, reverseCond, ub, lb);
-    auto end = builder.create<SelectOp>(loc, reverseCond, lb, ub);
-
-    mlir::Value range = builder.create<SubIOp>(loc, end, start);
-    auto rangeCond =
-        builder.create<CmpIOp>(loc, CmpIPredicate::slt, end, start);
-    std::tie(range, absStep) = unifyToSignless(builder, range, absStep);
-    // numSteps = (range /u absStep) + 1
-    auto numSteps = builder.create<AddIOp>(
-        loc, builder.create<DivUIOp>(loc, range, absStep),
-        builder.createIntegerConstant(loc, range.getType(), 1));
-
-    auto trip0 = builder.createIntegerConstant(loc, numSteps.getType(), 0);
-    auto loopTripCount =
-        builder.create<SelectOp>(loc, rangeCond, trip0, numSteps);
-    auto [totalTC, thisTC] = unifyToSignless(builder, tripCount, loopTripCount);
-    tripCount = builder.create<MulIOp>(loc, totalTC, thisTC);
-  }
-
-  return tripCount;
-}
-
 } // namespace omp
 } // namespace lower
 } // namespace Fortran
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index e2a109126810dd..5d802b50a8c76c 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -45,14 +45,12 @@ namespace internal {
 // TODO The following 2 functions are copied from "flang/Lower/OpenMP/Utils.h".
 // This duplication is temporary until we find a solution for a shared location
 // for these utils that does not introduce circular CMake deps.
-mlir::omp::MapInfoOp
-createMapInfoOp(mlir::OpBuilder &builder, mlir::Location loc,
-                mlir::Value baseAddr, mlir::Value varPtrPtr, std::string name,
-                llvm::ArrayRef<mlir::Value> bounds,
-                llvm::ArrayRef<mlir::Value> members,
-                mlir::ArrayAttr membersIndex, uint64_t mapType,
-                mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy,
-                bool partialMap = false) {
+mlir::omp::MapInfoOp createMapInfoOp(
+    mlir::OpBuilder &builder, mlir::Location loc, mlir::Value baseAddr,
+    mlir::Value varPtrPtr, std::string name, llvm::ArrayRef<mlir::Value> bounds,
+    llvm::ArrayRef<mlir::Value> members, mlir::ArrayAttr membersIndex,
+    uint64_t mapType, mlir::omp::VariableCaptureKind mapCaptureType,
+    mlir::Type retTy, bool partialMap = false) {
   if (auto boxTy = llvm::dyn_cast<fir::BaseBoxType>(baseAddr.getType())) {
     baseAddr = builder.create<fir::BoxAddrOp>(loc, baseAddr);
     retTy = baseAddr.getType();
@@ -77,84 +75,6 @@ createMapInfoOp(mlir::OpBuilder &builder, mlir::Location loc,
   return op;
 }
 
-mlir::Value calculateTripCount(fir::FirOpBuilder &builder, mlir::Location loc,
-                               const mlir::omp::LoopRelatedClauseOps &ops) {
-  using namespace mlir::arith;
-  assert(ops.loopLowerBounds.size() == ops.loopUpperBounds.size() &&
-         ops.loopLowerBounds.size() == ops.loopSteps.size() &&
-         !ops.loopLowerBounds.empty() && "Invalid bounds or step");
-
-  // Get the bit width of an integer-like type.
-  auto widthOf = [](mlir::Type ty) -> unsigned {
-    if (mlir::isa<mlir::IndexType>(ty)) {
-      return mlir::IndexType::kInternalStorageBitWidth;
-    }
-    if (auto tyInt = mlir::dyn_cast<mlir::IntegerType>(ty)) {
-      return tyInt.getWidth();
-    }
-    llvm_unreachable("Unexpected type");
-  };
-
-  // For a type that is either IntegerType or IndexType, return the
-  // equivalent IntegerType. In the former case this is a no-op.
-  auto asIntTy = [&](mlir::Type ty) -> mlir::IntegerType {
-    if (ty.isIndex()) {
-      return mlir::IntegerType::get(ty.getContext(), widthOf(ty));
-    }
-    assert(ty.isIntOrIndex() && "Unexpected type");
-    return mlir::cast<mlir::IntegerType>(ty);
-  };
-
-  // For two given values, establish a common signless IntegerType
-  // that can represent any value of type of x and of type of y,
-  // and return the pair of x, y converted to the new type.
-  auto unifyToSignless =
-      [&](fir::FirOpBuilder &b, mlir::Value x,
-          mlir::Value y) -> std::pair<mlir::Value, mlir::Value> {
-    auto tyX = asIntTy(x.getType()), tyY = asIntTy(y.getType());
-    unsigned width = std::max(widthOf(tyX), widthOf(tyY));
-    auto wideTy = mlir::IntegerType::get(b.getContext(), width,
-                                         mlir::IntegerType::Signless);
-    return std::make_pair(b.createConvert(loc, wideTy, x),
-                          b.createConvert(loc, wideTy, y));
-  };
-
-  // Start with signless i32 by default.
-  auto tripCount = builder.createIntegerConstant(loc, builder.getI32Type(), 1);
-
-  for (auto [origLb, origUb, origStep] :
-       llvm::zip(ops.loopLowerBounds, ops.loopUpperBounds, ops.loopSteps)) {
-    auto tmpS0 = builder.createIntegerConstant(loc, origStep.getType(), 0);
-    auto [step, step0] = unifyToSignless(builder, origStep, tmpS0);
-    auto reverseCond =
-        builder.create<CmpIOp>(loc, CmpIPredicate::slt, step, step0);
-    auto negStep = builder.create<SubIOp>(loc, step0, step);
-    mlir::Value absStep =
-        builder.create<SelectOp>(loc, reverseCond, negStep, step);
-
-    auto [lb, ub] = unifyToSignless(builder, origLb, origUb);
-    auto start = builder.create<SelectOp>(loc, reverseCond, ub, lb);
-    auto end = builder.create<SelectOp>(loc, reverseCond, lb, ub);
-
-    mlir::Value range = builder.create<SubIOp>(loc, end, start);
-    auto rangeCond =
-        builder.create<CmpIOp>(loc, CmpIPredicate::slt, end, start);
-    std::tie(range, absStep) = unifyToSignless(builder, range, absStep);
-    // numSteps = (range /u absStep) + 1
-    auto numSteps = builder.create<AddIOp>(
-        loc, builder.create<DivUIOp>(loc, range, absStep),
-        builder.createIntegerConstant(loc, range.getType(), 1));
-
-    auto trip0 = builder.createIntegerConstant(loc, numSteps.getType(), 0);
-    auto loopTripCount =
-        builder.create<SelectOp>(loc, rangeCond, trip0, numSteps);
-    auto [totalTC, thisTC] = unifyToSignless(builder, tripCount, loopTripCount);
-    tripCount = builder.create<MulIOp>(loc, totalTC, thisTC);
-  }
-
-  return tripCount;
-}
-
 /// Check if cloning the bounds introduced any dependency on the outer region.
 /// If so, then either clone them as well if they are MemoryEffectFree, or else
 /// copy them to a new temporary and add them to the map and block_argument
@@ -664,7 +584,19 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
     mlir::IRMapping mapper;
 
     if (mapToDevice) {
+      // TODO: The bounds of the outermost loop are currently emitted twice.
       mlir::omp::TargetOperands targetClauseOps;
+      genLoopNestClauseOps(doLoop.getLoc(), rewriter, loopNest, mapper,
+                           loopNestClauseOps, &targetClauseOps);
+
+      // Live-ins that are passed through host_eval must not be mapped as well.
+      outermostLoopLiveIns.erase(
+          llvm::remove_if(outermostLoopLiveIns,
+                          [&](mlir::Value liveIn) {
+                            return llvm::is_contained(
+                                targetClauseOps.hostEvalVars, liveIn);
+                          }),
+          outermostLoopLiveIns.end());
 
       // The outermost loop will contain all the live-in values in all nested
       // loops since live-in values are collected recursively for all nested
@@ -673,16 +605,21 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
         targetClauseOps.mapVars.push_back(
             genMapInfoOpForLiveIn(rewriter, liveIn));
 
-      targetOp = genTargetOp(doLoop.getLoc(), rewriter, mapper,
-                             outermostLoopLiveIns, targetClauseOps);
+      targetOp =
+          genTargetOp(doLoop.getLoc(), rewriter, mapper, outermostLoopLiveIns,
+                      targetClauseOps, loopNestClauseOps);
       genTeamsOp(doLoop.getLoc(), rewriter);
     }
 
-    mlir::omp::ParallelOp parallelOp = genParallelOp(
-        doLoop.getLoc(), rewriter, loopNest, mapper, loopNestClauseOps);
+    mlir::omp::ParallelOp parallelOp =
+        genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper);
     // Only set as composite when part of `distribute parallel do`.
     parallelOp.setComposite(mapToDevice);
 
+    if (!mapToDevice)
+      genLoopNestClauseOps(doLoop.getLoc(), rewriter, loopNest, mapper,
+                           loopNestClauseOps);
+
     for (mlir::Value local : locals)
       looputils::localizeLoopLocalValue(local, parallelOp.getRegion(),
                                         rewriter);
@@ -694,23 +631,6 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
         genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps,
                     /*isComposite=*/mapToDevice);
 
-    // Now that we created the nested `ws.loop` op, we set can the `target` op's
-    // trip count.
-    if (mapToDevice) {
-      rewriter.setInsertionPoint(targetOp);
-      auto parentModule = doLoop->getParentOfType<mlir::ModuleOp>();
-      fir::FirOpBuilder firBuilder(rewriter, fir::getKindMapping(parentModule));
-
-      mlir::omp::LoopRelatedClauseOps loopClauseOps;
-      loopClauseOps.loopLowerBounds.push_back(lbOp->getResult(0));
-      loopClauseOps.loopUpperBounds.push_back(ubOp->getResult(0));
-      loopClauseOps.loopSteps.push_back(stepOp->getResult(0));
-
-      mlir::cast<mlir::omp::TargetOp>(targetOp).getTripCountMutable().assign(
-          Fortran::lower::omp::internal::calculateTripCount(
-              firBuilder, doLoop.getLoc(), loopClauseOps));
-    }
-
     rewriter.eraseOp(doLoop);
 
     // Mark `unordered` loops that are not perfectly nested to be skipped from
@@ -804,27 +724,29 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
         captureKind, rawAddr.getType());
   }
 
-  mlir::omp::TargetOp genTargetOp(mlir::Location loc,
-                                  mlir::ConversionPatternRewriter &rewriter,
-                                  mlir::IRMapping &mapper,
-                                  llvm::ArrayRef<mlir::Value> liveIns,
-                                  mlir::omp::TargetOperands &clauseOps) const {
+  mlir::omp::TargetOp
+  genTargetOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
+              mlir::IRMapping &mapper, llvm::ArrayRef<mlir::Value> mappedVars,
+              mlir::omp::TargetOperands &clauseOps,
+              mlir::omp::LoopNestOperands &loopNestClauseOps) const {
     auto targetOp = rewriter.create<mlir::omp::TargetOp>(loc, clauseOps);
+    auto argIface = llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(*targetOp);
 
     mlir::Region &region = targetOp.getRegion();
 
-    llvm::SmallVector<mlir::Type> liveInTypes;
-    llvm::SmallVector<mlir::Location> liveInLocs;
+    llvm::SmallVector<mlir::Type> regionArgTypes;
+    llvm::SmallVector<mlir::Location> regionArgLocs;
 
-    for (mlir::Value liveIn : liveIns) {
-      liveInTypes.push_back(liveIn.getType());
-      liveInLocs.push_back(liveIn.getLoc());
+    for (auto var :
+         llvm::concat<const mlir::Value>(clauseOps.hostEvalVars, mappedVars)) {
+      regionArgTypes.push_back(var.getType());
+      regionArgLocs.push_back(var.getLoc());
     }
 
-    rewriter.createBlock(&region, {}, liveInTypes, liveInLocs);
+    rewriter.createBlock(&region, {}, regionArgTypes, regionArgLocs);
 
     for (auto [arg, mapInfoOp] :
-         llvm::zip_equal(region.getArguments(), clauseOps.mapVars)) {
+         llvm::zip_equal(argIface.getMapBlockArgs(), clauseOps.mapVars)) {
       auto miOp = mlir::cast<mlir::omp::MapInfoOp>(mapInfoOp.getDefiningOp());
       hlfir::DeclareOp liveInDeclare = genLiveInDeclare(rewriter, arg, miOp);
       mlir::Value miOperand = miOp.getVariableOperand(0);
@@ -841,6 +763,19 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
         mapper.map(origDeclareOp.getBase(), liveInDeclare.getBase());
     }
 
+    for (auto [arg, hostEval] : llvm::zip_equal(argIface.getHostEvalBlockArgs(),
+                                                clauseOps.hostEvalVars))
+      mapper.map(hostEval, arg);
+
+    for (unsigned i = 0; i < loopNestClauseOps.loopLowerBounds.size(); ++i) {
+      loopNestClauseOps.loopLowerBounds[i] =
+          mapper.lookup(loopNestClauseOps.loopLowerBounds[i]);
+      loopNestClauseOps.loopUpperBounds[i] =
+          mapper.lookup(loopNestClauseOps.loopUpperBounds[i]);
+      loopNestClauseOps.loopSteps[i] =
+          mapper.lookup(loopNestClauseOps.loopSteps[i]);
+    }
+
     fir::FirOpBuilder firBuilder(
         rewriter,
         fir::getKindMapping(targetOp->getParentOfType<mlir::ModuleOp>()));
@@ -909,7 +844,8 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
   void genLoopNestClauseOps(
       mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
       looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper,
-      mlir::omp::LoopNestOperands &loopNestClauseOps) const {
+      mlir::omp::LoopNestOperands &loopNestClauseOps,
+      mlir::omp::TargetOperands *targetClauseOps = nullptr) const {
     assert(loopNestClauseOps.loopLowerBounds.empty() &&
            "Loop nest bounds were already emitted!");
 
@@ -930,18 +866,21 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
       return result;
     };
 
-    for (auto &[doLoop, _] : loopNest) {
-      mlir::Operation *lbOp = doLoop.getLowerBound().getDefiningOp();
-      loopNestClauseOps.loopLowerBounds.push_back(
-          cloneBoundOrStepOpChain(lbOp)->getResult(0));
+    auto hostEvalCapture = [&](mlir::Value var,
+                               llvm::SmallVectorImpl<mlir::Value> &bounds) {
+      var = cloneBoundOrStepOpChain(var.getDefiningOp())->getResult(0);
+      bounds.push_back(var);
 
-      mlir::Operation *ubOp = doLoop.getUpperBound().getDefiningOp();
-      loopNestClauseOps.loopUpperBounds.push_back(
-          cloneBoundOrStepOpChain(ubOp)->getResult(0));
+      if (targetClauseOps)
+        targetClauseOps->hostEvalVars.push_back(var);
+    };
 
-      mlir::Operation *stepOp = doLoop.getStep().getDefiningOp();
-      loopNestClauseOps.loopSteps.push_back(
-          cloneBoundOrStepOpChain(stepOp)->getResult(0));
+    for (auto &[doLoop, _] : loopNest) {
+      hostEvalCapture(doLoop.getLowerBound(),
+                      loopNestClauseOps.loopLowerBounds);
+      hostEvalCapture(doLoop.getUpperBound(),
+                      loopNestClauseOps.loopUpperBounds);
+      hostEvalCapture(doLoop.getStep(), loopNestClauseOps.loopSteps);
     }
 
     loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
@@ -985,18 +924,15 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
     return result;
   }
 
-  mlir::omp::ParallelOp
-  genParallelOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
-                looputils::LoopNestToIndVarMap &loopNest,
-                mlir::IRMapping &mapper,
-                mlir::omp::LoopNestOperands &loopNestClauseOps) const {
+  mlir::omp::ParallelOp genParallelOp(mlir::Location loc,
+                                      mlir::ConversionPatternRewriter &rewriter,
+                                      looputils::LoopNestToIndVarMap &loopNest,
+                                      mlir::IRMapping &mapper) const {
     auto parallelOp = rewriter.create<mlir::omp::ParallelOp>(loc);
     rewriter.createBlock(&parallelOp.getRegion());
     rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));
 
     genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
-    genLoopNestClauseOps(loc, rewriter, loopNest, mapper, loopNestClauseOps);
-
     return parallelOp;
   }
 
diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp
index 8faffa20bb942a..9e980171a0c58b 100644
--- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp
+++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp
@@ -394,45 +394,44 @@ mlir::omp::MapInfoOp genDescriptorMemberMaps(mlir::omp::MapInfoOp op,
     if (!mapClauseOwner)
       return;
 
-    auto addOperands = [&](mlir::OperandRange &mapVarsArr,
-                           mlir::MutableOperandRange &mutableOpRange,
-                           auto directiveOp) {
+    auto addOperands = [&](mlir::MutableOperandRange &mapVarsArr,
+                           mlir::Operation *directiveOp,
+                           unsigned mapArgsStart = 0) {
       llvm::SmallVector<mlir::Value> newMapOps;
-      for (size_t i = 0; i < mapVarsArr.size(); ++i) {
-        if (mapVarsArr[i] == op) {
-          for (auto [j, mapMember] : llvm::enumerate(op.getMembers())) {
-            newMapOps.push_back(mapMember);
-            // for TargetOp's which have IsolatedFromAbove we must align the
-            // new additional map operand with an appropriate BlockArgument,
-            // as the printing and later processing currently requires a 1:1
-            // mapping of BlockArgs to MapInfoOp's at the same placement in
-            // each array (BlockArgs and MapVars).
-            if (directiveOp) {
-              directiveOp.getRegion().insertArgument(i + j, mapMember.getType(),
-                                                     directiveOp->getLoc());
-            }
-          }
+      for (auto [i, mapVar] : llvm::enumerate(mapVarsArr)) {
+        if (mapVar.get() != op) {
+          newMapOps.push_back(mapVar.get());
+          continue;
         }
-        newMapOps.push_back(mapVarsArr[i]);
+
+        for (auto [j, mapMember] : llvm::enumerate(op.getMembers())) {
+          newMapOps.push_back(mapMember);
+          if (directiveOp)
+            directiveOp->getRegion(0).insertArgument(
+                mapArgsStart + i + j, mapMember.getType(), mapMember.getLoc());
+        }
+        newMapOps.push_back(mapVar.get());
       }
-      mutableOpRange.assign(newMapOps);
+      mapVarsArr.assign(newMapOps);
     };
 
+    auto argIface =
+        llvm::dyn_cast<mlir::omp::BlockArgOpenMPOpInterface>(target);
+
     if (auto mapClauseOwner =
             llvm::dyn_cast<mlir::omp::MapClauseOwningOpInterface>(target)) {
-      mlir::OperandRange mapVarsArr = mapClauseOwner.getMapVars();
-      mlir::MutableOperandRange mapMutableOpRange =
-          mapClauseOwner.getMapVarsMutable();
-      mlir::omp::TargetOp targetOp =
-          llvm::dyn_cast<mlir::omp::TargetOp>(target);
-      addOperands(mapVarsArr, mapMutableOpRange, targetOp);
+      mlir::MutableOperandRange mapVarsArr = mapClauseOwner.getMapVarsMutable();
+      unsigned blockArgInsertIndex =
+          argIface ? argIface.getMapBlockArgsStart() : 0;
+      addOperands(mapVarsArr, llvm::dyn_cast<mlir::omp::TargetOp>(target),
+                  blockArgInsertIndex);
     }
 
     if (auto targetDataOp = llvm::dyn_cast<mlir::omp::TargetDataOp>(target)) {
-      mlir::OperandRange useDevAddrArr = targetDataOp.getUseDeviceAddrVars();
-      mlir::MutableOperandRange useDevAddrMutableOpRange =
+      mlir::MutableOperandRange useDevAddrArr =
           targetDataOp.getUseDeviceAddrVarsMutable();
-      addOperands(useDevAddrArr, useDevAddrMutableOpRange, targetDataOp);
+      addOperands(useDevAddrArr, target,
+                  argIface.getUseDeviceAddrBlockArgsStart());
     }
   }
 
diff --git a/flang/test/Lower/OpenMP/FIR/mismatched-bound-types.f90 b/flang/test/Lower/OpenMP/FIR/mismatched-bound-types.f90
index e64d5450846ec6..8b24b34cb55b6a 100644
--- a/flang/test/Lower/OpenMP/FIR/mismatched-bound-types.f90
+++ b/flang/test/Lower/OpenMP/FIR/mismatched-bound-types.f90
@@ -1,7 +1,19 @@
 ! RUN: %flang_fc1 -fopenmp -emit-fir %s -o - | FileCheck %s
 
 ! Check that this testcase is lowered to FIR successfully.
-! CHECK: omp.target {{.*}}trip_count
+
+! CHECK: %[[ONE:.*]] = arith.constant 1 : i32
+! CHECK: %[[DECL_N:.*]] = fir.declare %{{.*}} {uniq_name = "_QMtestEn"} : (!fir.ref<i64>) -> !fir.ref<i64>
+! CHECK: %[[HOST_N:.*]] = fir.load %[[DECL_N]] : !fir.ref<i64>
+! CHECK: %[[HOST_LB:.*]] = fir.convert %[[ONE]] : (i32) -> i64
+! CHECK: %[[HOST_STEP:.*]] = fir.convert %[[ONE]] : (i32) -> i64
+! CHECK:      omp.target
+! CHECK-SAME: host_eval(%[[HOST_LB]] -> %[[LB:[[:alnum:]]+]], %[[HOST_N]] -> %[[UB:[[:alnum:]]+]], %[[HOST_STEP]] -> %[[STEP:[[:alnum:]]+]] : i64, i64, i64)
+! CHECK:      omp.teams
+! CHECK:      omp.parallel
+! CHECK:      omp.distribute
+! CHECK-NEXT: omp.wsloop
+! CHECK-NEXT: omp.loop_nest ({{.*}}) : i64 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]])
 
 module Test
     use, intrinsic :: ISO_Fortran_env, only: REAL64,INT64
diff --git a/flang/test/Lower/OpenMP/eval-outside-target.f90 b/flang/test/Lower/OpenMP/eval-outside-target.f90
index ef578610e8e908..d0925971e4b2bc 100644
--- a/flang/test/Lower/OpenMP/eval-outside-target.f90
+++ b/flang/test/Lower/OpenMP/eval-outside-target.f90
@@ -6,19 +6,15 @@
 subroutine teams()
   ! BOTH: omp.target
 
-  ! HOST-SAME: num_teams({{.*}}) teams_thread_limit({{.*}})
-
-  ! DEVICE-NOT: num_teams({{.*}})
-  ! DEVICE-NOT: teams_thread_limit({{.*}})
+  ! HOST-SAME: host_eval(%{{.*}} -> %[[NUM_TEAMS:.*]], %{{.*}} -> %[[THREAD_LIMIT:.*]] : i32, i32)
+
+  ! DEVICE-NOT: host_eval({{.*}})
   ! DEVICE-SAME: {
   !$omp target
 
   ! BOTH: omp.teams
 
-  ! HOST-NOT: num_teams({{.*}})
-  ! HOST-NOT: thread_limit({{.*}})
-  ! HOST-SAME: {
-
+  ! HOST-SAME: num_teams( to %[[NUM_TEAMS]] : i32) thread_limit(%[[THREAD_LIMIT]] : i32)
   ! DEVICE-SAME: num_teams({{.*}}) thread_limit({{.*}})
   !$omp teams num_teams(1) thread_limit(2)
   call foo()
@@ -27,60 +23,19 @@ subroutine teams()
   !$omp end target
 
   ! BOTH: omp.teams
-  ! BOTH-SAME: num_teams({{.*}}) thread_limit({{.*}})
+  ! BOTH-SAME: num_teams({{.*}}) thread_limit({{.*}}) {
   !$omp teams num_teams(1) thread_limit(2)
   call foo()
   !$omp end teams
 end subroutine teams
 
-! BOTH-LABEL: func.func @_QPparallel
-subroutine parallel()
-  ! BOTH: omp.target
-
-  ! HOST-SAME: num_threads({{.*}})
-
-  ! DEVICE-NOT: num_threads({{.*}})
-  ! DEVICE-SAME: {
-  !$omp target
-
-  ! BOTH: omp.parallel
-
-  ! HOST-NOT: num_threads({{.*}})
-  ! HOST-SAME: {
-  
-  ! DEVICE-SAME: num_threads({{.*}})
-  !$omp parallel num_threads(1)
-  call foo()
-  !$omp end parallel
-  !$omp end target
-
-  ! BOTH: omp.target
-  ! BOTH-NOT: num_threads({{.*}})
-  ! BOTH-SAME: {
-  !$omp target
-  call foo()
-
-  ! BOTH: omp.parallel
-  ! BOTH-SAME: num_threads({{.*}})
-  !$omp parallel num_threads(1)
-  call foo()
-  !$omp end parallel
-  !$omp end target
-
-  ! BOTH: omp.parallel
-  ! BOTH-SAME: num_threads({{.*}})
-  !$omp parallel num_threads(1)
-  call foo()
-  !$omp end parallel
-end subroutine parallel
-
 ! BOTH-LABEL: func.func @_QPdistribute_parallel_do
 subroutine distribute_parallel_do()
   ! BOTH: omp.target
   
-  ! HOST-SAME: num_threads({{.*}})
+  ! HOST-SAME: host_eval(%{{.*}} -> %[[NUM_THREADS:.*]], %{{.*}} -> %[[LB:.*]], %{{.*}} -> %[[UB:.*]], %{{.*}} -> %[[STEP:.*]] : i32, i32, i32, i32)
   
-  ! DEVICE-NOT: num_threads({{.*}})
+  ! DEVICE-NOT: host_eval({{.*}})
   ! DEVICE-SAME: {
 
   ! BOTH: omp.teams
@@ -88,13 +43,14 @@ subroutine distribute_parallel_do()
 
   ! BOTH: omp.parallel
 
-  ! HOST-NOT: num_threads({{.*}})
-  ! HOST-SAME: {
-
+  ! HOST-SAME: num_threads(%[[NUM_THREADS]] : i32)
   ! DEVICE-SAME: num_threads({{.*}})
 
   ! BOTH: omp.distribute
   ! BOTH-NEXT: omp.wsloop
+  ! BOTH-NEXT: omp.loop_nest
+
+  ! HOST-SAME: (%{{.*}}) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]])
   !$omp distribute parallel do num_threads(1)
   do i=1,10
     call foo()
@@ -103,11 +59,11 @@ subroutine distribute_parallel_do()
   !$omp end target teams
 
   ! BOTH: omp.target
-  ! BOTH-NOT: num_threads({{.*}})
+  ! BOTH-NOT: host_eval({{.*}})
   ! BOTH-SAME: {
   ! BOTH: omp.teams
   !$omp target teams
-  call foo()
+  call foo() !< Prevents this target region from being SPMD.
 
   ! BOTH: omp.parallel
   ! BOTH-SAME: num_threads({{.*}})
@@ -139,9 +95,9 @@ end subroutine distribute_parallel_do
 subroutine distribute_parallel_do_simd()
   ! BOTH: omp.target
   
-  ! HOST-SAME: num_threads({{.*}})
+  ! HOST-SAME: host_eval(%{{.*}} -> %[[NUM_THREADS:.*]], %{{.*}} -> %[[LB:.*]], %{{.*}} -> %[[UB:.*]], %{{.*}} -> %[[STEP:.*]] : i32, i32, i32, i32)
   
-  ! DEVICE-NOT: num_threads({{.*}})
+  ! DEVICE-NOT: host_eval({{.*}})
   ! DEVICE-SAME: {
 
   ! BOTH: omp.teams
@@ -149,14 +105,15 @@ subroutine distribute_parallel_do_simd()
 
   ! BOTH: omp.parallel
 
-  ! HOST-NOT: num_threads({{.*}})
-  ! HOST-SAME: {
-  
+  ! HOST-SAME: num_threads(%[[NUM_THREADS]] : i32)
   ! DEVICE-SAME: num_threads({{.*}})
 
   ! BOTH: omp.distribute
   ! BOTH-NEXT: omp.wsloop
   ! BOTH-NEXT: omp.simd
+  ! BOTH-NEXT: omp.loop_nest
+
+  ! HOST-SAME: (%{{.*}}) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]])
   !$omp distribute parallel do simd num_threads(1)
   do i=1,10
     call foo()
@@ -165,11 +122,11 @@ subroutine distribute_parallel_do_simd()
   !$omp end target teams
 
   ! BOTH: omp.target
-  ! BOTH-NOT: num_threads({{.*}})
+  ! BOTH-NOT: host_eval({{.*}})
   ! BOTH-SAME: {
   ! BOTH: omp.teams
   !$omp target teams
-  call foo()
+  call foo() !< Prevents this target region from being SPMD.
 
   ! BOTH: omp.parallel
   ! BOTH-SAME: num_threads({{.*}})
diff --git a/flang/test/Lower/OpenMP/target-spmd.f90 b/flang/test/Lower/OpenMP/target-spmd.f90
index acb28a206a6788..bb90e5b3fc4857 100644
--- a/flang/test/Lower/OpenMP/target-spmd.f90
+++ b/flang/test/Lower/OpenMP/target-spmd.f90
@@ -3,7 +3,7 @@
 ! CHECK-LABEL: func.func @_QPdistribute_parallel_do_generic() {
 subroutine distribute_parallel_do_generic()
   ! CHECK: omp.target
-  ! CHECK-NOT: trip_count({{.*}})
+  ! CHECK-NOT: host_eval({{.*}})
   ! CHECK-SAME: {
   !$omp target
   !$omp teams
@@ -17,7 +17,7 @@ subroutine distribute_parallel_do_generic()
   !$omp end target
 
   ! CHECK: omp.target
-  ! CHECK-NOT: trip_count({{.*}})
+  ! CHECK-NOT: host_eval({{.*}})
   ! CHECK-SAME: {
   !$omp target teams
   !$omp distribute parallel do
@@ -29,7 +29,7 @@ subroutine distribute_parallel_do_generic()
   !$omp end target teams
 
   ! CHECK: omp.target
-  ! CHECK-NOT: trip_count({{.*}})
+  ! CHECK-NOT: host_eval({{.*}})
   ! CHECK-SAME: {
   !$omp target teams
   !$omp distribute parallel do
@@ -49,7 +49,7 @@ end subroutine distribute_parallel_do_generic
 ! CHECK-LABEL: func.func @_QPdistribute_parallel_do_spmd() {
 subroutine distribute_parallel_do_spmd()
   ! CHECK: omp.target
-  ! CHECK-SAME: trip_count({{.*}})
+  ! CHECK-SAME: host_eval({{.*}})
   !$omp target
   !$omp teams
   !$omp distribute parallel do
@@ -61,7 +61,7 @@ subroutine distribute_parallel_do_spmd()
   !$omp end target
 
   ! CHECK: omp.target
-  ! CHECK-SAME: trip_count({{.*}})
+  ! CHECK-SAME: host_eval({{.*}})
   !$omp target teams
   !$omp distribute parallel do
   do i = 1, 10
@@ -74,7 +74,7 @@ end subroutine distribute_parallel_do_spmd
 ! CHECK-LABEL: func.func @_QPdistribute_parallel_do_simd_generic() {
 subroutine distribute_parallel_do_simd_generic()
   ! CHECK: omp.target
-  ! CHECK-NOT: trip_count({{.*}})
+  ! CHECK-NOT: host_eval({{.*}})
   ! CHECK-SAME: {
   !$omp target
   !$omp teams
@@ -88,7 +88,7 @@ subroutine distribute_parallel_do_simd_generic()
   !$omp end target
 
   ! CHECK: omp.target
-  ! CHECK-NOT: trip_count({{.*}})
+  ! CHECK-NOT: host_eval({{.*}})
   ! CHECK-SAME: {
   !$omp target teams
   !$omp distribute parallel do simd
@@ -100,7 +100,7 @@ subroutine distribute_parallel_do_simd_generic()
   !$omp end target teams
 
   ! CHECK: omp.target
-  ! CHECK-NOT: trip_count({{.*}})
+  ! CHECK-NOT: host_eval({{.*}})
   ! CHECK-SAME: {
   !$omp target teams
   !$omp distribute parallel do simd
@@ -120,7 +120,7 @@ end subroutine distribute_parallel_do_simd_generic
 ! CHECK-LABEL: func.func @_QPdistribute_parallel_do_simd_spmd() {
 subroutine distribute_parallel_do_simd_spmd()
   ! CHECK: omp.target
-  ! CHECK-SAME: trip_count({{.*}})
+  ! CHECK-SAME: host_eval({{.*}})
   !$omp target
   !$omp teams
   !$omp distribute parallel do simd
@@ -132,7 +132,7 @@ subroutine distribute_parallel_do_simd_spmd()
   !$omp end target
 
   ! CHECK: omp.target
-  ! CHECK-SAME: trip_count({{.*}})
+  ! CHECK-SAME: host_eval({{.*}})
   !$omp target teams
   !$omp distribute parallel do simd
   do i = 1, 10
@@ -145,7 +145,7 @@ end subroutine distribute_parallel_do_simd_spmd
 ! CHECK-LABEL: func.func @_QPteams_distribute_parallel_do_spmd() {
 subroutine teams_distribute_parallel_do_spmd()
   ! CHECK: omp.target
-  ! CHECK-SAME: trip_count({{.*}})
+  ! CHECK-SAME: host_eval({{.*}})
   !$omp target
   !$omp teams distribute parallel do
   do i = 1, 10
@@ -158,7 +158,7 @@ end subroutine teams_distribute_parallel_do_spmd
 ! CHECK-LABEL: func.func @_QPteams_distribute_parallel_do_simd_spmd() {
 subroutine teams_distribute_parallel_do_simd_spmd()
   ! CHECK: omp.target
-  ! CHECK-SAME: trip_count({{.*}})
+  ! CHECK-SAME: host_eval({{.*}})
   !$omp target
   !$omp teams distribute parallel do simd
   do i = 1, 10
@@ -171,7 +171,7 @@ end subroutine teams_distribute_parallel_do_simd_spmd
 ! CHECK-LABEL: func.func @_QPtarget_teams_distribute_parallel_do_spmd() {
 subroutine target_teams_distribute_parallel_do_spmd()
   ! CHECK: omp.target
-  ! CHECK-SAME: trip_count({{.*}})
+  ! CHECK-SAME: host_eval({{.*}})
   !$omp target teams distribute parallel do
   do i = 1, 10
     call foo(i)
@@ -182,7 +182,7 @@ end subroutine target_teams_distribute_parallel_do_spmd
 ! CHECK-LABEL: func.func @_QPtarget_teams_distribute_parallel_do_simd_spmd() {
 subroutine target_teams_distribute_parallel_do_simd_spmd()
   ! CHECK: omp.target
-  ! CHECK-SAME: trip_count({{.*}})
+  ! CHECK-SAME: host_eval({{.*}})
   !$omp target teams distribute parallel do simd
   do i = 1, 10
     call foo(i)
diff --git a/flang/test/Transforms/DoConcurrent/basic_device.f90 b/flang/test/Transforms/DoConcurrent/basic_device.f90
index 2e659895719e58..5b5d1f5ff77c52 100644
--- a/flang/test/Transforms/DoConcurrent/basic_device.f90
+++ b/flang/test/Transforms/DoConcurrent/basic_device.f90
@@ -21,6 +21,18 @@ program do_concurrent_basic
 
     ! CHECK-NOT: fir.do_loop
 
+    ! CHECK: %[[DUPLICATED_C1:.*]] = arith.constant 1 : i32
+    ! CHECK: %[[DUPLICATED_LB:.*]] = fir.convert %[[DUPLICATED_C1]] : (i32) -> index
+    ! CHECK: %[[DUPLICATED_C10:.*]] = arith.constant 10 : i32
+    ! CHECK: %[[DUPLICATED_UB:.*]] = fir.convert %[[DUPLICATED_C10]] : (i32) -> index
+    ! CHECK: %[[DUPLICATED_STEP:.*]] = arith.constant 1 : index
+
+    ! CHECK: %[[C1:.*]] = arith.constant 1 : i32
+    ! CHECK: %[[HOST_LB:.*]] = fir.convert %[[C1]] : (i32) -> index
+    ! CHECK: %[[C10:.*]] = arith.constant 10 : i32
+    ! CHECK: %[[HOST_UB:.*]] = fir.convert %[[C10]] : (i32) -> index
+    ! CHECK: %[[HOST_STEP:.*]] = arith.constant 1 : index
+
     ! CHECK-DAG: %[[I_MAP_INFO:.*]] = omp.map.info var_ptr(%[[I_ORIG_DECL]]#1
     ! CHECK: %[[C0:.*]] = arith.constant 0 : index
     ! CHECK: %[[UPPER_BOUND:.*]] = arith.subi %[[A_EXTENT]], %[[C0]] : index
@@ -32,10 +44,8 @@ program do_concurrent_basic
     ! CHECK-DAG: %[[A_MAP_INFO:.*]] = omp.map.info var_ptr(%[[A_ORIG_DECL]]#1 : {{[^(]+}})
     ! CHECK-SAME: map_clauses(implicit, tofrom) capture(ByRef) bounds(%[[A_BOUNDS]])
 
-    ! CHECK: %[[TRIP_COUNT:.*]] = arith.muli %{{.*}}, %{{.*}} : i64
-
     ! CHECK: omp.target
-    ! CHECK-SAME: trip_count(%[[TRIP_COUNT]] : i64)
+    ! CHECK-SAME: host_eval(%[[HOST_LB]] -> %[[LB:[[:alnum:]]+]], %[[HOST_UB]] -> %[[UB:[[:alnum:]]+]], %[[HOST_STEP]] -> %[[STEP:[[:alnum:]]+]] : index, index, index)
     ! CHECK-SAME: map_entries(%[[I_MAP_INFO]] -> %[[I_ARG:[[:alnum:]]+]],
     ! CHECK-SAME:             %[[A_MAP_INFO]] -> %[[A_ARG:.[[:alnum:]]+]]
 
@@ -46,12 +56,6 @@ program do_concurrent_basic
     ! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
     ! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 
-    ! CHECK: %[[C1:.*]] = arith.constant 1 : i32
-    ! CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index
-    ! CHECK: %[[C10:.*]] = arith.constant 10 : i32
-    ! CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index
-    ! CHECK: %[[STEP:.*]] = arith.constant 1 : index
-
     ! CHECK-NEXT: omp.distribute {
     ! CHECK-NEXT: omp.wsloop {
 
diff --git a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
index cc3e04306da1f2..13ee9bce85944f 100644
--- a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
+++ b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
@@ -34,7 +34,7 @@ program main
    do concurrent(i=1:n, j=1:m, k=1:l)
        a(i,j,k) = i * j + k
    end do
-end 
+end
 
 !--- perfectly_nested.f90
 program main
@@ -66,9 +66,48 @@ program main
    end do
 end
 
-! DEVICE: omp.target
+! COMMON: func.func @_QQmain
+
+! DEVICE: %[[DUPLICATED_C1_1:.*]] = arith.constant 1 : i32
+! DEVICE: %[[DUPLICATED_LB_I:.*]] = fir.convert %[[DUPLICATED_C1_1]] : (i32) -> index
+! DEVICE: %[[DUPLICATED_C10:.*]] = arith.constant 10 : i32
+! DEVICE: %[[DUPLICATED_UB_I:.*]] = fir.convert %[[DUPLICATED_C10]] : (i32) -> index
+! DEVICE: %[[DUPLICATED_STEP_I:.*]] = arith.constant 1 : index
+
+! DEVICE: %[[C1_1:.*]] = arith.constant 1 : i32
+! DEVICE: %[[HOST_LB_I:.*]] = fir.convert %[[C1_1]] : (i32) -> index
+! DEVICE: %[[C10:.*]] = arith.constant 10 : i32
+! DEVICE: %[[HOST_UB_I:.*]] = fir.convert %[[C10]] : (i32) -> index
+! DEVICE: %[[HOST_STEP_I:.*]] = arith.constant 1 : index
+
+! DEVICE: %[[C1_2:.*]] = arith.constant 1 : i32
+! DEVICE: %[[HOST_LB_J:.*]] = fir.convert %[[C1_2]] : (i32) -> index
+! DEVICE: %[[C20:.*]] = arith.constant 20 : i32
+! DEVICE: %[[HOST_UB_J:.*]] = fir.convert %[[C20]] : (i32) -> index
+! DEVICE: %[[HOST_STEP_J:.*]] = arith.constant 1 : index
+
+! DEVICE: %[[C1_3:.*]] = arith.constant 1 : i32
+! DEVICE: %[[HOST_LB_K:.*]] = fir.convert %[[C1_3]] : (i32) -> index
+! DEVICE: %[[C30:.*]] = arith.constant 30 : i32
+! DEVICE: %[[HOST_UB_K:.*]] = fir.convert %[[C30]] : (i32) -> index
+! DEVICE: %[[HOST_STEP_K:.*]] = arith.constant 1 : index
+
+! DEVICE: omp.target host_eval(
+! DEVICE-SAME: %[[HOST_LB_I]] -> %[[LB_I:[[:alnum:]]+]],
+! DEVICE-SAME: %[[HOST_UB_I]] -> %[[UB_I:[[:alnum:]]+]],
+! DEVICE-SAME: %[[HOST_STEP_I]] -> %[[STEP_I:[[:alnum:]]+]],
+! DEVICE-SAME: %[[HOST_LB_J]] -> %[[LB_J:[[:alnum:]]+]],
+! DEVICE-SAME: %[[HOST_UB_J]] -> %[[UB_J:[[:alnum:]]+]],
+! DEVICE-SAME: %[[HOST_STEP_J]] -> %[[STEP_J:[[:alnum:]]+]],
+! DEVICE-SAME: %[[HOST_LB_K]] -> %[[LB_K:[[:alnum:]]+]],
+! DEVICE-SAME: %[[HOST_UB_K]] -> %[[UB_K:[[:alnum:]]+]],
+! DEVICE-SAME: %[[HOST_STEP_K]] -> %[[STEP_K:[[:alnum:]]+]] :
+! DEVICE-SAME: index, index, index, index, index, index, index, index, index)
 ! DEVICE: omp.teams
 
+! HOST-NOT: omp.target
+! HOST-NOT: omp.teams
+
 ! COMMON: omp.parallel {
 
 ! COMMON-NEXT: %[[ITER_VAR_I:.*]] = fir.alloca i32 {bindc_name = "i"}
@@ -80,23 +119,23 @@ program main
 ! COMMON-NEXT: %[[ITER_VAR_K:.*]] = fir.alloca i32 {bindc_name = "k"}
 ! COMMON-NEXT: %[[BINDING_K:.*]]:2 = hlfir.declare %[[ITER_VAR_K]] {uniq_name = "_QFEk"}
 
-! COMMON: %[[C1_1:.*]] = arith.constant 1 : i32
-! COMMON: %[[LB_I:.*]] = fir.convert %[[C1_1]] : (i32) -> index
-! COMMON: %[[C10:.*]] = arith.constant 10 : i32
-! COMMON: %[[UB_I:.*]] = fir.convert %[[C10]] : (i32) -> index
-! COMMON: %[[STEP_I:.*]] = arith.constant 1 : index
-
-! COMMON: %[[C1_2:.*]] = arith.constant 1 : i32
-! COMMON: %[[LB_J:.*]] = fir.convert %[[C1_2]] : (i32) -> index
-! COMMON: %[[C20:.*]] = arith.constant 20 : i32
-! COMMON: %[[UB_J:.*]] = fir.convert %[[C20]] : (i32) -> index
-! COMMON: %[[STEP_J:.*]] = arith.constant 1 : index
-
-! COMMON: %[[C1_3:.*]] = arith.constant 1 : i32
-! COMMON: %[[LB_K:.*]] = fir.convert %[[C1_3]] : (i32) -> index
-! COMMON: %[[C30:.*]] = arith.constant 30 : i32
-! COMMON: %[[UB_K:.*]] = fir.convert %[[C30]] : (i32) -> index
-! COMMON: %[[STEP_K:.*]] = arith.constant 1 : index
+! HOST: %[[C1_1:.*]] = arith.constant 1 : i32
+! HOST: %[[LB_I:.*]] = fir.convert %[[C1_1]] : (i32) -> index
+! HOST: %[[C10:.*]] = arith.constant 10 : i32
+! HOST: %[[UB_I:.*]] = fir.convert %[[C10]] : (i32) -> index
+! HOST: %[[STEP_I:.*]] = arith.constant 1 : index
+
+! HOST: %[[C1_2:.*]] = arith.constant 1 : i32
+! HOST: %[[LB_J:.*]] = fir.convert %[[C1_2]] : (i32) -> index
+! HOST: %[[C20:.*]] = arith.constant 20 : i32
+! HOST: %[[UB_J:.*]] = fir.convert %[[C20]] : (i32) -> index
+! HOST: %[[STEP_J:.*]] = arith.constant 1 : index
+
+! HOST: %[[C1_3:.*]] = arith.constant 1 : i32
+! HOST: %[[LB_K:.*]] = fir.convert %[[C1_3]] : (i32) -> index
+! HOST: %[[C30:.*]] = arith.constant 30 : i32
+! HOST: %[[UB_K:.*]] = fir.convert %[[C30]] : (i32) -> index
+! HOST: %[[STEP_K:.*]] = arith.constant 1 : index
 
 ! DEVICE: omp.distribute