diff --git a/.gitmodules b/.gitmodules index e53cdced9..1bc95b52b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,3 +2,7 @@ path = examples/apps/aes/tiny-AES-c url = git@github.com:bespoke-silicon-group/tiny-AES-c.git update = none +[submodule "examples/cuda/nbody/Galois"] + path = examples/apps/nbody/Galois + url = git@github.com:bespoke-silicon-group/Galois.git + update = none diff --git a/examples/apps/nbody/.gitignore b/examples/apps/nbody/.gitignore new file mode 100644 index 000000000..2dcb38fd8 --- /dev/null +++ b/examples/apps/nbody/.gitignore @@ -0,0 +1 @@ +*.deploy diff --git a/examples/apps/nbody/Barneshut.cpp b/examples/apps/nbody/Barneshut.cpp new file mode 100644 index 000000000..3c924bb01 --- /dev/null +++ b/examples/apps/nbody/Barneshut.cpp @@ -0,0 +1,1318 @@ +/* + * This file belongs to the Galois project, a C++ library for exploiting + * parallelism. The code is being released under the terms of the 3-Clause BSD + * License (a copy is located in LICENSE.txt at the top-level directory). + * + * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. + * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS + * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF + * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF + * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH + * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances + * shall University be liable for incidental, special, indirect, direct or + * consequential damages or loss of profits, interruption of business, or + * related expenses which may arise from use of Software or Documentation, + * including but not limited to those resulting from defects in Software and/or + * Documentation, or loss or inaccuracy of data of any kind. 
+ */
+
+#include
+#include
+
+#include "galois/Galois.h"
+#include "galois/Timer.h"
+#include "galois/Bag.h"
+#include "galois/Reduction.h"
+#include "Lonestar/BoilerPlate.h"
+#include "galois/runtime/Profile.h"
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include <algorithm> // std::sort
+
+std::ostream& operator<<(std::ostream& os, const Point& p) {
+  os << "(" << p[0] << "," << p[1] << "," << p[2] << ")";
+  return os;
+}
+
+const char* name = "Barnes-Hut N-Body Simulator";
+const char* desc =
+    "Simulates gravitational forces in a galactic cluster using the "
+    "Barnes-Hut n-body algorithm";
+const char* url = "barneshut";
+
+static llvm::cl::opt<int>
+    nbodies("n", llvm::cl::desc("Number of bodies (default value 10000)"),
+            llvm::cl::init(10000));
+static llvm::cl::opt<int>
+    ntimesteps("steps", llvm::cl::desc("Number of steps (default value 1)"),
+               llvm::cl::init(1));
+static llvm::cl::opt<int> seed("seed",
+                               llvm::cl::desc("Random seed (default value 7)"),
+                               llvm::cl::init(7));
+
+static llvm::cl::opt<std::string> kpath("k",
+    llvm::cl::desc("Kernel path (default value: kernel.riscv)"),
+    llvm::cl::init("kernel.riscv"));
+
+static llvm::cl::opt<std::string> phase("phase",
+    llvm::cl::desc("Phase of Barnes Hut to run"),
+    llvm::cl::init("build"));
+
+static llvm::cl::opt<int> npx("npx",
+    llvm::cl::desc("Number of X-pods"),
+    llvm::cl::init(0));
+
+static llvm::cl::opt<int> npy("npy",
+    llvm::cl::desc("Number of Y-pods"),
+    llvm::cl::init(0));
+
+static llvm::cl::opt<int> py("py",
+    llvm::cl::desc("Pod Y ID to simulate"),
+    llvm::cl::init(0));
+
+static llvm::cl::opt<int> px("px",
+    llvm::cl::desc("Pod X ID to simulate"),
+    llvm::cl::init(0));
+
+Config config;
+
+/**
+ * InsertBag: Unordered collection of elements. This data structure
+ * supports scalable concurrent pushes but reading the bag can only be
+ * done serially.
+ */
+typedef galois::InsertBag<Body> Bodies;
+typedef galois::InsertBag<Body*> BodyPtrs;
+// FIXME: reclaim memory for multiple steps
+typedef galois::InsertBag<Octree> Tree;
+
+// DR: Build an oct tree (duh)
+struct BuildOctree {
+
+  Tree& T;
+
+  // DR: Insert body b into the octree.
+  void insert(Body* b, Octree* node, float radius) const {
+
+    // DR: Get the octant index by comparing the x,y,z of
+    // the body with the oct tree node.
+    int index = node->pos.getChildIndex(b->pos);
+
+    // DR: getValue() is for lock
+    // DRTODO: Relaxed ordering? So not actually a lock?
+    // DR: Get the child node at the octant index
+    Node* child = node->child[index].getValue();
+
+    // go through the tree lock-free while we can
+    // DR: Recurse if the child node exists, and it is not a leaf.
+    if (child && !child->Leaf) {
+      insert(b, static_cast<Octree*>(child), radius * .5f);
+      return;
+    }
+
+    // DR: Else, acquire the lock for that index.
+    // This locking may be too fine-grained?
+    node->child[index].lock();
+    child = node->child[index].getValue();
+
+    // DR: If the child is null, set the value at the
+    // index to the current body and unlock
+    if (child == NULL) {
+      b->pred = node;
+      b->octant = index;
+      node->child[index].unlock_and_set(b);
+      return;
+    }
+
+    // DR: I believe this is only partially correct and is
+    // a bug. I think the recursion above should also
+    // decrease the radius by .5
+    radius *= 0.5f;
+
+    // DR: If the child is a leaf, create a new node (this is where
+    // the real fun begins)
+    if (child->Leaf) {
+      // Expand leaf
+
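The control flow above is a double-checked locking idiom: read the octant slot optimistically, and take the per-slot lock only when the slot must change. A minimal standalone sketch of that pattern, assuming `PtrLock` is Galois' combined pointer/lock word (inferred from the `getValue()`/`lock()`/`unlock_and_set()` calls used here):

```cpp
#include "galois/substrate/PtrLock.h"

// Sketch only: the slot protocol BuildOctree::insert follows for one octant.
void claim_slot(galois::substrate::PtrLock<Node>& slot, Body* b) {
  Node* seen = slot.getValue();   // optimistic, unlocked read
  if (seen && !seen->Leaf)
    return;                       // interior node: keep descending, no lock
  slot.lock();
  seen = slot.getValue();         // re-read: another thread may have raced us
  if (seen == nullptr)
    slot.unlock_and_set(b);       // claim the empty slot and release in one step
  else
    slot.unlock();                // lost the race; caller re-examines `seen`
}
```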
+      // DR: Create a new node. Using the index
+      // (which determines the octant), compute the
+      // radius as an offset from the current
+      // (x,y,z) position.
+      Octree* new_node = &T.emplace(updateCenter(node->pos, index, radius));
+
+      // DR: If the position of the former child
+      // and the current child are identical, then
+      // add some jitter to guarantee uniqueness.
+      if (b->pos == child->pos) {
+        bsg_pr_warn("Jittering\n");
+        // Jitter point to guarantee uniqueness.
+        float jitter = config.tol / 2;
+        assert(jitter < radius);
+        b->pos += (new_node->pos - b->pos) * jitter;
+      }
+
+      // assert(node->pos != b->pos);
+      // node->child[index].unlock_and_set(new_node);
+
+      // DR: Recurse, on the new node, inserting b
+      // again (this time it will succeed)
+      insert(b, new_node, radius);
+      // DR: Recurse, on the new node, inserting the child
+      insert(static_cast<Body*>(child), new_node, radius);
+
+      // DR: Finish
+      new_node->pred = node;
+      new_node->octant = index;
+      node->child[index].unlock_and_set(new_node);
+    } else {
+      // DR: Someone beat us to the lock, and now we no longer need it.
+      node->child[index].unlock();
+      insert(b, static_cast<Octree*>(child), radius);
+    }
+  }
+};
+
+int hb_mc_manycore_device_build_tree(hb_mc_device_t *device, eva_t _config, unsigned int *nNodes, HBOctree *hnodes, eva_t _hnodes, unsigned int nBodies, HBBody *hbodies, eva_t _hbodies, float radius){
+        hb_mc_dma_htod_t htod_bodies = {
+                .d_addr = _hbodies,
+                .h_addr = hbodies,
+                .size = sizeof(HBBody) * nBodies
+        };
+
+        hb_mc_dma_htod_t htod_nodes = {
+                .d_addr = _hnodes,
+                .h_addr = hnodes,
+                .size = sizeof(HBOctree) * (*nNodes)
+        };
+
+        unsigned int body_idx = TILE_GROUP_DIM_X * TILE_GROUP_DIM_Y;
+        eva_t _body_idx;
+        BSG_CUDA_CALL(hb_mc_device_malloc(device, sizeof(body_idx), &_body_idx));
+        hb_mc_dma_htod_t htod_body_idx = {
+                .d_addr = _body_idx,
+                .h_addr = &body_idx,
+                .size = sizeof(body_idx)
+        };
+
+        // Node 0, the root, is already created.
+        // node_idx is the location of the first free node.
+        unsigned int node_idx = TILE_GROUP_DIM_X * TILE_GROUP_DIM_Y + 1;
+        eva_t _node_idx;
+        BSG_CUDA_CALL(hb_mc_device_malloc(device, sizeof(node_idx), &_node_idx));
+        hb_mc_dma_htod_t htod_node_idx = {
+                .d_addr = _node_idx,
+                .h_addr = &node_idx,
+                .size = sizeof(node_idx)
+        };
+
+        BSG_CUDA_CALL(hb_mc_device_dma_to_device(device, &htod_body_idx, 1));
+        BSG_CUDA_CALL(hb_mc_device_dma_to_device(device, &htod_node_idx, 1));
+        BSG_CUDA_CALL(hb_mc_device_dma_to_device(device, &htod_bodies, 1));
+        BSG_CUDA_CALL(hb_mc_device_dma_to_device(device, &htod_nodes, 1));
+
+        hb_mc_dimension_t tg_dim = { .x = TILE_GROUP_DIM_X, .y = TILE_GROUP_DIM_Y};
+        hb_mc_dimension_t grid_dim = { .x = 1, .y = 1};
+
+        // We can't transfer floats as arguments directly, so we pass the radius encoded as a binary value
+        uint32_t fradius = *reinterpret_cast<uint32_t*>(&radius);
+
+        //extern "C" void build(Config *pcfg, HBOctree *nodes, int nNodes, int *nidx, HBBody *bodies, int nBodies, int *bidx, unsigned int _radius){
+        uint32_t cuda_argv[8] = {_config, _hnodes, (*nNodes), _node_idx, _hbodies, nBodies, _body_idx, fradius};
+        BSG_CUDA_CALL(hb_mc_kernel_enqueue (device, grid_dim, tg_dim, "build", 8, cuda_argv));
+
+        /* Launch and execute all tile groups on device and wait for all to finish. */
+        hb_mc_manycore_trace_enable(device->mc);
+        BSG_CUDA_CALL(hb_mc_device_tile_groups_execute(device));
+        hb_mc_manycore_trace_disable(device->mc);
+
+        hb_mc_dma_dtoh_t dtoh_nnodes = {
+                .d_addr = _node_idx,
+                .h_addr = nNodes,
+                .size = sizeof(node_idx)
+        };
+        BSG_CUDA_CALL(hb_mc_device_dma_to_host(device, &dtoh_nnodes, 1));
+
+        hb_mc_dma_dtoh_t dtoh_nodes = {
+                .d_addr = _hnodes,
+                .h_addr = hnodes,
+                .size = sizeof(HBOctree) * (*nNodes)
+        };
+
+        BSG_CUDA_CALL(hb_mc_device_dma_to_host(device, &dtoh_nodes, 1));
+        BSG_CUDA_CALL(hb_mc_device_free(device, _body_idx));
+        BSG_CUDA_CALL(hb_mc_device_free(device, _node_idx));
+
+        /* Code doesn't work, but may be useful:
+        for(int n_i = 0; n_i < *nNodes; n_i++){
+                HBOctree *cur = &hnodes[n_i];
+                printf("Cur Position: %2.4f %2.4f %2.4f \n", cur->pos[0], cur->pos[1], cur->pos[2]);
+
+                for(int c_i = 0; c_i < 8; c_i++){
+                        eva_t child_eva = cur->child[c_i];
+                        int idx = (child_eva - _hbodies) / sizeof(*hbodies);
+                        int rem = (child_eva - _hbodies) % sizeof(*hbodies); // Should be 0
+                        if(rem != 0){
+                                bsg_pr_err("Oops! Non-zero remainder\n");
+                                exit(1);
+                        }
+
+                        if(idx){
+                                HBBody *child = &hbodies[idx];
+                                printf(" Leaf %d : %2.4f %2.4f %2.4f \n", idx, child->pos[0], child->pos[1], child->pos[2]);
+                        }
+                }
+        }
+        */
+
+        return HB_MC_SUCCESS;
+}
+
+// DR: Recursively compute center of mass
+unsigned computeCenterOfMass(Octree* node) {
+  float mass = 0.0;
+  Point accum;
+  // DR: Why isn't num 0?
+  unsigned num = 1;
+
+  // DR: Count of indices
+  int index = 0;
+  // DR: For each of the children within a node
+  for (int i = 0; i < 8; ++i)
+    // DR: If the node exists (not null), densify by
+    // shifting the values at the indices toward 0
+    if (node->child[i].getValue())
+      node->child[index++].setValue(node->child[i].getValue());
+
+  // DR: Set the remaining to 0
+  for (int i = index; i < 8; ++i)
+    node->child[i].setValue(NULL);
+
+  // DR: Set nChildren
+  node->nChildren = index;
+  node->cLeafs = 0;
+
+  // DR: For each index
+  for (int i = 0; i < index; i++) {
+    // DR: Get the child at that index
+    Node* child = node->child[i].getValue();
+    // DR: If the child is not a leaf, recurse, and compute its center of mass.
+    // computeCenterOfMass returns the number of leaves below a node (plus 1?)
+    if (!child->Leaf) {
+      num += computeCenterOfMass(static_cast<Octree*>(child));
+    } else {
+      // DR: Set the leaf mask, and update number of leaves
+      node->cLeafs |= (1 << i);
+      ++num;
+    }
+    // DR: Update running mass total
+    mass += child->mass;
+    // DR: Update weighted position
+    accum += child->pos * child->mass;
+  }
+
+  // DR: Update self mass
+  node->mass = mass;
+
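For reference, a small sketch of how the packed metadata written by this pass is consumed downstream: after densification the live children occupy slots `[0, nChildren)`, and bit `i` of `cLeafs` marks slot `i` as a body. `ComputeForces::iterate()` below decodes it exactly this way.

```cpp
// Sketch: decoding the densified child array and the cLeafs bitmask.
void visit_children(Octree* n) {
  for (int i = 0; i < n->nChildren; i++) {
    Node* c = n->child[i].getValue();
    if (n->cLeafs & (1 << i)) {
      // c is a Body (leaf): use its mass and position directly
    } else {
      // c is an interior Octree node: recurse or summarize
    }
  }
}
```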
+  // DR: Update center of mass for node?
+  if (mass > 0.0)
+    node->pos = accum / mass;
+  return num;
+}
+
+int hb_mc_manycore_device_summarize_centers(hb_mc_device_t *device, eva_t _config, unsigned int nNodes, HBOctree *hnodes, eva_t _hnodes, unsigned int nBodies, HBBody *hbodies, eva_t _hbodies){
+        hb_mc_dma_htod_t htod_bodies = {
+                .d_addr = _hbodies,
+                .h_addr = hbodies,
+                .size = sizeof(HBBody) * nBodies
+        };
+
+        hb_mc_dma_htod_t htod_nodes = {
+                .d_addr = _hnodes,
+                .h_addr = hnodes,
+                .size = sizeof(HBOctree) * nNodes
+        };
+
+        unsigned int idx = 127;
+        eva_t _idx;
+        BSG_CUDA_CALL(hb_mc_device_malloc(device, sizeof(idx), &_idx));
+        hb_mc_dma_htod_t htod_idx = {
+                .d_addr = _idx,
+                .h_addr = &idx,
+                .size = sizeof(idx)
+        };
+
+        hb_mc_dimension_t tg_dim = { .x = TILE_GROUP_DIM_X, .y = TILE_GROUP_DIM_Y};
+        hb_mc_dimension_t grid_dim = { .x = 1, .y = 1};
+        uint32_t cuda_argv[4] = {_hnodes, _hbodies, nBodies, _idx};
+
+        BSG_CUDA_CALL(hb_mc_device_dma_to_device(device, &htod_idx, 1));
+        BSG_CUDA_CALL(hb_mc_device_dma_to_device(device, &htod_bodies, 1));
+        BSG_CUDA_CALL(hb_mc_device_dma_to_device(device, &htod_nodes, 1));
+        BSG_CUDA_CALL(hb_mc_kernel_enqueue (device, grid_dim, tg_dim, "summarize", 4, cuda_argv));
+
+        /* Launch and execute all tile groups on device and wait for all to finish. */
+        hb_mc_manycore_trace_enable(device->mc);
+        BSG_CUDA_CALL(hb_mc_device_tile_groups_execute(device));
+        hb_mc_manycore_trace_disable(device->mc);
+
+        hb_mc_dma_dtoh_t dtoh = {
+                .d_addr = _hnodes,
+                .h_addr = hnodes,
+                .size = sizeof(HBOctree) * nNodes
+        };
+
+        BSG_CUDA_CALL(hb_mc_device_dma_to_host(device, &dtoh, 1));
+        BSG_CUDA_CALL(hb_mc_device_free(device, _idx));
+
+        return HB_MC_SUCCESS;
+}
+
+Point updateForce(Point delta, float psq, float mass) {
+  // Computing force += delta * mass * (|delta|^2 + eps^2)^{-3/2}
+  float idr = 1.0f / sqrt((float)(psq + config.epssq));
+  float scale = mass * idr * idr * idr;
+  return delta * scale;
+}
+
+struct ComputeForces {
+  // Optimize runtime for no conflict case
+
+  Octree* top;
+  float root_dsq;
+
+  ComputeForces(Octree* _top, float diameter) : top(_top) {
+    assert(diameter > 0.0 && "non-positive diameter of bb");
+    // DR: This calculation is effectively theta?
+    root_dsq = diameter * diameter * config.itolsq;
+  }
+
+  // DR: Compute forces on a body
+  template <typename Context>
+  void computeForce(Body* b, Context& cnx) {
+    Point p = b->acc;
+    b->acc = Point(0.0, 0.0, 0.0);
+    iterate(*b, cnx);
+    b->vel += (b->acc - p) * config.dthf;
+  }
+
+  struct Frame {
+    float dsq;
+    Octree* node;
+    Frame(Octree* _node, float _dsq) : dsq(_dsq), node(_node) {}
+  };
+
+  // DR: "Thread Loop" for computing the forces on a body
+  template <typename Context>
+  void iterate(Body& b, Context& cnx) {
+    std::deque<Frame, galois::PerIterAllocTy::rebind<Frame>::other> stack(
+        cnx.getPerIterAlloc());
+    // DR: Perform DFS traversal of tree. If a node is
+    // "far enough" away, then do not consider its leaves
+    // and only consider its center of mass.
+    //
+    // This is the Barnes-Hut approximation. If "far
+    // enough" is too large, this algorithm degrades to
+    // O(N^2).
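Two pieces of arithmetic do the heavy lifting in this traversal. A hedged standalone restatement (names are illustrative; the formulas mirror `root_dsq`, the `dsq * 0.25` below, and `updateForce()` above):

```cpp
#include <cmath>

// A cell may be summarized when dist >= diameter / tol, i.e. when
// dist^2 >= diameter^2 * itolsq with itolsq = 1 / tol^2 (see Config.hpp).
// Each child cell's diameter is half its parent's, so the squared
// threshold shrinks by a factor of 4 per level:
float child_dsq(float parent_dsq) { return parent_dsq * 0.25f; }

// Softened kernel from updateForce():
//   acc += delta * mass * (|delta|^2 + eps^2)^(-3/2);
// the eps^2 term keeps the force finite for (near-)coincident bodies.
float force_scale(float mass, float dist2, float epssq) {
  float idr = 1.0f / std::sqrt(dist2 + epssq);
  return mass * idr * idr * idr;
}
```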
+    stack.push_back(Frame(top, root_dsq));
+
+    while (!stack.empty()) {
+      const Frame f = stack.back();
+      stack.pop_back();
+
+      // DR: Compute distance squared (to avoid sqrt)
+      Point p = b.pos - f.node->pos;
+      float psq = p.dist2();
+
+      // Node is far enough away, summarize contribution
+      // DR: If the node is "far enough", summarize using
+      // center of mass instead of individual bodies
+      if (psq >= f.dsq) {
+        b.acc += updateForce(p, psq, f.node->mass);
+        continue;
+      }
+
+      // DR: If the node is not far enough, recurse
+      // by adding all sub-nodes to the stack, and
+      // adding the contribution of the individual
+      // bodies. DR: Why /4? Because each child cell's
+      // diameter is half its parent's, so the squared
+      // threshold shrinks by a factor of four per level.
+      float dsq = f.dsq * 0.25;
+      // DR: Iterate through the children
+      for (int i = 0; i < f.node->nChildren; i++) {
+        Node* n = f.node->child[i].getValue();
+        assert(n);
+        // DR: If a sub-node is a body/leaf
+        if (f.node->cLeafs & (1 << i)) {
+          assert(n->Leaf);
+          if (static_cast<const Node*>(&b) != n) {
+            Point p = b.pos - n->pos;
+            b.acc += updateForce(p, p.dist2(), n->mass);
+          }
+        } else {
+#ifndef GALOIS_CXX11_DEQUE_HAS_NO_EMPLACE
+          stack.emplace_back(static_cast<Octree*>(n), dsq);
+#else
+          stack.push_back(Frame(static_cast<Octree*>(n), dsq));
+#endif
+          __builtin_prefetch(n);
+        }
+      }
+    }
+  }
+};
+
+struct centerXCmp {
+  template <typename T>
+  bool operator()(const T& lhs, const T& rhs) const {
+    return lhs.pos[0] < rhs.pos[0];
+  }
+};
+
+struct centerYCmp {
+  template <typename T>
+  bool operator()(const T& lhs, const T& rhs) const {
+    return lhs.pos[1] < rhs.pos[1];
+  }
+};
+
+struct centerYCmpInv {
+  template <typename T>
+  bool operator()(const T& lhs, const T& rhs) const {
+    return rhs.pos[1] < lhs.pos[1];
+  }
+};
+
+template <typename Iter, typename Gen>
+void divide(const Iter& b, const Iter& e, Gen& gen) {
+  if (std::distance(b, e) > 32) {
+    std::sort(b, e, centerXCmp());
+    Iter m = galois::split_range(b, e);
+    std::sort(b, m, centerYCmpInv());
+    std::sort(m, e, centerYCmp());
+    divide(b, galois::split_range(b, m), gen);
+    divide(galois::split_range(b, m), m, gen);
+    divide(m, galois::split_range(m, e), gen);
+    divide(galois::split_range(m, e), e, gen);
+  } else {
+    std::shuffle(b, e, gen);
+  }
+}
+
+struct CheckAllPairs {
+  Bodies& bodies;
+
+  CheckAllPairs(Bodies& b) : bodies(b) {}
+
+  float operator()(const Body& body) const {
+    const Body* me = &body;
+    Point acc;
+    for (Bodies::iterator ii = bodies.begin(), ei = bodies.end(); ii != ei;
+         ++ii) {
+      Body* b = &*ii;
+      if (me == b)
+        continue;
+      Point delta = me->pos - b->pos;
+      float psq = delta.dist2();
+      acc += updateForce(delta, psq, b->mass);
+    }
+
+    float dist2 = acc.dist2();
+    acc -= me->acc;
+    float retval = acc.dist2() / dist2;
+    return retval;
+  }
+};
+
+float checkAllPairs(Bodies& bodies, int N) {
+  Bodies::iterator end(bodies.begin());
+  std::advance(end, N);
+
+  return galois::ParallelSTL::map_reduce(bodies.begin(), end,
+                                         CheckAllPairs(bodies),
+                                         std::plus<float>(), 0.0) /
+         N;
+}
+
+std::ostream& operator<<(std::ostream& os, const Body& b) {
+  os << "(pos:" << b.pos << " vel:" << b.vel << " acc:" << b.acc
+     << " mass:" << b.mass << ")";
+  return os;
+}
+
+std::ostream& operator<<(std::ostream& os, const BoundingBox& b) {
+  os << "(min:" << b.min << " max:" << b.max << ")";
+  return os;
+}
+
+std::ostream& operator<<(std::ostream& os, const Config& c) {
+  os << "Barnes-Hut configuration:"
+     << " dtime: " << c.dtime << " eps: " << c.eps << " tol: " << c.tol;
+  return os;
+}
+
+/*
+void printRec(std::ofstream& file, Node* node, unsigned level) {
+  static const char* ct[] = {
+    "blue", "cyan", "aquamarine", "chartreuse",
+    "darkorchid", "darkorange",
+    "deeppink", "gold", "chocolate"
+  };
+  if (!node) return;
+  // DR: node->idx used to be node->owner, but node doesn't have an owner field.
+  file << "\"" << node << "\" [color=" << ct[node->idx / 4] << (node->idx % 4 + 1)
+       << (level ? "" : " style=filled") << " label = \"" << (node->Leaf ? "L" : "N")
+       << "\"];\n";
+  if (!node->Leaf) {
+    Octree* node2 = static_cast<Octree*>(node);
+    for (int i = 0; i < 8 && node2->child[i].getValue(); ++i) {
+      if (level == 3 || level == 6)
+        file << "subgraph cluster_" << level << "_" << i << " {\n";
+      file << "\"" << node << "\" -> \"" << node2->child[i].getValue()
+           << "\"[weight=0.01]\n";
+      printRec(file, node2->child[i].getValue(), level + 1);
+      if (level == 3 || level == 6)
+        file << "}\n";
+    }
+  }
+}
+
+void printTree(Octree* node) {
+  std::ofstream file("out.txt");
+  file << "digraph octree {\n";
+  file << "ranksep = 2\n";
+  file << "root = \"" << node << "\"\n";
+  // file << "overlap = scale\n";
+  printRec(file, node, 0);
+  file << "}\n";
+}
+*/
+
+/**
+ * Generates random input according to the Plummer model, which is more
+ * realistic, though perhaps not entirely so according to astrophysicists
+ */
+void generateInput(Bodies& bodies, BodyPtrs& pBodies, int nbodies, int seed) {
+  float v, sq, scale;
+  Point p;
+  float PI = boost::math::constants::pi<float>();
+
+  std::mt19937 gen(seed);
+#if __cplusplus >= 201103L || defined(HAVE_CXX11_UNIFORM_INT_DISTRIBUTION)
+  std::uniform_real_distribution<float> dist(0, 1);
+#else
+  std::uniform_real<float> dist(0, 1);
+#endif
+
+  float rsc = (3 * PI) / 16;
+  float vsc = sqrt(1.0 / rsc);
+  float a = 1.0;
+  std::vector<Body> tmp;
+
+  for (int body = 0; body < nbodies; body++) {
+    // DR: r is the radius. 1.0 can be split into initial radius and "softening length"
+    float r = 1.0 * a / sqrt(pow(dist(gen) * 0.999, -2.0 / 3.0) - 1);
+    do {
+      for (int i = 0; i < 3; i++)
+        p[i] = dist(gen) * 2.0 - 1.0;
+      sq = p.dist2();
+    } while (sq > 1.0);
+    scale = rsc * r / sqrt(sq);
+
+    Body b;
+    b.mass = 1.0 / nbodies;
+    b.pos = p * scale;
+    // Generate velocity
+    do {
+      p[0] = dist(gen);
+      p[1] = dist(gen) * 0.1;
+    } while (p[1] > p[0] * p[0] * pow(1 - p[0] * p[0], 3.5));
+
+    // 1 can be replaced by _a*_a, a softening parameter
+    v = p[0] * sqrt(2.0 / sqrt(a * a + r * r));
+
+    do {
+      for (int i = 0; i < 3; i++)
+        p[i] = dist(gen) * 2.0 - 1.0;
+      sq = p.dist2();
+    } while (sq > 1.0);
+    scale = vsc * v / sqrt(sq);
+    b.vel = p * scale;
+    b.Leaf = true;
+    tmp.push_back(b);
+    // pBodies.push_back(&bodies.push_back(b));
+  }
+
+  // sort and copy out
+  divide(tmp.begin(), tmp.end(), gen);
+
+  galois::do_all(
+      galois::iterate(tmp),
+      [&pBodies, &bodies](const Body& b) {
+        pBodies.push_back(&(bodies.push_back(b)));
+      },
+      galois::loopname("InsertBody"));
+}
+
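Before the conversion helpers that follow, it may help to restate the addressing scheme they depend on in isolation: device-side records live in dense arrays, so EVAs and indices interconvert by plain arithmetic, and the low bit of a child EVA is free to act as a leaf tag because the records are word-aligned. A sketch with an illustrative stand-in type:

```cpp
#include <cstdint>

typedef uint32_t eva_sketch_t; // stand-in for eva_t, for illustration only

template <typename T>
eva_sketch_t index_to_eva(eva_sketch_t base, unsigned i) {
  return base + i * (eva_sketch_t)sizeof(T); // element i of a dense array
}

template <typename T>
unsigned eva_to_index(eva_sketch_t base, eva_sketch_t p) {
  return (p - base) / sizeof(T);             // inverse of the above
}

// Bit 0 of an aligned EVA is always zero, so it can carry a "leaf" flag.
inline eva_sketch_t tag_leaf(eva_sketch_t p) { return p | 1u; }
inline bool         is_leaf(eva_sketch_t p)  { return (p & 1u) != 0; }
inline eva_sketch_t untag(eva_sketch_t p)    { return p & ~(eva_sketch_t)1u; }
```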
+/**
+ * Convert an x86 pointer to a corresponding EVA pointer using the
+ * type and index, and check for buffer overrun.
+ * @param[in] buf    Buffer pointer #eva
+ * @param[in] sz     Buffer size (in bytes)
+ * @param[in] index  Array index to convert
+ * @param[out] hb_p  Starting #eva of the array index
+ * @return HB_MC_INVALID if the index is larger than can be represented
+ * by the buffer. HB_MC_SUCCESS otherwise.
+ */
+template <typename T>
+__attribute__((warn_unused_result))
+int hb_mc_manycore_eva_translate(eva_t buf,
+                                 size_t sz,
+                                 unsigned int idx,
+                                 eva_t *hb_p){
+        *hb_p = buf + idx * sizeof(T);
+        if(*hb_p > (buf + sz)){
+                bsg_pr_err("Error in translation!\n");
+                return HB_MC_INVALID;
+        }
+        return HB_MC_SUCCESS;
+}
+
+int hb_mc_manycore_check_host_conversion_nodes(unsigned int nNodes, Octree **nodes, HBOctree *hnodes, eva_t _hnodes){
+        // DR: Check that the tree conversion happened correctly.
+        NodeIdx node_i = 0;
+        // Don't check the root node (index 0) because it points to itself; start at index 1.
+        for (node_i = 1; node_i < nNodes; node_i ++){
+                Octree *n = nodes[node_i];
+                HBOctree *_n = &hnodes[node_i];
+                int pred_i = (_n->pred - _hnodes)/sizeof(HBOctree);
+                // Check that Host and HB predecessors correspond
+                if(n->pred != nodes[pred_i]){
+                        bsg_pr_err("Error: Node %d predecessor does not match HBNode!\n", node_i);
+                        return HB_MC_FAIL;
+                }
+
+                // Check data match
+                if(!n->isMatch(*_n, false)){
+                        bsg_pr_err("Node %d Data Mismatch\n", node_i);
+                        return HB_MC_FAIL;
+                }
+
+                int octant_i = n->pred->pos.getChildIndex(_n->pos);
+                eva_t _p = hnodes[pred_i].child[octant_i];
+                if(_p != (_hnodes + node_i * sizeof(HBOctree))){
+                        bsg_pr_err("Error: HBOctree Node %u EVA does not match predecessor EVA @ Octant index!\n", node_i);
+                        return HB_MC_FAIL;
+                }
+        }
+        return HB_MC_SUCCESS;
+}
+
+int hb_mc_manycore_check_host_conversion_bodies(unsigned int nBodies, Body **bodies, HBBody *hbodies, eva_t _hbodies, Octree **nodes, HBOctree *hnodes, eva_t _hnodes){
+        BodyIdx body_i = 0;
+        for (body_i = 0 ; body_i < nBodies; body_i ++){
+                Body *b = bodies[body_i];
+                HBBody *_b = &hbodies[body_i];
+                int pred_i = (_b->pred - _hnodes)/sizeof(HBOctree);
+                // Check that Host and HB predecessors correspond
+                if(b->pred != nodes[pred_i]){
+                        bsg_pr_err("Error: Body %d predecessor does not match HBBody!\n", body_i);
+                        return HB_MC_FAIL;
+                }
+
+                if(!b->isMatch(*_b, true)){
+                        bsg_pr_err("Body %d Data Mismatch\n", body_i);
+                        return HB_MC_FAIL;
+                }
+
+                // Check that the predecessor's corresponding octant child EVA matches the current body's EVA.
+                int octant_i = b->pred->pos.getChildIndex(_b->pos);
+                eva_t _c = hnodes[pred_i].child[octant_i];
+                bool leaf = _c & 1;
+                _c = _c & (~1);
+                if(_c != (_hbodies + body_i * sizeof(HBBody))){
+                        bsg_pr_err("Error: HBBody %d EVA does not match predecessor EVA @ Octant index!\n", body_i);
+                        return HB_MC_FAIL;
+                }
+                if(!leaf){
+                        bsg_pr_err("Error: HBBody %d was not marked as leaf in predecessor!\n", body_i);
+                        return HB_MC_FAIL;
+                }
+        }
+        return HB_MC_SUCCESS;
+}
+
+// nodes[0] is the tree root
+int hb_mc_manycore_host_build_tree(unsigned int nBodies, Body **bodies, HBBody *hbodies, eva_t _hbodies,
+                                   unsigned int nNodes, Octree **nodes, HBOctree *hnodes, eva_t _hnodes){
+
+        // Map of Octree node/body pointer to index
+        std::map<Octree*, NodeIdx> nodeIdxMap;
+        std::map<Body*, BodyIdx> bodyIdxMap;
+
+        NodeIdx node_i = 0;
+        BodyIdx body_i = 0;
+        int err = HB_MC_SUCCESS;
+
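The worklist walk below follows a standard flattening recipe: assign each pointer a dense index on first visit, so converted records can refer to one another by index/EVA instead of host pointer. The generic shape, as a sketch:

```cpp
#include <map>

// Sketch: first-visit index assignment of the kind the BFS below performs.
template <typename T>
unsigned index_of(std::map<T*, unsigned>& idxMap, unsigned& next, T* p) {
  auto it = idxMap.find(p);
  if (it != idxMap.end())
    return it->second;   // already numbered
  unsigned i = next++;
  idxMap[p] = i;         // number on first visit
  return i;
}
```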
+        // Start with the root node/top, and assign it index 0.
+        std::deque<Octree*> q;
+        q.push_back(nodes[0]);
+        // The root's predecessor points to itself
+        nodes[0]->pred = nodes[0];
+        nodes[0]->convert(_hnodes, hnodes[node_i]);
+        nodeIdxMap[nodes[0]] = node_i++;
+
+        while(!q.empty()){
+                Octree *cur_p = q.back();
+                q.pop_back();
+                NodeIdx cur_i = nodeIdxMap[cur_p];
+
+                // Get this node's EVA pointer to set children predecessors
+                eva_t _cur_p;
+                BSG_CUDA_CALL(hb_mc_manycore_eva_translate<HBOctree>(_hnodes, sizeof(HBOctree) * nNodes, cur_i, &_cur_p));
+
+                for (int c_i = 0; c_i < Octree::octants; c_i++) {
+                        Node *child = cur_p->child[c_i].getValue();
+                        if (child){
+                                cur_p->nChildren++; // Will be reset by CoM computation
+                                hnodes[cur_i].nChildren++;
+                                child->pred = cur_p;
+                                if(!child->Leaf) {
+                                        // Internal octree node
+                                        Octree *c = static_cast<Octree*>(child);
+
+                                        // Convert Octree node to HBOctree node
+                                        c->convert(_cur_p, hnodes[node_i]);
+
+                                        // Set up child EVA pointer
+                                        BSG_CUDA_CALL(hb_mc_manycore_eva_translate<HBOctree>(_hnodes, sizeof(HBOctree) * nNodes, node_i, &hnodes[cur_i].child[c_i]));
+
+                                        nodes[node_i] = c;
+                                        nodeIdxMap[c] = node_i++;
+
+                                        q.push_back(c);
+                                } else {
+                                        hnodes[cur_i].cLeafs++;
+                                        cur_p->cLeafs++;
+                                        // Leaf/Body
+                                        Body *b = static_cast<Body*>(child);
+                                        bodies[body_i] = b;
+
+                                        // Convert Octree Body to HBBody
+                                        b->convert(_cur_p, hbodies[body_i]);
+
+                                        // Set up child EVA pointer
+                                        eva_t _child;
+                                        BSG_CUDA_CALL(hb_mc_manycore_eva_translate<HBBody>(_hbodies, sizeof(HBBody) * nBodies, body_i, &_child));
+                                        // Use the low-order bit of the EVA to signal a leaf
+                                        hnodes[cur_i].child[c_i] = _child | 1;
+
+                                        bodyIdxMap[b] = body_i++;
+                                }
+                        }
+                }
+        }
+        BSG_CUDA_CALL(hb_mc_manycore_check_host_conversion_nodes(nNodes, nodes, hnodes, _hnodes));
+        BSG_CUDA_CALL(hb_mc_manycore_check_host_conversion_bodies(nBodies, bodies, hbodies, _hbodies, nodes, hnodes, _hnodes));
+
+        return HB_MC_SUCCESS;
+}
+
+int hb_mc_manycore_device_compute_forces(hb_mc_device_t *device, eva_t _config, float diamsq, unsigned int nNodes, HBOctree *hnodes, eva_t _hnodes, unsigned int nBodies, HBBody *hbodies, eva_t _hbodies, int body_start, int body_end){
+
+        eva_t _start;
+        BSG_CUDA_CALL(hb_mc_device_malloc(device, sizeof(body_start), &_start));
+        hb_mc_dma_htod_t htod_start = {
+                .d_addr = _start,
+                .h_addr = &body_start,
+                .size = sizeof(body_start)
+        };
+
+        hb_mc_dma_htod_t htod_bodies = {
+                .d_addr = _hbodies,
+                .h_addr = hbodies,
+                .size = sizeof(HBBody) * nBodies
+        };
+
+        hb_mc_dma_htod_t htod_nodes = {
+                .d_addr = _hnodes,
+                .h_addr = hnodes,
+                .size = sizeof(HBOctree) * nNodes
+        };
+        /*
+        for(NodeIdx i = 0; i < nNodes; i++){
+                bsg_pr_info("Node: %u, %lx, pred: %x\n", i, _hnodes + sizeof(HBOctree) * i, hnodes[i].pred);
+                for(int c = 0; c < Octree::octants; c++){
+                        bsg_pr_info("\t Child: %x\n", hnodes[i].child[c]);
+                }
+        }
+        for(BodyIdx i = 0; i < nBodies; i++){
+                bsg_pr_info("Body: %u, %lx, pred: %x, Pos: %f %f %f\n", i, _hbodies + sizeof(HBBody) * i, hbodies[i].pred, hbodies[i].pos.val[0], hbodies[i].pos.val[1], hbodies[i].pos.val[2]);
+        }*/
+
+        hb_mc_dimension_t tg_dim = { .x = TILE_GROUP_DIM_X, .y = TILE_GROUP_DIM_Y};
+        hb_mc_dimension_t grid_dim = { .x = 1, .y = 1};
+        // We can't transfer floats as arguments directly, so we pass the diameter encoded as a binary value
+        uint32_t fdiamsq = *reinterpret_cast<uint32_t*>(&diamsq);
+        uint32_t cuda_argv[7] = {_config, _hnodes, _hbodies, nBodies, fdiamsq, _start, body_end};
+
+        BSG_CUDA_CALL(hb_mc_device_dma_to_device(device, &htod_bodies, 1));
+        BSG_CUDA_CALL(hb_mc_device_dma_to_device(device, &htod_nodes, 1));
+        BSG_CUDA_CALL(hb_mc_device_dma_to_device(device, &htod_start, 1));
+
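As the comments above note, CUDA-Lite argv slots are 32-bit words, so floats travel by bit pattern (`*reinterpret_cast<uint32_t*>(&f)`) and are reinterpreted inside the kernel. A hedged sketch of the round trip; `memcpy` is the strict-aliasing-safe spelling of the same idea:

```cpp
#include <cstdint>
#include <cstring>

uint32_t encode_float_arg(float f) { // host side, before hb_mc_kernel_enqueue
  uint32_t u;
  std::memcpy(&u, &f, sizeof u);     // copy the bits, no numeric conversion
  return u;
}

float decode_float_arg(uint32_t u) { // what the kernel does with the word
  float f;
  std::memcpy(&f, &u, sizeof f);
  return f;
}
```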
+        BSG_CUDA_CALL(hb_mc_kernel_enqueue (device, grid_dim, tg_dim, "forces", 7, cuda_argv));
+
+        /* Launch and execute all tile groups on device and wait for all to finish. */
+        hb_mc_manycore_trace_enable(device->mc);
+        BSG_CUDA_CALL(hb_mc_device_tile_groups_execute(device));
+        hb_mc_manycore_trace_disable(device->mc);
+
+        hb_mc_dma_dtoh_t dtoh = {
+                .d_addr = _hbodies,
+                .h_addr = hbodies,
+                .size = sizeof(HBBody) * nBodies
+        };
+
+        BSG_CUDA_CALL(hb_mc_device_dma_to_host(device, &dtoh, 1));
+
+        return HB_MC_SUCCESS;
+}
+
+int hb_mc_manycore_device_update_bodies(hb_mc_device_t *device, eva_t _config, unsigned int nBodies, HBBody *hbodies, eva_t _hbodies){
+        hb_mc_dma_htod_t htod = {
+                .d_addr = _hbodies,
+                .h_addr = hbodies,
+                .size = sizeof(HBBody) * nBodies
+        };
+
+        hb_mc_dimension_t tg_dim = { .x = TILE_GROUP_DIM_X, .y = TILE_GROUP_DIM_Y};
+        hb_mc_dimension_t grid_dim = { .x = 1, .y = 1};
+        uint32_t cuda_argv[3] = {_config, nBodies, _hbodies};
+
+        BSG_CUDA_CALL(hb_mc_device_dma_to_device(device, &htod, 1));
+        BSG_CUDA_CALL(hb_mc_kernel_enqueue (device, grid_dim, tg_dim, "update", 3, cuda_argv));
+
+        /* Launch and execute all tile groups on device and wait for all to finish. */
+        hb_mc_manycore_trace_enable(device->mc);
+        BSG_CUDA_CALL(hb_mc_device_tile_groups_execute(device));
+        hb_mc_manycore_trace_disable(device->mc);
+
+        hb_mc_dma_dtoh_t dtoh = {
+                .d_addr = _hbodies,
+                .h_addr = hbodies,
+                .size = sizeof(HBBody) * nBodies
+        };
+
+        BSG_CUDA_CALL(hb_mc_device_dma_to_host(device, &dtoh, 1));
+
+        return HB_MC_SUCCESS;
+}
+
+int run(Bodies& bodies, BodyPtrs& pBodies, size_t nbodies) {
+  int rc = HB_MC_SUCCESS;
+  typedef galois::worklists::StableIterator<true> WLL;
+
+  galois::preAlloc(galois::getActiveThreads() +
+                   (3 * sizeof(Octree) + 2 * sizeof(Body)) * nbodies /
+                       galois::runtime::pagePoolSize());
+  galois::reportPageAlloc("MeminfoPre");
+  hb_mc_device_t device;
+  std::string test_name = "Barnes-Hut Simulation";
+  eva_t _config;
+  const std::string prog_phase = phase;
+  int pod_x = px;
+  int pod_y = py;
+  int npods_x = npx;
+  int npods_y = npy;
+  int npods = npods_x * npods_y;
+  int pod_id = pod_x + npods_x * pod_y;
+  bsg_pr_info("Running Barnes-Hut Phase %s on pod X:%d Y:%d\n", prog_phase.c_str(), pod_x, pod_y);
+  BSG_CUDA_CALL(hb_mc_device_init_custom_dimensions(&device, test_name.c_str(), 0, { .x = TILE_GROUP_DIM_X, .y = TILE_GROUP_DIM_Y}));
+  BSG_CUDA_CALL(hb_mc_device_program_init(&device, "kernel.riscv", "default_allocator", 0));
+  BSG_CUDA_CALL(hb_mc_device_malloc(&device, sizeof(config), &_config));
+  hb_mc_dma_htod_t htod = {
+    .d_addr = _config,
+    .h_addr = &config,
+    .size = sizeof(config)
+  };
+  BSG_CUDA_CALL(hb_mc_device_dma_to_device(&device, &htod, 1));
+
+  for (int step = 0; step < ntimesteps; step++) {
+
+    auto mergeBoxes = [](const BoundingBox& lhs, const BoundingBox& rhs) {
+      return lhs.merge(rhs);
+    };
+
+    auto identity = []() { return BoundingBox(); };
+
+    // Do tree building sequentially
+    auto boxes = galois::make_reducible(mergeBoxes, identity);
+
+    galois::do_all(
+        galois::iterate(pBodies),
+        [&boxes](const Body* b) { boxes.update(BoundingBox(b->pos)); },
+        galois::loopname("reduceBoxes"));
+
+    BoundingBox box = boxes.reduce();
+
+    // ============================================================
+    // Build Octree (on the Host)
+    Tree t;
+    Point Median;
+
+    for(int di = 0; di < 3; ++di){
+      std::vector<float> pcs(nbodies);
+      int bi = 0;
+      for (auto b : pBodies) {
+        pcs[bi++] = b->pos[di];
+      }
+      printf("Pre-Median: %f\n", pcs[nbodies/2]);
+      std::sort(pcs.begin(), pcs.end());
+
printf("Post-Median: %f\n", pcs[nbodies/2]); + Median[di] = pcs[nbodies/2]; + } + float median_rad = 0.0; + for (auto b : pBodies) { + median_rad = std::max(median_rad, (b->pos-Median).dist2()); + } + median_rad = std::sqrt(median_rad); + printf("Median Radius: %f\n", median_rad); + + + BuildOctree treeBuilder{t}; + Octree& top = t.emplace(box.center()); + + galois::StatTimer T_build("BuildTime"); + T_build.start(); + galois::do_all( + galois::iterate(pBodies), + [&](Body* body) { treeBuilder.insert(body, &top, box.radius()); }, + galois::loopname("BuildTree")); + T_build.stop(); + /* + for (int pod_x_i = 0; pod_x_i < 8; ++ pod_x_i){ + Node *pod_top = top.child[pod_x_i].getValue(); + for (int pod_y_i = 0; pod_y_i < 8; ++ pod_y_i){ + int nfound = 0; + Node *curn; + Octree *curo; + int octant = 0; + if(pod_top && pod_top->Leaf){ + nfound = 1; + } else if(pod_top){ + pod_top = static_cast(pod_top)->child[pod_y_i].getValue(); + curn = pod_top; + if(curn && curn->Leaf){ + nfound = 1; + break; + } + while(!((curn == pod_top) && (octant == Octree::octants))){ + printf("%d %d, %d\n", curn == pod_top, octant, curn->Leaf); + if(octant < Octree :: octants) { + curo = static_cast(curn); + while((curo->child[octant].getValue() == nullptr) && (octant < Octree::octants)){ + printf("Skip (Octant %d)\n", octant); + octant ++; + } + + if(curo->child[octant].getValue() != nullptr && (octant < Octree::octants) && curo->child[octant].getValue()->Leaf){ + nfound ++; + printf("Leaf: %d\n", nfound); + } else if((curo->child[octant].getValue() != nullptr) && (octant < Octree::octants)){ + curn = curo->child[octant].getValue(); + printf("Down (Octant %d)\n", octant); + octant = 0; + } + } + if(octant == Octree::octants){ + printf("Up - Node (Octant %d, Leaf %d %d)\n", octant, curn->Leaf, nfound); + octant = curn->octant + 1; + curn = curn->pred; + } + } + } + bsg_pr_info("Found %d bodies for X: %d, Y: %d\n", nfound, pod_x_i, pod_y_i); + } + }*/ + + // DR: Convert Bodies, Tree into arrays of Body and Octree + unsigned int nBodies = 0, nNodes = 0, i = 0; + for (auto ii : pBodies) { nBodies ++; } + for (auto ii : t) { nNodes ++; } + + bsg_pr_info("x86 Created %u Nodes\n", nNodes); + + Body **BodyPtrs = new Body*[nBodies]; + Octree **OctNodePtrs = new Octree*[nNodes]; + OctNodePtrs[0] = ⊤ + + // Host buffers + HBOctree *HostHBOctNodes = new HBOctree[nNodes](); + HBBody *HostHBBodies = new HBBody[nBodies](); + + // HammerBlade buffers. We don't know how many nodes + // HB will need to create, so we make a very rough + // estimate. 
+    // HammerBlade buffers. We don't know how many nodes
+    // HB will need to create, so we make a very rough
+    // estimate.
+    NodeIdx maxNodes = nBodies * std::log2(nBodies) + 128, nHBNodes = maxNodes;
+    HBOctree *DeviceHBOctNodes = new HBOctree[maxNodes]();
+    eva_t _DeviceHBOctNodes = 0, _HostHBOctNodes;
+    BSG_CUDA_CALL(hb_mc_device_malloc(&device, maxNodes * sizeof(HBOctree), &_DeviceHBOctNodes));
+    BSG_CUDA_CALL(hb_mc_device_malloc(&device, nNodes * sizeof(HBOctree), &_HostHBOctNodes));
+    bsg_pr_info("Nodes EVA (size): %x (%lu)\n", _DeviceHBOctNodes, maxNodes * sizeof(HBOctree));
+    bsg_pr_info("Node Size: %lu\n", sizeof(HBNode));
+
+    HBBody *DeviceHBBodies = new HBBody[nBodies]();
+    eva_t _HostHBBodies, _DeviceHBBodies;
+    BSG_CUDA_CALL(hb_mc_device_malloc(&device, nBodies * sizeof(HBBody), &_DeviceHBBodies));
+    BSG_CUDA_CALL(hb_mc_device_malloc(&device, nBodies * sizeof(HBBody), &_HostHBBodies));
+    bsg_pr_info("Bodies EVA (size): %x (%lu)\n", _HostHBBodies, nBodies * sizeof(HBBody));
+    bsg_pr_info("Body Size: %lu\n", sizeof(HBBody));
+
+    // If we use HostHBBodies to construct the array, they
+    // are produced from an in-order traversal of the Host
+    // Octree, and will have locality. This will increase
+    // contention. I believe the raw array has less
+    // contention in its ordering (and profiler stats
+    // support this).
+    BodyIdx body_i = 0;
+    for (auto b : pBodies) {
+      b->convert(0, DeviceHBBodies[body_i]);
+      body_i++;
+    }
+
+    // Convert the root Octree node to an HBOctree node
+    top.convert(_DeviceHBOctNodes, DeviceHBOctNodes[0]);
+    DeviceHBOctNodes[0].nChildren = 0;
+
+    // Perform an in-order octree traversal to enumerate all nodes/bodies.
+    // Use the node/body pointer to create a map between nodes/bodies and their index.
+    // Convert the x86 nodes/bodies to HB equivalents for processing.
+    BSG_CUDA_CALL(hb_mc_manycore_host_build_tree(nBodies, BodyPtrs, HostHBBodies, _HostHBBodies,
+                                                 nNodes, OctNodePtrs, HostHBOctNodes, _HostHBOctNodes));
+    bsg_pr_info("Host Tree building/conversion successful!\n");
+
+    bsg_pr_info("Root Position: %2.4f %2.4f %2.4f, Radius: %2.4f \n", DeviceHBOctNodes[0].pos[0], DeviceHBOctNodes[0].pos[1], DeviceHBOctNodes[0].pos[2], box.radius());
+
+    // Build tree on the device:
+    if(prog_phase == "build"){
+      rc = hb_mc_manycore_device_build_tree(&device, _config, &nHBNodes, DeviceHBOctNodes, _DeviceHBOctNodes, nBodies, DeviceHBBodies, _DeviceHBBodies, box.radius());
+      bsg_pr_info("Created %u HB Nodes\n", nHBNodes);
+      BSG_CUDA_CALL(hb_mc_device_free(&device, _config));
+      BSG_CUDA_CALL(hb_mc_device_finish(&device));
+      return rc;
+      // TODO: I don't know if the code for building a tree
+      // is 100% correct. The next step is to write a method
+      // that verifies the tree -- checking that the
+      // positions of all the children at a node are within
+      // its radius, and are in the correct octant. It
+      // would also be good to create a method that takes an
+      // HB tree and turns it back into an x86 tree so we can
+      // hand it back to the CPU code.
+
+      // TL;DR, the biggest next-step is verification.
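A hedged sketch of the verifier this TODO asks for, written against the host-side types in this diff (the helper name is hypothetical; the radius halving mirrors `insert()` above). Note it must run before `computeCenterOfMass()` densifies the child arrays, since afterwards slot `i` no longer encodes the octant:

```cpp
#include <cmath>

// Checks that every child sits in the octant its slot implies and inside
// the parent cell's (loose, per-dimension) bounds.
bool verify_subtree(Octree* node, float radius) {
  for (int i = 0; i < Octree::octants; i++) {
    Node* c = node->child[i].getValue();
    if (!c)
      continue;
    if (node->pos.getChildIndex(c->pos) != i)
      return false; // wrong octant
    Point d = c->pos - node->pos;
    for (int k = 0; k < 3; k++)
      if (std::fabs(d[k]) > radius)
        return false; // outside the parent's cell
    if (!c->Leaf && !verify_subtree(static_cast<Octree*>(c), radius * 0.5f))
      return false;
  }
  return true;
}
```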
+ } + + HBOctree *HBOctNodes; + eva_t _HBOctNodes; + HBBody *HBBodies; + eva_t _HBBodies; + + HBBodies = HostHBBodies; + _HBBodies = _HostHBBodies; + + HBOctNodes = HostHBOctNodes; + _HBOctNodes = _HostHBOctNodes; + // ============================================================ + + // ============================================================ + // Summarize centers of mass in tree + galois::timeThis( + [&](void) { + unsigned size = computeCenterOfMass(&top); + // printTree(&top); + std::cout << "Tree Size: " << size << "\n"; + }, + "summarize-Serial"); + + bsg_pr_info("Host masses summarization successful!\n"); + + if(prog_phase == "summarize"){ + BSG_CUDA_CALL(hb_mc_manycore_device_summarize_centers(&device, _config, nNodes, HBOctNodes, _HBOctNodes, nBodies, HBBodies, _HBBodies)); + + float pmse = 0.0f; + for(NodeIdx node_i = 0; node_i < nNodes; node_i++){ + // bsg_pr_info("HB: %2.4f %2.4f %2.4f (%2.4f)\n", HBOctNodes[node_i].pos.val[0], HBOctNodes[node_i].pos.val[1], HBOctNodes[node_i].pos.val[2], HBOctNodes[node_i].mass); + // bsg_pr_info("86: %2.4f %2.4f %2.4f (%2.4f)\n", OctNodePtrs[node_i]->pos.val[0], OctNodePtrs[node_i]->pos.val[1], OctNodePtrs[node_i]->pos.val[2], OctNodePtrs[node_i]->mass); + pmse += (HBOctNodes[node_i].pos - OctNodePtrs[node_i]->pos).dist2(); + } + bsg_pr_info("HB Center of Mass MSE: %f\n", pmse); + BSG_CUDA_CALL(hb_mc_device_free(&device, _config)); + BSG_CUDA_CALL(hb_mc_device_finish(&device)); + if(pmse > .01f) + return HB_MC_FAIL; + return HB_MC_SUCCESS; + } else { + // DR: Update centers of mass in the HB nodes, if the previous kernel is not run + for(NodeIdx node_i = 0; node_i < nNodes; node_i++){ + HBOctNodes[node_i].mass = OctNodePtrs[node_i]->mass; + HBOctNodes[node_i].pos = OctNodePtrs[node_i]->pos; + } + } + // ============================================================ + + + // ============================================================ + // Compute pair-wise forces using the Barnes-Hut Approximation + ComputeForces cf(&top, box.diameter()); + + galois::StatTimer T_compute("ComputeTime"); + T_compute.start(); + galois::for_each( + galois::iterate(pBodies), + [&](Body* b, auto& cnx) { cf.computeForce(b, cnx); }, + galois::loopname("compute"), galois::wl(), + galois::disable_conflict_detection(), galois::no_pushes(), + galois::per_iter_alloc()); + T_compute.stop(); + + bsg_pr_info("Host forces calculation successful!\n"); + + if(prog_phase == "forces"){ + // TODO: Check for non-power of 2 + int nbodies_per_pod = nBodies / npods; + int body_start = pod_id * nbodies_per_pod; + int body_end = (pod_id + 1) * nbodies_per_pod - 1; + // extern "C" void forces(Config *pcfg, HBOctree *proot, HBBody *HBBodies, int nBodies, unsigned int _diam, int body_start, int body_end){ + bsg_pr_info("Pod (x,y): (%d, %d) ID: %d processing nodes %d through %d of %d\n", pod_x, pod_y, pod_id, body_start, body_end, nBodies); + BSG_CUDA_CALL(hb_mc_manycore_device_compute_forces(&device, _config, box.diameter(), nNodes, HBOctNodes, _HBOctNodes, nBodies, HBBodies, _HBBodies, body_start, body_end)); + + // TODO: Check start-end + float amse = 0.0f; + for(BodyIdx body_i = body_start; body_i < body_end; body_i++){ + // bsg_pr_info("HB: %2.4f %2.4f %2.4f\n", HBBodies[body_i].acc.val[0], HBBodies[body_i].acc.val[1], HBBodies[body_i].acc.val[2]); + // bsg_pr_info("86: %2.4f %2.4f %2.4f\n", BodyPtrs[body_i]->acc.val[0], BodyPtrs[body_i]->acc.val[1], BodyPtrs[body_i]->acc.val[2]); + amse += (HBBodies[body_i].acc - BodyPtrs[body_i]->acc).dist2(); + } + + bsg_pr_info("Acceleration MSE: 
%f\n", amse); + BSG_CUDA_CALL(hb_mc_device_free(&device, _config)); + BSG_CUDA_CALL(hb_mc_device_finish(&device)); + if(amse > .01f) + return HB_MC_FAIL; + return HB_MC_SUCCESS; + } else { + // DR: Update centers of mass in the HB nodes + for(BodyIdx body_i = 0; body_i < nBodies; body_i++){ + HBBodies[body_i].acc = BodyPtrs[body_i]->acc; + HBBodies[body_i].vel = BodyPtrs[body_i]->vel; + } + } + // ============================================================ + + + // Verify the results using a classic pair-wise computation on a random sampling of nodes + if (!skipVerify) { + galois::timeThis( + [&](void) { + std::cout << "MSE (sampled) " + << checkAllPairs(bodies, std::min((int)nbodies, 100)) + << "\n"; + }, + "checkAllPairs"); + } + + // ============================================================ + // Update velocity and position. + galois::do_all( + galois::iterate(pBodies), + [](Body* b) { + Point dvel(b->acc); + dvel *= config.dthf; + Point velh(b->vel); + velh += dvel; + b->pos += velh * config.dtime; + b->vel = velh + dvel; + }, + galois::loopname("advance")); + + if(prog_phase == "update"){ + BSG_CUDA_CALL(hb_mc_manycore_device_update_bodies(&device, _config, nBodies, HBBodies, _HBBodies)); + + float pmse = 0.0f; + float vmse = 0.0f; + for(BodyIdx body_i = 0; body_i < nBodies; body_i++){ + pmse += (HBBodies[body_i].pos - BodyPtrs[body_i]->pos).dist2(); + vmse += (HBBodies[body_i].vel - BodyPtrs[body_i]->vel).dist2(); + } + bsg_pr_info("Position MSE: %f, Velocity MSE: %f\n", pmse/nBodies, vmse/nBodies); + if(pmse > .01f) + return HB_MC_FAIL; + if(vmse > .01f) + return HB_MC_FAIL; + BSG_CUDA_CALL(hb_mc_device_free(&device, _config)); + BSG_CUDA_CALL(hb_mc_device_finish(&device)); + return HB_MC_SUCCESS; + } + // ============================================================ + + std::cout << "Timestep " << step << " Center of Mass = "; + std::ios::fmtflags flags = + std::cout.setf(std::ios::showpos | std::ios::right | + std::ios::scientific | std::ios::showpoint); + std::cout << top.pos; + std::cout.flags(flags); + std::cout << "\n"; + + BSG_CUDA_CALL(hb_mc_device_free(&device, _HBBodies)); + BSG_CUDA_CALL(hb_mc_device_free(&device, _HBOctNodes)); + } + + galois::reportPageAlloc("MeminfoPost"); + BSG_CUDA_CALL(hb_mc_device_free(&device, _config)); + BSG_CUDA_CALL(hb_mc_device_finish(&device)); + return HB_MC_SUCCESS; +} + +int barneshut_main(int argc, char *argv[]) { + galois::SharedMemSys G; + LonestarStart(argc, argv, name, desc, url, nullptr); + + galois::StatTimer totalTime("TimerTotal"); + totalTime.start(); + + std::cout << config << "\n"; + std::cout << nbodies << " bodies, " << ntimesteps << " time steps\n"; + + Bodies bodies; + BodyPtrs pBodies; + generateInput(bodies, pBodies, nbodies, seed); + + galois::StatTimer execTime("Timer_0"); + execTime.start(); + run(bodies, pBodies, nbodies); + execTime.stop(); + + totalTime.stop(); + + return 0; +} + +declare_program_main("Barnes Hut", barneshut_main); diff --git a/examples/apps/nbody/Body.hpp b/examples/apps/nbody/Body.hpp new file mode 100644 index 000000000..99834947b --- /dev/null +++ b/examples/apps/nbody/Body.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include +#include + +typedef NodeIdx BodyIdx; + +struct _Body { + Point vel; + Point acc; +}; + +struct HBBody : public HBNode, public _Body{ +}; + +#ifndef RISCV +struct Body : public Node, public _Body { + void convert(eva_t pred, HBBody &b){ + Node::convert(pred, static_cast(b)); + _Body &_b = static_cast<_Body&>(b); + _b = static_cast<_Body>(*this); + } + + bool isMatch(HBBody 
&b, bool HBLeaf){ + return Node::isMatch(static_cast(b), HBLeaf) && b.vel == vel && b.acc == acc; + } +}; +#endif diff --git a/examples/apps/nbody/BoundingBox.hpp b/examples/apps/nbody/BoundingBox.hpp new file mode 100644 index 000000000..3f46db034 --- /dev/null +++ b/examples/apps/nbody/BoundingBox.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include + +// DR: What is a bounding box used for? +struct BoundingBox { + Point min; + Point max; + explicit BoundingBox(const Point& p) : min(p), max(p) {} + BoundingBox() + : min(std::numeric_limits::max()), + max(std::numeric_limits::min()) {} + + // DR: Merge two bounding boxes by taking the min and max + // x,y,z dimension from this box and another. + BoundingBox merge(const BoundingBox& other) const { + BoundingBox copy(*this); + + copy.min.pairMin(other.min); + copy.max.pairMax(other.max); + return copy; + } + + float diameter() const { return (max - min).minDim(); } + float radius() const { return diameter() * 0.5; } + // DR: Compute the geometric center of the bounding box + Point center() const { return (min + max) * 0.5; } +}; diff --git a/examples/apps/nbody/Config.hpp b/examples/apps/nbody/Config.hpp new file mode 100644 index 000000000..730f0a686 --- /dev/null +++ b/examples/apps/nbody/Config.hpp @@ -0,0 +1,12 @@ +#pragma once + +// DR: Configuration arguments +struct Config { + const float dtime; // length of one time step + const float eps; // potential softening parameter + const float tol; // tolerance for stopping recursion, <0.57 to bound error + const float dthf, epssq, itolsq; + Config() + : dtime(0.5), eps(0.05), tol(0.05), // 0.025), + dthf(dtime * 0.5), epssq(eps * eps), itolsq(1.0 / (tol * tol)) {} +}; diff --git a/examples/apps/nbody/Galois b/examples/apps/nbody/Galois new file mode 160000 index 000000000..6b836446f --- /dev/null +++ b/examples/apps/nbody/Galois @@ -0,0 +1 @@ +Subproject commit 6b836446fafdf7bef7cd9a9b824d77fea304ad14 diff --git a/examples/apps/nbody/Makefile b/examples/apps/nbody/Makefile new file mode 100644 index 000000000..d002b2520 --- /dev/null +++ b/examples/apps/nbody/Makefile @@ -0,0 +1,238 @@ +# Copyright (c) 2021, University of Washington All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, this list +# of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, this +# list of conditions and the following disclaimer in the documentation and/or +# other materials provided with the distribution. +# +# Neither the name of the copyright holder nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
+# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# This Makefile compiles, links, and executes examples. Run `make help`
+# to see the available targets for the selected platform.
+
+################################################################################
+# environment.mk verifies the build environment and sets the following
+# makefile variables:
+#
+# LIBRARIES_PATH: The path to the libraries directory
+# HARDWARE_PATH: The path to the hardware directory
+# EXAMPLES_PATH: The path to the examples directory
+# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL
+# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore
+###############################################################################
+
+REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
+
+include $(REPLICANT_PATH)/environment.mk
+
+# TEST_NAME is the basename of the executable
+TEST_NAME = main
+# KERNEL_NAME is the name of the CUDA-Lite Kernel
+KERNEL_NAME = nbody
+
+Galois:
+	git submodule update --init Galois
+
+Galois/install:
+	mkdir $@
+
+Galois/build:
+	mkdir $@
+
+Galois/build/CMakeCache.txt: | Galois/build
+	LLVM_DIR=$(shell llvm-config-11-64 --prefix) cmake -B $(dir $@) -S Galois -DBOOST_ROOT=/usr/include/boost169 -DCMAKE_INSTALL_PREFIX=Galois/install -DBUILD_SHARED_LIBS=ON
+
+Galois/install/lib64/libgalois_shmem.so: Galois/build/CMakeCache.txt | Galois/install
+	$(MAKE) -C Galois/build/ install
+
+setup: Galois/install/lib64/libgalois_shmem.so
+
+###############################################################################
+# Host code compilation flags and flow
+###############################################################################
+# TEST_SOURCES is a list of source files that need to be compiled
+
+vpath %.cpp $(REPLICANT_PATH)/examples/apps/nbody/Galois/lonestar/liblonestar/src/
+TEST_SOURCES = Barneshut.cpp
+TEST_SOURCES += BoilerPlate.cpp
+
+TEST_HEADERS = Node.hpp Point.hpp Body.hpp Config.hpp
+
+TILE_GROUP_DIM_X ?= 16
+TILE_GROUP_DIM_Y ?= 8
+
+DEFINES += -DTILE_GROUP_DIM_X=$(TILE_GROUP_DIM_X)
+DEFINES += -DTILE_GROUP_DIM_Y=$(TILE_GROUP_DIM_Y)
+DEFINES += -D_XOPEN_SOURCE=500 -D_BSD_SOURCE -D_DEFAULT_SOURCE
+CDEFINES +=
+CXXDEFINES +=
+
+INCLUDES  = -I$(REPLICANT_PATH)/examples/apps/nbody/Galois/install/include -I$(REPLICANT_PATH)/examples/apps/nbody/Galois/lonestar/liblonestar/include/
+INCLUDES += -I$(shell llvm-config-11-64 --includedir) -I$(REPLICANT_PATH)/examples/apps/nbody/
+FLAGS     = -O3 -g -Wall -Wno-unused-function -Wno-unused-variable -Wno-strict-aliasing -Wno-deprecated-declarations
+CFLAGS   += -std=c99 $(FLAGS)
+CXXFLAGS += -std=c++17 $(FLAGS)
+LDFLAGS   = -lpthread -Wl,-rpath=$(REPLICANT_PATH)/examples/apps/nbody/Galois/install/lib64 -L$(REPLICANT_PATH)/examples/apps/nbody/Galois/install/lib64 -lgalois_shmem -lnuma $(shell llvm-config-11-64 --ldflags --libs)
+
+main.so: $(REPLICANT_PATH)/examples/apps/nbody/Galois/install/lib64/libgalois_shmem.so
+
+# compilation.mk defines rules for compilation of C/C++
+include $(EXAMPLES_PATH)/compilation.mk
+############################################################################### +# Host code link flags and flow +############################################################################### + +# link.mk defines rules for linking of the final execution binary. +include $(EXAMPLES_PATH)/link.mk + +$(TEST_OBJECTS): $(TEST_HEADERS) + +############################################################################### +# Device code compilation flow +############################################################################### + +# BSG_MANYCORE_KERNELS is a list of manycore executables that should +# be built before executing. + +update.rvo: RISCV_CXX = $(RISCV_CLANGXX) +forces.rvo: RISCV_CXX = $(RISCV_GCC) +build.rvo: RISCV_CXX = $(RISCV_CLANGXX) + +BSG_MANYCORE_KERNELS ?= kernel.riscv +kernel.riscv: update.rvo +kernel.riscv: forces.rvo +kernel.riscv: summarize.rvo +kernel.riscv: build.rvo + +RISCV_INCLUDES += -I$(REPLICANT_PATH)/examples/apps/nbody/ +RISCV_CCPPFLAGS += -D__KERNEL__ -O3 + +RISCV_DEFINES += -DTILE_GROUP_DIM_X=$(TILE_GROUP_DIM_X) +RISCV_DEFINES += -DTILE_GROUP_DIM_Y=$(TILE_GROUP_DIM_Y) -DRISCV +RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X) +RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y) + +include $(EXAMPLES_PATH)/cuda/riscv.mk + +############################################################################### +# Execution flow +# +# C_ARGS: Use this to pass arguments that you want to appear in argv +# For SPMD tests C arguments are: +# +# SIM_ARGS: Use this to pass arguments to the simulator +############################################################################### + +PHASE ?= forces +POD_ID_X ?= 0 +POD_ID_Y ?= 0 +NPODS_X ?= 8 +NPODS_Y ?= 8 +NBODIES ?= 128 + +C_ARGS ?= -n $(NBODIES) -steps 1 -t 1 -phase $(PHASE) -px $(POD_ID_X) -py $(POD_ID_Y) -npx $(NPODS_X) -npy $(NPODS_Y) + +SIM_ARGS ?= + +# Include platform-specific execution rules +include $(EXAMPLES_PATH)/execution.mk + +############################################################################### +# Regression Flow +############################################################################### + +regression: main.exec.log + @grep "BSG REGRESSION TEST .*PASSED.*" $< > /dev/null + +############################################################################### +# Default rules, help, and clean +############################################################################### +.DEFAULT_GOAL := help +help: + @echo "Usage:" + @echo "make {clean | $(TEST_NAME).{profile,debug} | $(TEST_NAME).{profile,debug}.log}" + @echo " $(TEST_NAME).profile: Build executable with profilers enabled" + @echo " $(TEST_NAME).debug: Build waveform executable (if VCS)" + @echo " $(TEST_NAME).{profile,debug}.log: Run specific executable" + @echo " clean: Remove all subdirectory-specific outputs" + + +.PHONY: clean + +clean: + rm -rf bsg_link.ld +############################################################################### +# Sweep flow +############################################################################### + +# 16tx_1ty_threads__1_work__128_offset__18_logsz.deploy + +#_bh__x_x_id___nbodies__x_y__pods + +# bh__forces__ +podx-from-name = $(subst x, ,$(word 1, $(subst _, ,$(filter %_id,$(subst __, ,$1))))) +pody-from-name = $(subst y, ,$(word 2, $(subst _, ,$(filter %_id,$(subst __, ,$1))))) +npodsx-from-name = $(subst x, ,$(word 1, $(subst _, ,$(filter %_pods,$(subst __, ,$1))))) +npodsy-from-name = $(subst y, ,$(word 2, $(subst _, ,$(filter %_pods,$(subst __, ,$1))))) +phase-from-name = $(firstword $(subst _, ,$(filter 
%_bh,$(subst __, ,$1)))) +nbodies-from-name = $(firstword $(subst _, ,$(filter %_nbodies,$(subst __, ,$1)))) + +test-name = $(1)_bh__$(2)x_$(3)y_id__$(4)_nbodies__$(5)x_$(6)y_pods.deploy + +range = $(shell seq -s " " $1 $(shell echo $2 - 1 | bc)) +RANGE_PODS_X := $(call range,0,$(NPODS_X)) +RANGE_PODS_Y := $(call range,0,$(NPODS_Y)) + +# Forces, 8192 Bodies, 64 pods +$(foreach x,$(RANGE_PODS_X),\ +$(foreach y,$(RANGE_PODS_Y),\ +$(eval FORCES_TEST_8192 += $(call test-name,forces,$x,$y,8192,8,8)))) + +# Forces, 16384 Bodies, 64 pods +$(foreach x,$(RANGE_PODS_X),\ +$(foreach y,$(RANGE_PODS_Y),\ +$(eval FORCES_TEST_16384 += $(call test-name,forces,$x,$y,16384,8,8)))) + +# Forces, 32768 Bodies, 64 pods +$(foreach x,$(RANGE_PODS_X),\ +$(foreach y,$(RANGE_PODS_Y),\ +$(eval FORCES_TEST_32768 += $(call test-name,forces,$x,$y,32768,8,8)))) + +go: $(FORCES_TEST_8192) $(FORCES_TEST_16384) $(FORCES_TEST_32768) + +%.deploy: + mkdir $@ + echo "PHASE = $(call phase-from-name,$*)">> $@/Makefile + echo "POD_ID_X = $(call podx-from-name,$*)">> $@/Makefile + echo "POD_ID_Y = $(call pody-from-name,$*)">> $@/Makefile + echo "NPODS_X = $(call npodsx-from-name,$*)">> $@/Makefile + echo "NPODS_Y = $(call npodsy-from-name,$*)">> $@/Makefile + echo "NBODIES = $(call nbodies-from-name,$*)">> $@/Makefile + echo "VPATH =.." >> $@/Makefile + cat Makefile >> $@/Makefile + $(MAKE) -C $@ kernel.dis > $@/kernel.dis + $(MAKE) -C $@ main.so + #$(MAKE) -C $@ stats + +cleansweep: + rm -rf *.deploy diff --git a/examples/apps/nbody/Node.hpp b/examples/apps/nbody/Node.hpp new file mode 100644 index 000000000..2dd40f382 --- /dev/null +++ b/examples/apps/nbody/Node.hpp @@ -0,0 +1,37 @@ +#pragma once + +#include + +typedef unsigned int NodeIdx; + +struct _Node { + Point pos; // DR: X, Y, Z location + int octant; + float mass; +}; + +struct HBNode : public _Node{ +#ifndef RISCV + eva_t pred; // Used on Manycore +#else + HBNode *pred; +#endif +}; + +#ifndef RISCV +struct Node : public _Node{ +#ifndef RISCV + bool Leaf; +#endif + Node *pred; // Used on x86 + void convert(eva_t pred, HBNode &n){ + _Node &_n = static_cast<_Node&>(n); + _n = static_cast<_Node>(*this); + n.pred = pred; + } + bool isMatch(HBNode &n, bool HBleaf){ + return n.pos == pos && n.mass == mass && HBleaf == Leaf && n.octant == octant; + } + +}; +#endif diff --git a/examples/apps/nbody/Octree.hpp b/examples/apps/nbody/Octree.hpp new file mode 100644 index 000000000..45b7ea9cc --- /dev/null +++ b/examples/apps/nbody/Octree.hpp @@ -0,0 +1,86 @@ +#pragma once + +#include +#include +#ifdef RISCV +#define BSG_TILE_GROUP_X_DIM bsg_tiles_X +#define BSG_TILE_GROUP_Y_DIM bsg_tiles_Y +#include +#else +typedef std::atomic bsg_mcs_mutex_t; +#endif + +struct _Octree { + static const int octants = 8; + int nChildren; + char cLeafs; +}; + +struct HBOctree : public HBNode, public _Octree { + // DR: Note, no lock! +#ifndef RISCV + std::array child = {0}; + bsg_mcs_mutex_t mtx; +#else + std::array child = {0}; + bsg_mcs_mutex_t mtx; +#endif + HBOctree(const Point& p) { + HBNode::pos = p; + cLeafs = 0; + nChildren = 0; + } + HBOctree() { + } +}; + +#ifndef RISCV +struct Octree : public Node, public _Octree { + // DR: Note, lock! 
diff --git a/examples/apps/nbody/Node.hpp b/examples/apps/nbody/Node.hpp
new file mode 100644
index 000000000..2dd40f382
--- /dev/null
+++ b/examples/apps/nbody/Node.hpp
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "Point.hpp"
+
+typedef unsigned int NodeIdx;
+
+struct _Node {
+	Point pos; // DR: X, Y, Z location
+	int octant;
+	float mass;
+};
+
+struct HBNode : public _Node {
+#ifndef RISCV
+	eva_t pred; // Host side stores the manycore (EVA) address of the predecessor
+#else
+	HBNode *pred;
+#endif
+};
+
+#ifndef RISCV
+struct Node : public _Node {
+	bool Leaf;
+	Node *pred; // Used on x86
+	void convert(eva_t pred, HBNode &n) {
+		_Node &_n = static_cast<_Node&>(n);
+		_n = static_cast<_Node>(*this);
+		n.pred = pred;
+	}
+	bool isMatch(HBNode &n, bool HBleaf) {
+		return n.pos == pos && n.mass == mass && HBleaf == Leaf && n.octant == octant;
+	}
+};
+#endif
diff --git a/examples/apps/nbody/Octree.hpp b/examples/apps/nbody/Octree.hpp
new file mode 100644
index 000000000..45b7ea9cc
--- /dev/null
+++ b/examples/apps/nbody/Octree.hpp
@@ -0,0 +1,86 @@
+#pragma once
+
+#include <array>
+#include <atomic>
+#ifdef RISCV
+#define BSG_TILE_GROUP_X_DIM bsg_tiles_X
+#define BSG_TILE_GROUP_Y_DIM bsg_tiles_Y
+#include <bsg_mcs_mutex.h>
+#else
+#include "galois/substrate/PtrLock.h"
+typedef std::atomic<int> bsg_mcs_mutex_t;
+#endif
+
+struct _Octree {
+	static const int octants = 8;
+	int nChildren;
+	char cLeafs;
+};
+
+struct HBOctree : public HBNode, public _Octree {
+	// DR: Note, no per-child lock! A single MCS mutex guards the node.
+#ifndef RISCV
+	std::array<eva_t, octants> child = {0};
+	bsg_mcs_mutex_t mtx;
+#else
+	std::array<HBNode*, octants> child = {0};
+	bsg_mcs_mutex_t mtx;
+#endif
+	HBOctree(const Point& p) {
+		HBNode::pos = p;
+		cLeafs = 0;
+		nChildren = 0;
+	}
+	HBOctree() {
+	}
+};
+
+#ifndef RISCV
+struct Octree : public Node, public _Octree {
+	// DR: Note, lock!
+	std::array<galois::substrate::PtrLock<Node>, 8> child;
+
+	Octree(const Point& p) {
+		Node::pos = p;
+		Node::Leaf = false;
+		cLeafs = 0;
+		nChildren = 0;
+		for (int i = 0; i < 8; ++i) {
+			child[i].lock();
+			child[i].unlock_and_set(nullptr);
+		}
+	}
+	void convert(eva_t pred, HBOctree &o) {
+		Node::convert(pred, static_cast<HBNode&>(o));
+		_Octree &_o = static_cast<_Octree&>(o);
+		_o = static_cast<_Octree>(*this);
+	}
+	bool isMatch(HBOctree &o, bool HBLeaf) {
+		return Node::isMatch(static_cast<HBNode&>(o), HBLeaf) && o.cLeafs == cLeafs && o.nChildren == nChildren;
+	}
+};
+#endif
+
+
+
+#ifdef RISCV
+template <unsigned int DEPTH>
+class HBOctreeTraverser {
+public:
+	bsg_mcs_mutex_node_t lcl, *lcl_as_glbl;
+	HBOctreeTraverser() {
+		lcl_as_glbl = (bsg_mcs_mutex_node_t*)bsg_tile_group_remote_ptr(int, bsg_x, bsg_y, &lcl);
+	}
+
+	// HBOctreeTraverser(const HBOctreeTraverser &t) = delete;
+
+	void lock(unsigned int depth, HBOctree *cur) {
+		bsg_mcs_mutex_acquire(&cur->mtx, &lcl, lcl_as_glbl);
+	}
+
+	void unlock(unsigned int depth, HBOctree *cur) {
+		bsg_mcs_mutex_release(&cur->mtx, &lcl, lcl_as_glbl);
+	}
+};
+
+#endif
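The traverser wraps the MCS mutex: each tile keeps a local queue node (`lcl`) plus a globally addressable alias to it (`lcl_as_glbl`) that the MCS protocol links into the mutex's waiter list, so contending tiles spin on their own local memory rather than on the lock word. A minimal device-side usage sketch, assuming the kernel's `HBOctree` nodes (the `depth` argument is currently unused; `locked_set_child` is a hypothetical helper, not part of the app):

    HBOctreeTraverser<3> t; // one per tile; holds this tile's MCS queue node

    void locked_set_child(HBOctree *node, HBNode *child, unsigned int octant,
                          unsigned int depth) {
        t.lock(depth, node);          // enqueue and spin on our local queue node
        node->child[octant] = child;  // critical section: mutate the tree node
        std::atomic_thread_fence(std::memory_order_release);
        t.unlock(depth, node);        // hand the lock to the next queued tile
    }
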
diff --git a/examples/apps/nbody/Point.hpp b/examples/apps/nbody/Point.hpp
new file mode 100644
index 000000000..9f9da2649
--- /dev/null
+++ b/examples/apps/nbody/Point.hpp
@@ -0,0 +1,128 @@
+/*
+ * This file belongs to the Galois project, a C++ library for exploiting
+ * parallelism. The code is being released under the terms of the 3-Clause BSD
+ * License (a copy is located in LICENSE.txt at the top-level directory).
+ *
+ * Copyright (C) 2018, The University of Texas at Austin. All rights reserved.
+ * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS
+ * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF
+ * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF
+ * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH
+ * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances
+ * shall University be liable for incidental, special, indirect, direct or
+ * consequential damages or loss of profits, interruption of business, or
+ * related expenses which may arise from use of Software or Documentation,
+ * including but not limited to those resulting from defects in Software and/or
+ * Documentation, or loss or inaccuracy of data of any kind.
+ */
+#pragma once
+
+#include <algorithm> // std::min
+
+struct Point {
+	float val[3];
+	Point() { val[0] = val[1] = val[2] = 0.0f; }
+
+	Point(float _x, float _y, float _z) {
+		val[0] = _x;
+		val[1] = _y;
+		val[2] = _z;
+	}
+
+	explicit Point(float v) {
+		val[0] = v;
+		val[1] = v;
+		val[2] = v;
+	}
+
+	float operator[](const int index) const { return val[index]; }
+
+	float& operator[](const int index) { return val[index]; }
+
+	float x() const { return val[0]; }
+
+	float y() const { return val[1]; }
+
+	float z() const { return val[2]; }
+
+	bool operator==(const Point& other) const {
+		return val[0] == other.val[0] && val[1] == other.val[1] &&
+		       val[2] == other.val[2];
+	}
+
+	bool operator!=(const Point& other) const { return !operator==(other); }
+
+	// DR: pair-wise addition of two points
+	Point& operator+=(const Point& other) {
+		for (int i = 0; i < 3; ++i)
+			val[i] += other.val[i];
+		return *this;
+	}
+
+	Point& operator-=(const Point& other) {
+		for (int i = 0; i < 3; ++i)
+			val[i] -= other.val[i];
+		return *this;
+	}
+
+	Point& operator*=(float value) {
+		for (int i = 0; i < 3; ++i)
+			val[i] *= value;
+		return *this;
+	}
+
+	Point operator-(const Point& other) const {
+		return Point(val[0] - other.val[0], val[1] - other.val[1],
+		             val[2] - other.val[2]);
+	}
+
+	Point operator+(const Point& other) const {
+		return Point(val[0] + other.val[0], val[1] + other.val[1],
+		             val[2] + other.val[2]);
+	}
+
+	Point operator*(float d) const {
+		return Point(val[0] * d, val[1] * d, val[2] * d);
+	}
+
+	Point operator/(float d) const {
+		return Point(val[0] / d, val[1] / d, val[2] / d);
+	}
+
+	float dist2() const { return dot(*this); }
+
+	float dot(const Point& p2) const {
+		return val[0] * p2.val[0] + val[1] * p2.val[1] + val[2] * p2.val[2];
+	}
+
+	// DR: Compute the pair-wise minima/maxima between this point and
+	// another. Store the minima/maxima of each pair in this point.
+	void pairMin(const Point& p2) {
+		for (int i = 0; i < 3; ++i)
+			if (p2.val[i] < val[i])
+				val[i] = p2.val[i];
+	}
+
+	void pairMax(const Point& p2) {
+		for (int i = 0; i < 3; ++i)
+			if (p2.val[i] > val[i])
+				val[i] = p2.val[i];
+	}
+
+	// DR: Compute the minimum scalar across the x, y, and z dimensions
+	float minDim() const { return std::min(val[0], std::min(val[1], val[2])); }
+
+	char getChildIndex(const Point& b) {
+		char index = 0;
+		for (int i = 0; i < 3; ++i)
+			if (val[i] < b[i])
+				index |= (1 << i);
+		return index;
+	}
+};
+
+// DR: Maybe goes in Octree.hpp?
+inline Point updateCenter(Point v, int index, float radius) {
+	for (int i = 0; i < 3; i++)
+		v[i] += (index & (1 << i)) > 0 ? radius : -radius;
+	return v;
+}
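Together, `getChildIndex` and `updateCenter` define the octree geometry: bit *i* of the octant index is set exactly when the body lies above the node's center in dimension *i*, and the child's center is offset by ±radius in each dimension accordingly. A small host-side illustration with made-up values, assuming Point.hpp is on the include path:

    #include <cassert>

    int main() {
        Point center(0.f, 0.f, 0.f);
        Point body(1.f, -2.f, 3.f); // above center in x and z, below in y
        char octant = center.getChildIndex(body);
        assert(octant == 0b101);    // bits 0 (x) and 2 (z) set
        // The child octant's center moves +r in x and z, -r in y:
        Point c = updateCenter(center, octant, 2.0f);
        assert(c == Point(2.f, -2.f, 2.f));
        return 0;
    }
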
diff --git a/examples/apps/nbody/README.md b/examples/apps/nbody/README.md
new file mode 100644
index 000000000..eb8743cba
--- /dev/null
+++ b/examples/apps/nbody/README.md
@@ -0,0 +1,47 @@
+# Barnes-Hut N-body Simulation App
+
+The Barnes-Hut algorithm estimates the gravitational interactions
+between N bodies in 3D space. You can read more about it here:
+[Wikipedia](https://en.wikipedia.org/wiki/Barnes%E2%80%93Hut_simulation)
+
+This workload is derived from the Lonestar Benchmark Suite.
+
+# Summary:
+
+There are four kernels in the application:
+
+1. Build Tree (Varying parallelism, irregular memory accesses, requires allocation)
+2. Summarize Centers of Mass (Recursive, varying parallelism, irregular memory accesses)
+3. Compute Forces (Do-all parallelism, extremely irregular memory accesses, **main computational load**)
+4. Update Positions (Do-all parallelism, regular memory accesses)
+
+Papers sometimes report results for only one or two of these kernels
+(typically Compute Forces and Update Positions) and ignore the others.
+
+The basic flow is to build an octree (1), summarize the center of mass
+below each internal node (2), compute forces on each body using the
+centers of mass as a distance-filtered approximation (3), and then
+update the position of each body using its computed force and mass (4).
+
+
+These steps are encapsulated as kernels in the following files:
+- `build.cpp`
+- `summarize.cpp`
+- `forces.cpp`
+- `update.cpp`
+
+The host file is called `Barneshut.cpp` and uses modified example code
+from the Galois library. Once the application is built you can choose
+the number of bodies and the phase of the computation to run on
+HammerBlade.
+
+# Setup
+
+You will need a relatively modern version of GCC that supports the
+C++14 standard, and Boost > 1.69.
+
+1. Run `make Galois`
+2. Run `make setup`
+3. Run `make exec.log`
+
+This will run the simulation with the default parameters -- 8192
+bodies for the Build Tree kernel (step 1, above). You can change
+which step is the default in the Makefile.
\ No newline at end of file
diff --git a/examples/apps/nbody/build.cpp b/examples/apps/nbody/build.cpp
new file mode 100644
index 000000000..11494201d
--- /dev/null
+++ b/examples/apps/nbody/build.cpp
@@ -0,0 +1,241 @@
+
+#define BSG_TILE_GROUP_X_DIM bsg_tiles_X
+#define BSG_TILE_GROUP_Y_DIM bsg_tiles_Y
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+// Given a list of bodies, build an octree
+
+
+
+// nNodes - How many nodes have been pre-allocated. This should be at
+// least nBodies * log2(nBodies), and more than 128. It is not
+// currently used, but tiles could check it to make sure they haven't
+// gone beyond the end of the array.
+
+// This kernel hasn't been completely tested or optimized. It's not
+// clear that all the fences are necessary, for example. In some cases
+// I want the compiler to insert memory fences, not actual fence
+// instructions.
+
+extern "C" void build(Config *pcfg, HBOctree *nodes, int nNodes, int *nidx, HBBody *bodies, int nBodies, int *bidx, unsigned int _radius){
+
+	Config cfg = *pcfg;
+	HBOctreeTraverser<3> t;
+	HBOctree *cur;
+	HBNode *child;
+	HBBody *ins;
+	// To avoid hammering the nidx "allocator" and bidx "work
+	// queue" at the start, we pre-allocate a new node and a body
+	// for every tile.
+	//
+	// NOTE: This does mean that nNodes must be > 128.
+	// We add one because the root is already created.
+	int local_nidx = __bsg_id + 1, local_bidx = __bsg_id, depth;
+	HBOctree *newNode = &nodes[local_nidx];
+	unsigned int remaining_children = 0;
+	unsigned int octant;
+
+	bsg_barrier_hw_tile_group_init();
+	bsg_barrier_hw_tile_group_sync();
+	bsg_cuda_print_stat_kernel_start();
+	bsg_cuda_print_stat_start(1);
+	bsg_fence();
+
+	// TODO: The biggest remaining challenge is to maximize the
+	// load-use distance of the Point position variables.
+	//
+	// Using bsg_attr_remote is probably the right way forward,
+	// but C++/LLVM has a lot of issues resolving cross-address
+	// space assignments.
+	//
+	// For the moment, I have tried to separate loading the Point
+	// triple from using it. It doesn't work perfectly, but it's
+	// better than nothing.
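+	// For example, the loop below copies the position triple up
+	// front and only consumes it later:
+	//
+	//	Point ins_pos = ins->pos;	// issue the remote loads early...
+	//	...
+	//	octant = cur_pos.getChildIndex(ins_pos);	// ...consume them late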
+
+	// While bodies remain in the "work queue"
+	while(local_bidx < nBodies){
+		//bsg_print_hexadecimal(0x01000000 | local_bidx);
+		float radius = *reinterpret_cast<float*>(&_radius);
+		ins = &bodies[local_bidx]; // Find a new body to insert
+		Point ins_pos = ins->pos;
+		Point cur_pos;
+		cur = nodes; // Start at root
+		cur_pos = cur->pos;
+
+		// Get the octant relative to the current node
+		octant = cur_pos.getChildIndex(ins_pos);
+		child = cur->child[octant];
+
+		// We may use the depth to switch between a spin lock
+		// and an MCS lock in the future. An MCS lock is great
+		// for high contention but has higher overhead. A spin
+		// lock is great for low contention but can cause
+		// network congestion.
+		depth = 0;
+
+		// "Recurse" until a null pointer or a leaf is found.
+	recurse:
+		while(child && !(((unsigned int) child) & 1)){
+			cur = (HBOctree *)child;
+			cur_pos = cur->pos;
+			octant = cur_pos.getChildIndex(ins_pos);
+			//bsg_print_hexadecimal(0x02000000 | octant);
+			child = cur->child[octant];
+			//bsg_print_hexadecimal((unsigned int) child);
+			depth++;
+			// This radius update does not match the CPU code, but it does seem to match the GPU code.
+			// The compiler tries to schedule the access child = cur->child[octant] after the radius
+			// computation. These barriers force it to do otherwise.
+			asm volatile("" ::: "memory");
+			radius *= 0.5f;
+			asm volatile("" ::: "memory");
+		}
+
+		// We have reached the point where we need to modify a
+		// node. Lock it.
+		//bsg_print_hexadecimal(0x10000000 | local_bidx);
+		t.lock(depth, cur);
+
+
+		// Make sure that the compiler doesn't use stale
+		// values from cur->child and child->Leaf. This can
+		// happen when another tile grabs the lock before us
+		// and updates cur->child[octant].
+		std::atomic_thread_fence(std::memory_order_acquire);
+		child = cur->child[octant];
+		// DR: These next few operations could be squashed,
+		// but we need to increase the load-use distance
+		// between loading child and checking whether it is a nullptr.
+		ins->pred = cur;
+		ins->octant = octant;
+
+		// A null pointer means we can directly insert the current body as a leaf.
+		if(cur->child[octant] == nullptr){
+			//bsg_print_hexadecimal(0x11000000);
+
+			bsg_amoadd(&(cur->nChildren), 1);
+
+			// There is a race condition here. A running
+			// tile can see that cur->child is not null
+			// and assume that the lock is not taken. I
+			// believe this is OK though, since all tiles
+			// waiting on the lock will see any subsequent
+			// updates made by the winning tile. The cost
+			// of removing this race condition is a lot
+			// more locking.
+			// DR: The low-order bit of the address is a leaf flag.
+			cur->child[octant] = (HBBody *)(((unsigned int) ins) | 1);
+
+			// Fence before unlocking? Max's lock might handle this.
+			// This should probably be before the insertion.
+			std::atomic_thread_fence(std::memory_order_release);
+			// Update our local index for inserting a new body
+			local_bidx = bsg_amoadd(bidx, 1);
+			t.unlock(depth, cur);
+			// We've inserted a new body. Start the outer while loop again.
+			continue;
+		}
+
+		// A leaf/body means we need to create a new node and
+		// insert both bodies.
+		// DR: The low-order bit of the child address indicates it is a leaf.
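+		// For example (hypothetical address): a body at 0x81000040
+		// is stored in its parent's child slot as 0x81000041. A
+		// child pointer c is therefore a leaf iff (c & 1), and the
+		// body's address is recovered with (c & ~1), as below.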
+		if(((unsigned int) child) & 1){
+			//bsg_print_hexadecimal(0x12000000 | local_nidx);
+			//bsg_print_float(radius);
+			//bsg_print_float((ins->pos - child->pos).dist2());
+			HBBody * leaf = (HBBody *) (((unsigned int)child) & (~1));
+			Point leaf_pos = leaf->pos;
+
+			if(radius == 0.0f){
+				bsg_print_hexadecimal(0xe0000000 | local_bidx);
+				local_bidx = bsg_amoadd(bidx, 1);
+				t.unlock(depth, cur);
+				continue;
+			}
+
+			// Octree* new_node = &T.emplace(updateCenter(node->pos, index, radius));
+			// We are inserting a new node below this one, so pass it its radius.
+			newNode->pos = updateCenter(cur_pos, octant, radius * .5f);
+
+			// Now that we used local_nidx, get a new one.
+			local_nidx = bsg_amoadd(nidx, 1);
+
+			// Insert original child
+			char new_octant = newNode->pos.getChildIndex(leaf_pos);
+			// Child already had the low-order leaf bit set
+			newNode->child[new_octant] = child;
+
+			// DR: If the position of the former child and the
+			// current body are identical, add some jitter to
+			// guarantee uniqueness. Usually we don't compare
+			// floats with ==, but it is important in this case.
+			if(ins_pos == leaf_pos){
+				bsg_print_hexadecimal(0xf0000000 | local_bidx);
+				float jitter = cfg.tol * .5f;
+				// assert(jitter < radius);
+				ins->pos += (newNode->pos - ins_pos) * jitter;
+			}
+
+			// Fence after setting children, so that when
+			// we set cur->child other tiles won't see
+			// outdated state.
+			std::atomic_thread_fence(std::memory_order_release);
+
+			// Set child of cur to newNode. There is a
+			// race condition here: A running tile can see
+			// that cur->child is not null and assume that
+			// the lock is not taken. I believe this is OK,
+			// though, because the state of the child is
+			// consistent (because we fenced, above)
+			// before we issued this write.
+			cur->child[octant] = newNode;
+
+			// Now we just need to fence before unlocking
+			// so that tiles waiting on the lock won't see
+			// inconsistent state in cur->child[octant].
+			std::atomic_thread_fence(std::memory_order_release);
+
+			// We have inserted a new node at the former
+			// location of child. Set child for the next
+			// iteration of the recursion loop.
+			child = newNode;
+
+			// We are finished updating newNode, so we can
+			// get a new pointer. This also maximizes
+			// load-use distance.
+			newNode = &nodes[local_nidx];
+		}
+
+		// If both conditions above fail, another tile managed
+		// to insert while we waited for the lock.
+
+		// If we entered the leaf branch above, then we created
+		// a new node and inserted the previous cur->child[octant]
+		// into that node. We should try inserting our current
+		// body again.
+
+		// Either way, the result is the same: try inserting again.
+
+		// Unlock
+		//bsg_print_hexadecimal(0x14000000);
+		t.unlock(depth, cur);
+		// Retry inserting the current body.
+		goto recurse;
+	}
+
+	bsg_cuda_print_stat_end(1);
+	bsg_barrier_hw_tile_group_sync();
+	bsg_cuda_print_stat_kernel_end();
+	bsg_fence();
+	bsg_barrier_hw_tile_group_sync();
+	return;
+}
+
diff --git a/examples/apps/nbody/forces.cpp b/examples/apps/nbody/forces.cpp
new file mode 100644
index 000000000..2e43f0f8e
--- /dev/null
+++ b/examples/apps/nbody/forces.cpp
@@ -0,0 +1,167 @@
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+Point updateForce(Config &cfg, Point delta, float psq, float mass) {
+	// Computing force += delta * mass * (|delta|^2 + eps^2)^{-3/2}
+	float idr = 1.0f / sqrtf((float)(psq + cfg.epssq));
+	float scale = mass * idr * idr * idr;
+	return delta * scale;
+}
+
+extern "C" void forces(Config *pcfg, HBOctree *proot, HBBody *HBBodies, int nBodies, unsigned int _diam, int *amocur, int body_end){
+	// We can't pass float arguments (technical issue), just
+	// pointers and integer arguments.
+	// Copy frequently used data to local
+	Config cfg = *pcfg;
+	float diam = *reinterpret_cast<float*>(&_diam);
+	bsg_barrier_hw_tile_group_init();
+	bsg_barrier_hw_tile_group_sync();
+	bsg_cuda_print_stat_kernel_start();
+	bsg_cuda_print_stat_start(1);
+	// In the x86 version, threads use a stack to do a DFS
+	// traversal of the tree. We can't do that here because we
+	// have limited stack space. Instead, we are going to do an
+	// in-order DFS traversal. Ordering is ensured by storing the
+	// parent's octant index in the child. When we move up the
+	// tree we use the octant information in the child to select
+	// the next octant in the parent node.
+
+	// diamsq = diam * diam * cfg.itolsq
+	// While still bodies to process
+	//   Remove body b from work queue, copy to wb
+	//   set cur to root
+	//   octant = 0
+	//   while cur != root or octant < 8
+	//     if (cur is a leaf)
+	//       delta  = wb.pos - cur.pos
+	//       distsq = |delta|^2
+	//       wb.acc += updateForce(delta, distsq, cur.mass)
+	//       octant = cur.octant + 1; cur = cur.pred; diamsq *= 4.0f   // move up
+	//     else if (octant == 0)  -- octant of 0 means first visit to this node
+	//       delta  = wb.pos - cur.pos
+	//       distsq = |delta|^2
+	//       if distsq >= diamsq  -- far away: use the summarized center of mass
+	//         wb.acc += updateForce(delta, distsq, cur.mass)
+	//         octant = cur.octant + 1; cur = cur.pred; diamsq *= 4.0f // move up
+	//       else if cur.child[0] != NULL
+	//         cur = cur.child[0]; octant = 0; diamsq *= .25f          // move down
+	//       else
+	//         octant++
+	//     else if (octant < 8)
+	//       if cur.child[octant] != NULL
+	//         cur = cur.child[octant]; octant = 0; diamsq *= .25f     // move down
+	//       else
+	//         octant++
+	//     else
+	//       octant = cur.octant + 1; cur = cur.pred; diamsq *= 4.0f   // move up
+
+	// Work imbalance is a pain. Use an amoadd queue to allocate work dynamically.
+	int cur;
+	cur = bsg_amoadd(amocur, 1);
+	while(cur < body_end){
+		// for (int cur = __bsg_id + body_start; cur < body_end; cur += TILE_GROUP_DIM_X * TILE_GROUP_DIM_Y) {
+		bsg_print_int(cur);
+		float diamsq = diam * diam * cfg.itolsq;
+		HBBody *pcurb = &HBBodies[cur];
+		HBBody curb = *pcurb;
+		Point prev_acc = curb.acc;
+		curb.acc = Point(0.0f, 0.0f, 0.0f);
+		HBNode *pother = proot; // It is not clear whether copying the other node locally will help.
+		Point delta;
+		float distsq;
+		char octant = 0;
+		while (pother != proot || (pother == proot && octant < HBOctree::octants)){
+			// Leaf status is encoded in the low-order bit of the address.
+			unsigned int leaf = reinterpret_cast<unsigned int>(pother) & 1;
+			pother = reinterpret_cast<HBNode*>(reinterpret_cast<unsigned int>(pother) & (~1));
+			HBOctree *node = static_cast<HBOctree*>(pother);
+
+			// child && !(((unsigned int) child) & 1)
+			if(leaf){
+				//bsg_print_float(1.0f);
+				// Leaf node, compute force
+				if(pother != static_cast<HBNode*>(pcurb)){
+					//bsg_print_float(1.1f);
+					delta = (curb.pos - pother->pos);
+					distsq = delta.dist2();
+					curb.acc += updateForce(cfg, delta, distsq, pother->mass);
+				}
+				// Move upward
+				octant = pother->octant + 1;
+				pother = pother->pred;
+				diamsq *= 4.0f;
+				//bsg_print_hexadecimal((unsigned int)pother);
+			} else if (octant == 0) {
+				//bsg_print_float(2.0f);
+				//bsg_print_hexadecimal((unsigned int)node->child[0]);
+				// Have not visited this node before
+				delta = (curb.pos - pother->pos);
+				distsq = delta.dist2();
+				if(distsq >= diamsq){
+					//bsg_print_float(2.1f);
+					// Far away, compute summarized force
+					curb.acc += updateForce(cfg, delta, distsq, pother->mass);
+					// Move upward
+					octant = pother->octant + 1;
+					pother = pother->pred;
+					diamsq *= 4.0f;
+					//bsg_print_hexadecimal((unsigned int)pother);
+				} else if(node->child[0] != nullptr){
+					//bsg_print_float(2.2f);
+					// Nearby, downward.
+					pother = node->child[0];
+					octant = 0;
+					diamsq *= .25f;
+					//bsg_print_hexadecimal((unsigned int)pother);
+				} else {
+					//bsg_print_float(2.3f);
+					// No leaf/body, try next octant
+					octant++;
+				}
+			} else if (octant < HBOctree::octants) {
+				//bsg_print_float(3.0f);
+				if(node->child[octant]) {
+					//bsg_print_float(3.1f);
+					// Downward
+					pother = node->child[octant];
+					octant = 0;
+					diamsq *= .25f;
+					//bsg_print_hexadecimal((unsigned int)pother);
+				} else {
+					//bsg_print_float(3.2f);
+					// Sideways :p
+					octant++;
+				}
+			} else {
+				//bsg_print_float(4.0f);
+				// Upward
+				octant = pother->octant + 1;
+				pother = pother->pred;
+				diamsq *= 4.0f;
+				//bsg_print_hexadecimal((unsigned int)pother);
+			}
+		}
+		// Finished traversal
+		curb.vel += (curb.acc - prev_acc) * cfg.dthf;
+		HBBodies[cur].vel = curb.vel;
+		HBBodies[cur].acc = curb.acc;
+		cur = bsg_amoadd(amocur, 1);
+	}
+
+	bsg_cuda_print_stat_end(1);
+	bsg_barrier_hw_tile_group_sync();
+	bsg_cuda_print_stat_kernel_end();
+	bsg_fence();
+	bsg_barrier_hw_tile_group_sync();
+
+	return;
+}
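The stackless traversal above can be distilled to the following host-side sketch with simplified types (illustrative only, not the kernel code; it assumes the root is an internal node and omits the leaf-bit tagging and the distance-based pruning):

    struct TNode {
        TNode *pred;          // parent backlink
        int octant;           // index of this node within pred->child
        TNode *child[8] = {}; // children (null when absent)
        bool leaf = false;
    };

    // Post-order DFS with no stack: descend through child[] in order,
    // and on finishing a subtree resume in the parent at (our octant + 1).
    template <typename F>
    void traverse(TNode *root, F visit) {
        TNode *cur = root;
        int octant = 0;
        while (cur != root || octant < 8) {
            if (octant < 8 && !cur->leaf && cur->child[octant]) {
                cur = cur->child[octant]; // descend
                octant = 0;
            } else if (octant < 8 && !cur->leaf) {
                octant++;                 // try the next child slot
            } else {
                visit(cur);               // leaf, or all children done
                octant = cur->octant + 1; // resume in the parent
                cur = cur->pred;
            }
        }
        visit(root);
    }
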
diff --git a/examples/apps/nbody/summarize.cpp b/examples/apps/nbody/summarize.cpp
new file mode 100644
index 000000000..116e82266
--- /dev/null
+++ b/examples/apps/nbody/summarize.cpp
@@ -0,0 +1,126 @@
+#include
+#include
+#include
+#include
+#include
+
+// Compute the center of mass for each node in the tree, using the
+// centers of mass of its children.
+
+// There are several ways to do this. I have implemented a lock-free
+// version for the moment. The downside is that this method has
+// performance pathologies, where a small number of tiles can end up
+// processing a large number of nodes. However, it is simple to write
+// and, as stated, lock free.
+
+extern "C" void summarize(HBNode *root, HBBody *bodies, int nBodies, int *idx){
+
+	bsg_barrier_hw_tile_group_init();
+	bsg_barrier_hw_tile_group_sync();
+	bsg_cuda_print_stat_kernel_start();
+	bsg_cuda_print_stat_start(1);
+	bsg_fence();
+
+	HBNode *cur;
+	HBOctree *pred;
+
+	// The following code is the lock-free version. Tiles start by
+	// processing bodies. A body is processed by atomically
+	// decrementing the number of children in the parent. The tile
+	// that processes the final child goes on to process the
+	// parent. If there are remaining children to process, the
+	// tile claims another body instead.
+
+	// Tiles are assigned bodies by atomically incrementing a
+	// global index that points to the first unprocessed body.
+
+	int local_idx = __bsg_id;
+	unsigned int remaining_children = 0;
+	cur = (HBNode *)&bodies[local_idx];
+	// While still bodies to process, or not finished processing
+	// the previous body (local_idx remains unchanged)
+	while(local_idx < nBodies){
+		pred = (HBOctree *)cur->pred;
+		remaining_children = bsg_amoadd(&pred->nChildren, -1);
+		// If we are processing the last child, summarize the
+		// center of mass of the predecessor.
+		if(remaining_children == 1){
+			Point pos;
+			float mass = 0.0f;
+			// This can probably be unrolled to increase the number of requests in flight.
+			for(char octant = 0; octant < HBOctree::octants; octant++){
+				if(pred->child[octant] != nullptr){
+					mass += pred->child[octant]->mass;
+					pos += (pred->child[octant]->pos * pred->child[octant]->mass);
+				}
+			}
+			pred->mass = mass;
+			pred->pos = pos/mass;
+			if(pred == root){
+				break;
+			}
+			else
+				cur = pred;
+		} else {
+			// Otherwise there are children
+			// remaining. Claim a new body.
+			local_idx = bsg_amoadd(idx, 1);
+			cur = (HBNode *)&bodies[local_idx];
+		}
+	}
+
+	// The following comment is the work-queue version. It
+	// optimizes for work efficiency. A double-ended work queue
+	// primitive is necessary.
+
+	// While still nodes to process in work queue
+	//   Remove node from work queue
+	//   Update center of mass for predecessor:
+	//     amofadd(item->pred.mass, item->mass) // We don't have amofadd, just add-to-lock and read from memory
+	//     amofadd(item->pred.pos, item->mass * item->pos) // Not correct. The position should start from 0, not accumulate onto the existing position.
+	//     rem = amoadd(item.pred->nChildren, -1) // Moving this before the fadd would improve perf.
+	//     if(rem == 0)
+	//       item->pred.pos = item->pred.pos / mass
+	//       add item to work queue
+
+	// The following comment is the "bottom-up" version. It
+	// focuses on repeated linear traversal through DRAM, taking
+	// advantage of potential DRAM locality.
+
+	// Host: Set start_ptr to &nodes[0]
+	// Host: Set end_ptr to &nodes[nbodies-1]
+	// Set local_start_ptr = bodies[nbodies-1]
+	// Set local_end_ptr = bodies[0]
+	// While true
+	//   For each node in (start ptr, finish ptr, sizeof(node))
+	//     If unprocessed and ready to process
+	//       lock predecessor
+	//       mass = amofadd(item->pred.mass, item->mass)
+	//       pos = amofadd(item->pred.pos, item->mass * item->pos)
+	//       rem = amoadd(item.pred->nChildren, -1) // Moving this before the fadd would improve perf.
+	//       if(rem == 0)
+	//         item->pred.pos = pos / mass
+	//       amomin.u(end_ptr, predecessor ptr)
+	//       amomax.u(start_ptr, predecessor ptr)
+	//   sync
+	//   if local_start_ptr == root
+	//     break
+	//   fence
+	//   local_start_ptr = start_ptr
+	//   local_end_ptr = end_ptr
+	//   sync
+	//   if origin
+	//     Set start_ptr to &nodes[0]
+	//     Set end_ptr to &nodes[nbodies-1]
+	//   fence
+	//   sync
+
+	bsg_cuda_print_stat_end(1);
+	bsg_barrier_hw_tile_group_sync();
+	bsg_cuda_print_stat_kernel_end();
+	bsg_fence();
+	bsg_barrier_hw_tile_group_sync();
+	return;
+}
+
diff --git a/examples/apps/nbody/update.cpp b/examples/apps/nbody/update.cpp
new file mode 100644
index 000000000..3ac6d9f1d
--- /dev/null
+++ b/examples/apps/nbody/update.cpp
@@ -0,0 +1,32 @@
+#include
+#include
+
+#include
+#include
+
+
+// HBBodies should be attr_remote, but LLVM complains about the address space cast.
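+// The intended declaration would look like the following (hypothetical;
+// it does not currently compile, for the reason above):
+//
+//   extern "C" void update(Config *pcfg, int nBodies, HBBody bsg_attr_remote *HBBodies)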
+extern "C" void update(Config *pcfg, int nBodies, HBBody *HBBodies){ + + // Copy frequently used data to local + Config cfg = *pcfg; + + bsg_barrier_hw_tile_group_init(); + bsg_barrier_hw_tile_group_sync(); + bsg_cuda_print_stat_kernel_start(); + for (int cur = __bsg_id; cur < nBodies; cur += TILE_GROUP_DIM_X * TILE_GROUP_DIM_Y) { + HBBody b = HBBodies[cur]; + Point dvel(b.acc); + Point velh(b.vel); + dvel *= cfg.dthf; + velh += dvel; + b.pos += velh * cfg.dtime; + b.vel = velh + dvel; + HBBodies[cur] = b; + } + bsg_barrier_hw_tile_group_sync(); + bsg_cuda_print_stat_kernel_end(); + bsg_fence(); + bsg_barrier_hw_tile_group_sync(); + return; +}
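The update kernel is a leapfrog integration: the velocity is advanced by half a timestep, the position by a full step using that half-stepped velocity, and the stored velocity ends up advanced a further half step. A scalar sketch of one step, matching the kernel's arithmetic (illustration only; `dthf` is assumed to be `dtime / 2`, as the name suggests):

    // One leapfrog step for a single 1-D body.
    float leapfrog_step(float *pos, float *vel, float acc, float dtime, float dthf) {
        float dvel = acc * dthf;  // half-step velocity change
        float velh = *vel + dvel; // velocity at the half step
        *pos += velh * dtime;     // full position step at half-step velocity
        *vel  = velh + dvel;      // velocity advanced to the full step
        return velh;
    }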