bespoke-silicon-group · yodada · Mar 29, 2022 · Mar 29, 2022 · Mar 29, 2022 · Mar 31, 2022
diff --git a/examples/cuda/test_jacobi/Makefile b/examples/cuda/test_jacobi/Makefile
@@ -0,0 +1,130 @@
+# Copyright (c) 2021, University of Washington All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# Redistributions of source code must retain the above copyright notice, this list
+# of conditions and the following disclaimer.
+#
+# Redistributions in binary form must reproduce the above copyright notice, this
+# list of conditions and the following disclaimer in the documentation and/or
+# other materials provided with the distribution.
+#
+# Neither the name of the copyright holder nor the names of its contributors may
+# be used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# This Makefile compiles, links, and executes the example. Run `make help`
+# to see the available targets for the selected platform.
+
+################################################################################
+# environment.mk verifies the build environment and sets the following
+# makefile variables:
+#
+# LIBRARIES_PATH: The path to the libraries directory
+# HARDWARE_PATH: The path to the hardware directory
+# EXAMPLES_PATH: The path to the examples directory
+# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL
+# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore
+###############################################################################
+
+# REPLICANT_PATH is the top level of the enclosing git checkout, so make must
+# be invoked from inside the repository's working tree.
+REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)
+
+include $(REPLICANT_PATH)/environment.mk
+# Not referenced again in this Makefile; presumably consumed by one of the
+# makefiles included further down -- confirm before removing.
+SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd
+
+# KERNEL_NAME is the name of the CUDA-Lite Kernel
+# (it becomes the second entry of C_ARGS below)
+KERNEL_NAME = jacobi
+
+# Tile Group Dimensions
+# Forwarded to the host build as TILE_GROUP_DIM_X/Y and to the device build
+# as bsg_tiles_X/Y.
+TILE_GROUP_DIM_X = 16
+TILE_GROUP_DIM_Y = 8
+
+###############################################################################
+# Host code compilation flags and flow
+###############################################################################
+
+# TEST_SOURCES is a list of source files that need to be compiled
+TEST_SOURCES = main.c
+
+# Feature-test macros for the host build, followed by the tile-group
+# dimensions so that host code is compiled with the same values the device
+# kernel receives as bsg_tiles_X / bsg_tiles_Y.
+DEFINES += -D_XOPEN_SOURCE=500 -D_BSD_SOURCE -D_DEFAULT_SOURCE
+DEFINES += -DTILE_GROUP_DIM_X=$(TILE_GROUP_DIM_X)
+DEFINES += -DTILE_GROUP_DIM_Y=$(TILE_GROUP_DIM_Y)
+# No language-specific defines are added by this example; the empty appends
+# leave any inherited value untouched.
+CDEFINES +=
+CXXDEFINES +=
+
+# Flags shared by the C and C++ host compiles: debug info and -Wall, with the
+# unused-function and unused-variable warnings switched off.
+FLAGS     = -g -Wall -Wno-unused-function -Wno-unused-variable
+CFLAGS   += -std=c99 $(FLAGS)
+CXXFLAGS += -std=c++11 $(FLAGS)
+
+# compilation.mk defines rules for compilation of C/C++
+include $(EXAMPLES_PATH)/compilation.mk
+
+###############################################################################
+# Host code link flags and flow
+###############################################################################
+
+# This example adds no linker flags of its own; the empty append leaves any
+# inherited LDFLAGS value untouched.
+LDFLAGS +=
+
+# link.mk defines rules for linking of the final execution binary.
+include $(EXAMPLES_PATH)/link.mk
+
+###############################################################################
+# Device code compilation flow
+###############################################################################
+
+# BSG_MANYCORE_KERNELS is a list of manycore executables that should
+# be built before executing.
+BSG_MANYCORE_KERNELS = kernel.riscv
+
+# Target-specific override: only compute.rvo is compiled with RISCV_CLANG;
+# kernel.rvo keeps whatever RISCV_CC the included rules select.
+compute.rvo: RISCV_CC = $(RISCV_CLANG)
+# The device binary is linked from both objects.
+kernel.riscv: kernel.rvo compute.rvo
+
+# The device code sees the tile-group dimensions as bsg_tiles_X / bsg_tiles_Y.
+RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X)
+RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y)
+RISCV_CCXXFLAGS += -O3
+# NOTE(review): -flto is passed at link time below, while the compile-time
+# -flto for compute.rvo is disabled on the next line -- confirm this
+# combination is intended.
+#compute.rvo: RISCV_CFLAGS += -flto
+RISCV_LDFLAGS += -flto
+
+# The variables above are set before this include, presumably so that the
+# rules in riscv.mk pick them up.
+include $(EXAMPLES_PATH)/cuda/riscv.mk
+
+###############################################################################
+# Execution flow
+#
+# C_ARGS: Use this to pass arguments that you want to appear in argv
+#         For SPMD tests C arguments are: <Path to RISC-V Binary> <Test Name>
+#
+# SIM_ARGS: Use this to pass arguments to the simulator
+###############################################################################
+# With the values set in this Makefile this expands to "kernel.riscv jacobi".
+# `?=` only assigns when C_ARGS is not already defined, so a value supplied by
+# the caller takes precedence.
+C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME)
+
+# No simulator arguments by default; may likewise be supplied by the caller.
+SIM_ARGS ?=
+
+# Include platform-specific execution rules
+include $(EXAMPLES_PATH)/execution.mk
+
+###############################################################################
+# Regression Flow
+###############################################################################
+
+# `regression` succeeds only when exec.log contains a line matching
+# "BSG REGRESSION TEST ... PASSED". grep's exit status is the result of the
+# target; its output is discarded.
+regression: exec.log
+	@grep "BSG REGRESSION TEST .*PASSED.*" $< > /dev/null
+
+.DEFAULT_GOAL := help
+
+# Neither target creates a file of its own name, so both are declared phony.
+# Without this, a stray file called `regression` or `clean` in this directory
+# would make the target look up to date and its recipe would be skipped.
+.PHONY: regression clean
+
+# Removes the *.ld files in this directory.
+clean:
+	rm -rf *.ld
+
diff --git a/examples/cuda/test_jacobi/compute.c b/examples/cuda/test_jacobi/compute.c
@@ -0,0 +1,87 @@
+#include <bsg_manycore.h>
+#include "bsg_cuda_lite_barrier.h"
+#include <stdbool.h>
+// Flattens an (_i, _j, _k) coordinate into a linear offset for an array whose
+// fastest-varying dimension has extent _nx and whose second dimension has
+// extent _ny. Every parameter is parenthesized in the expansion so that
+// expression arguments (for example nx+1) keep the intended precedence.
+#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+(_nx)*((_j)+(_ny)*(_k)))
+// Copies 64 consecutive floats from src to dst in four groups of 16.
+// Within a group, all 16 elements are read into locals before any element is
+// written. The empty asm statement with a "memory" clobber separates the two
+// phases so the compiler does not move the stores ahead of the loads.
+void copyXAxis64(float bsg_attr_remote * bsg_attr_noalias src, float bsg_attr_remote * bsg_attr_noalias dst) {
+  for (int chunk = 0; chunk < 4; chunk++, dst += 16, src += 16) {
+    // Phase 1: read the whole group.
+    float v0  = src[0],  v1  = src[1],  v2  = src[2],  v3  = src[3];
+    float v4  = src[4],  v5  = src[5],  v6  = src[6],  v7  = src[7];
+    float v8  = src[8],  v9  = src[9],  v10 = src[10], v11 = src[11];
+    float v12 = src[12], v13 = src[13], v14 = src[14], v15 = src[15];
+
+    asm volatile("": : :"memory");
+
+    // Phase 2: write the whole group.
+    dst[0]  = v0;  dst[1]  = v1;  dst[2]  = v2;  dst[3]  = v3;
+    dst[4]  = v4;  dst[5]  = v5;  dst[6]  = v6;  dst[7]  = v7;
+    dst[8]  = v8;  dst[9]  = v9;  dst[10] = v10; dst[11] = v11;
+    dst[12] = v12; dst[13] = v13; dst[14] = v14; dst[15] = v15;
+  }
+}
+
+
+// Jacobi update for the single (j, k) row of the grid handled by the caller.
+//
+// The row is processed along X in windows: each pass stages 64 consecutive
+// inputs starting at x = ii-1 into a_self and writes the 62 outputs for
+// x = ii .. ii+61 into Anext.
+//
+//   c0, c1      coefficients; they arrive as int and are converted inside the
+//               float expression that produces each output.
+//   A0, Anext   input and output grids, addressed via Index3D(nx, ny, x, j, k).
+//   a_self      64-float staging buffer for this row.
+//   a_left, a_right, a_up, a_down
+//               64-float buffers for rows j-1, j+1, k-1 and k+1. A buffer is
+//               filled from A0 here only when its matching *_bound flag is
+//               set; otherwise it is only read.
+//   nz          not used.
+//
+// NOTE(review): every window copies a full 64 inputs and stores a full 62
+// outputs no matter how much of the row remains, so this looks like it
+// expects nx-2 to be a multiple of 62 -- confirm against the host code.
+void compute(int c0, int c1, float bsg_attr_remote * bsg_attr_noalias A0, float bsg_attr_remote * bsg_attr_noalias Anext, float bsg_attr_remote * bsg_attr_noalias a_left, float bsg_attr_remote * bsg_attr_noalias a_right, float bsg_attr_remote * bsg_attr_noalias a_up, float bsg_attr_remote * bsg_attr_noalias a_down, float bsg_attr_remote * bsg_attr_noalias a_self, bool x_l_bound, bool x_h_bound, bool y_l_bound, bool y_h_bound, const int nx, const int ny, const int nz, const int j, const int k){
+  for (int ii = 1; ii < nx-1; ii += 62) {
+    // First input element of this window (one before the first output).
+    const int x0 = ii - 1;
+    // Linear offset of that element within row (j, k); also the base offset
+    // for the outputs written below.
+    const int row = Index3D (nx, ny, x0, j, k);
+
+    // Flagged sides take their neighbouring row straight from A0.
+    if (x_l_bound) {
+      copyXAxis64(A0 + Index3D (nx, ny, x0, j-1, k), a_left);
+    }
+    if (x_h_bound) {
+      copyXAxis64(A0 + Index3D (nx, ny, x0, j+1, k), a_right);
+    }
+    if (y_l_bound) {
+      copyXAxis64(A0 + Index3D (nx, ny, x0, j, k-1), a_up);
+    }
+    if (y_h_bound) {
+      copyXAxis64(A0 + Index3D (nx, ny, x0, j, k+1), a_down);
+    }
+
+    // Stage this row, then synchronize the tile group before any of the
+    // neighbour buffers are read.
+    copyXAxis64(A0 + row, a_self);
+    bsg_barrier_hw_tile_group_sync();
+
+    bsg_unroll(8)
+    for (int i = 1; i < 63; i++) {
+      // Six-point neighbourhood: x+1 and x-1 from this row, then the four
+      // adjacent rows at the same x.
+      const float neighbors = a_self[i+1] + a_self[i-1] + a_left[i] + a_right[i] + a_up[i] + a_down[i];
+
+      // Jacobi
+      Anext[row + i] = neighbors * c1 - a_self[i] * c0;
+    }
+
+    // Synchronize again so the staging buffers are not refilled by the next
+    // window until the whole tile group has finished this one.
+    bsg_barrier_hw_tile_group_sync();
+  }
+}
diff --git a/examples/cuda/test_jacobi/kernel.cpp b/examples/cuda/test_jacobi/kernel.cpp
@@ -0,0 +1,79 @@
+// This kernel runs a Jacobi stencil sweep; each tile handles one (j, k) row of the grid.
+
+#define BSG_TILE_GROUP_X_DIM bsg_tiles_X
+#define BSG_TILE_GROUP_Y_DIM bsg_tiles_Y
+#include <math.h>
+#include "bsg_manycore.h"
+#include "bsg_set_tile_x_y.h"
+#include "bsg_group_strider.hpp"
+#include "bsg_cuda_lite_barrier.h"
+
+// Row-level Jacobi routine, defined in compute.c and therefore declared with
+// C linkage. Note that the definition additionally qualifies every float
+// pointer with bsg_attr_remote; this declaration omits that qualifier.
+extern "C" void compute(int c0, int c1, float * bsg_attr_noalias A0, float * bsg_attr_noalias Anext, float * bsg_attr_noalias a_left, float * bsg_attr_noalias a_right, float * bsg_attr_noalias a_up, float * bsg_attr_noalias a_down, float * bsg_attr_noalias a_self, bool x_l_bound, bool x_h_bound, bool y_l_bound, bool y_h_bound, const int nx, const int ny, const int nz, const int j, const int k);
+
+// Kernel entry point. Sets up the per-tile staging buffers and the pointers
+// to the four neighbouring tiles' buffers, then hands the row owned by this
+// tile to compute().
+//
+//   c0, c1       stencil coefficients, forwarded unchanged.
+//   A0, Anext    input and output grids, forwarded unchanged.
+//   nx, ny, nz   grid extents, forwarded unchanged.
+//
+// Always returns 0.
+extern "C" __attribute__ ((noinline)) __attribute__((used))
+int kernel_jacobi(int c0, int c1, float *A0, float * Anext,
+                  const int nx, const int ny, const int nz) {
+
+  // Strider instantiation used below only to obtain a pointer into another
+  // tile's copy of a_self.
+  typedef bsg_tile_group_strider<BSG_TILE_GROUP_X_DIM, 0, BSG_TILE_GROUP_Y_DIM, 0, float> neighbor_strider;
+
+  // Row of the grid handled by this tile: tile coordinates offset by one
+  // (presumably to skip the grid's boundary layer -- confirm with the host).
+  const int j = __bsg_x + 1;
+  const int k = __bsg_y + 1;
+
+  // A tile on an edge of the tile group has no neighbour on that side.
+  const bool x_l_bound = (0 == __bsg_x);
+  const bool x_h_bound = ((bsg_tiles_X-1) == __bsg_x);
+  const bool y_l_bound = (0 == __bsg_y);
+  const bool y_h_bound = ((bsg_tiles_Y-1) == __bsg_y);
+
+  bsg_nonsynth_saif_start();
+  bsg_barrier_hw_tile_group_init();
+  bsg_fence();
+  bsg_barrier_hw_tile_group_sync();
+  bsg_cuda_print_stat_kernel_start();
+
+  // Staging buffer for this tile's own row; the neighbour pointers built
+  // below refer to this same array on the adjacent tiles.
+  float a_self[64] = {0.0f};
+
+  // Auxiliary buffers, used only on sides where this tile has no neighbour.
+  float aux_left[64];
+  float aux_right[64];
+  float aux_up[64];
+  float aux_down[64];
+
+  // Start from the local auxiliary buffer on every side ...
+  float *a_left  = aux_left;
+  float *a_right = aux_right;
+  float *a_up    = aux_up;
+  float *a_down  = aux_down;
+
+  // ... and switch to the neighbouring tile's a_self wherever one exists.
+  if (!x_l_bound) {
+    neighbor_strider to_left(a_self,  __bsg_x-1, __bsg_y);
+    a_left = to_left.ptr;
+  }
+  if (!x_h_bound) {
+    neighbor_strider to_right(a_self,  __bsg_x+1, __bsg_y);
+    a_right = to_right.ptr;
+  }
+  if (!y_l_bound) {
+    neighbor_strider to_up(a_self,  __bsg_x, __bsg_y-1);
+    a_up = to_up.ptr;
+  }
+  if (!y_h_bound) {
+    neighbor_strider to_down(a_self,  __bsg_x, __bsg_y+1);
+    a_down = to_down.ptr;
+  }
+
+  compute(c0, c1, A0, Anext, a_left, a_right, a_up, a_down, a_self, x_l_bound, x_h_bound, y_l_bound, y_h_bound, nx, ny, nz, j, k);
+
+  bsg_cuda_print_stat_kernel_end();
+  bsg_fence();
+  bsg_barrier_hw_tile_group_sync();
+  bsg_nonsynth_saif_end();
+  return 0;
+}