Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Jacobi #779

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 130 additions & 0 deletions examples/cuda/test_jacobi/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# Copyright (c) 2021, University of Washington All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice, this list
# of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright notice, this
# list of conditions and the following disclaimer in the documentation and/or
# other materials provided with the distribution.
#
# Neither the name of the copyright holder nor the names of its contributors may
# be used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# This Makefile compiles, links, and executes examples. Run `make help`
# to see the available targets for the selected platform.

################################################################################
# environment.mk verifies the build environment and sets the following
# makefile variables:
#
# LIBRARIES_PATH: The path to the libraries directory
# HARDWARE_PATH: The path to the hardware directory
# EXAMPLES_PATH: The path to the examples directory
# BASEJUMP_STL_DIR: Path to a clone of BaseJump STL
# BSG_MANYCORE_DIR: Path to a clone of BSG Manycore
###############################################################################

# REPLICANT_PATH is resolved once at parse time (:=) by asking git for the
# top-level directory of the checkout that make is invoked from.
REPLICANT_PATH:=$(shell git rev-parse --show-toplevel)

include $(REPLICANT_PATH)/environment.mk
# SPMD_SRC_PATH points at the software/spmd directory inside BSG Manycore.
SPMD_SRC_PATH = $(BSG_MANYCORE_DIR)/software/spmd

# KERNEL_NAME is the name of the CUDA-Lite Kernel
KERNEL_NAME = jacobi

# Tile Group Dimensions
# These values are forwarded to the host build as -DTILE_GROUP_DIM_X/Y and to
# the device build as -Dbsg_tiles_X/Y further down in this file.
TILE_GROUP_DIM_X = 16
TILE_GROUP_DIM_Y = 8

###############################################################################
# Host code compilation flags and flow
###############################################################################

# TEST_SOURCES is a list of source files that need to be compiled
TEST_SOURCES = main.c

# Feature-test macros for the host build, followed by the tile group
# dimensions defined earlier in this file.
DEFINES += -D_XOPEN_SOURCE=500 -D_BSD_SOURCE -D_DEFAULT_SOURCE
DEFINES += -DTILE_GROUP_DIM_X=$(TILE_GROUP_DIM_X)
DEFINES += -DTILE_GROUP_DIM_Y=$(TILE_GROUP_DIM_Y)
# Nothing is added for C-only or C++-only defines; these appends are empty.
CDEFINES +=
CXXDEFINES +=

# FLAGS holds the options shared by the C and C++ host flags below.
FLAGS = -g -Wall -Wno-unused-function -Wno-unused-variable
CFLAGS += -std=c99 $(FLAGS)
CXXFLAGS += -std=c++11 $(FLAGS)

# compilation.mk defines rules for compilation of C/C++
include $(EXAMPLES_PATH)/compilation.mk

###############################################################################
# Host code link flags and flow
###############################################################################

# No additional link flags are appended for this example.
LDFLAGS +=

# link.mk defines rules for linking of the final execution binary.
include $(EXAMPLES_PATH)/link.mk

###############################################################################
# Device code compilation flow
###############################################################################

# BSG_MANYCORE_KERNELS is a list of manycore executables that should
# be built before executing.
BSG_MANYCORE_KERNELS = kernel.riscv

# Target-specific override: while building compute.rvo, RISCV_CC is set to
# $(RISCV_CLANG).
compute.rvo: RISCV_CC = $(RISCV_CLANG)
# The device binary depends on both the kernel entry object and the compute
# object.
kernel.riscv: kernel.rvo compute.rvo

# Expose the tile group dimensions to device code as bsg_tiles_X / bsg_tiles_Y.
RISCV_DEFINES += -Dbsg_tiles_X=$(TILE_GROUP_DIM_X)
RISCV_DEFINES += -Dbsg_tiles_Y=$(TILE_GROUP_DIM_Y)
RISCV_CCXXFLAGS += -O3
# -flto is appended to the link flags only; the per-object setting for
# compute.rvo on the next line is commented out.
#compute.rvo: RISCV_CFLAGS += -flto
RISCV_LDFLAGS += -flto

# NOTE(review): riscv.mk is expected to provide the rules that produce the
# .rvo objects and the .riscv binary -- confirm against that file.
include $(EXAMPLES_PATH)/cuda/riscv.mk

###############################################################################
# Execution flow
#
# C_ARGS: Use this to pass arguments that you want to appear in argv
# For SPMD tests C arguments are: <Path to RISC-V Binary> <Test Name>
#
# SIM_ARGS: Use this to pass arguments to the simulator
###############################################################################
# ?= only assigns when the variable is not already set, so a value supplied by
# the caller takes precedence. With the settings in this file the default
# expands to "kernel.riscv jacobi".
C_ARGS ?= $(BSG_MANYCORE_KERNELS) $(KERNEL_NAME)

# No simulator arguments by default.
SIM_ARGS ?=

# Include platform-specific execution rules
include $(EXAMPLES_PATH)/execution.mk

###############################################################################
# Regression Flow
###############################################################################

# regression: succeeds only when exec.log contains a "BSG REGRESSION TEST
# ... PASSED" line. grep exits non-zero when nothing matches, which fails the
# target. NOTE(review): exec.log is presumably produced by the rules in
# execution.mk -- confirm.
regression: exec.log
	@grep "BSG REGRESSION TEST .*PASSED.*" $< > /dev/null

.DEFAULT_GOAL := help

# Neither target names a real file. Declaring both phony prevents a stray file
# called `regression` or `clean` from making the target look up to date.
.PHONY: regression clean

# clean: remove the *.ld files in this directory (presumably generated linker
# scripts).
clean:
	rm -rf *.ld

87 changes: 87 additions & 0 deletions examples/cuda/test_jacobi/compute.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#include <bsg_manycore.h>
#include "bsg_cuda_lite_barrier.h"
#include <stdbool.h>
// Linear offset of element (_i, _j, _k) in a grid whose X extent is _nx and
// whose Y extent is _ny, with _i varying fastest: _i + _nx * (_j + _ny * _k).
// Every macro argument is parenthesized so that expression arguments such as
// `nx+1` keep their intended precedence after expansion.
#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+(_nx)*((_j)+(_ny)*(_k)))
// Copy 64 consecutive floats from src to dst, in four batches of 16.
// Within a batch, all 16 loads are written ahead of all 16 stores, and an
// empty asm statement with a "memory" clobber sits between the two phases so
// the compiler does not move memory accesses across that point.
void copyXAxis64(float bsg_attr_remote * bsg_attr_noalias src, float bsg_attr_remote * bsg_attr_noalias dst) {
    for (int base = 0; base < 64; base += 16) {
        // Load phase: read one batch of 16 values into locals.
        const float v0  = src[base + 0];
        const float v1  = src[base + 1];
        const float v2  = src[base + 2];
        const float v3  = src[base + 3];
        const float v4  = src[base + 4];
        const float v5  = src[base + 5];
        const float v6  = src[base + 6];
        const float v7  = src[base + 7];
        const float v8  = src[base + 8];
        const float v9  = src[base + 9];
        const float v10 = src[base + 10];
        const float v11 = src[base + 11];
        const float v12 = src[base + 12];
        const float v13 = src[base + 13];
        const float v14 = src[base + 14];
        const float v15 = src[base + 15];

        // Compiler-level barrier between the load phase and the store phase.
        asm volatile("": : :"memory");

        // Store phase: write the batch to the same offsets in dst.
        dst[base + 0]  = v0;
        dst[base + 1]  = v1;
        dst[base + 2]  = v2;
        dst[base + 3]  = v3;
        dst[base + 4]  = v4;
        dst[base + 5]  = v5;
        dst[base + 6]  = v6;
        dst[base + 7]  = v7;
        dst[base + 8]  = v8;
        dst[base + 9]  = v9;
        dst[base + 10] = v10;
        dst[base + 11] = v11;
        dst[base + 12] = v12;
        dst[base + 13] = v13;
        dst[base + 14] = v14;
        dst[base + 15] = v15;
    }
}


// Jacobi update of the X-axis line at grid coordinates (j, k).
//
// The line is walked in windows: each pass copies 64 consecutive inputs of
// A0 (starting one element before the first output) into a_self and writes
// the 62 interior results to Anext, then advances by 62.
//
//   c0, c1           weights: result = (sum of six neighbours) * c1 - self * c0
//   A0, Anext        input grid and output grid, indexed with Index3D
//   a_left/a_right   buffers read as the (j-1) and (j+1) neighbour lines
//   a_up/a_down      buffers read as the (k-1) and (k+1) neighbour lines
//   a_self           buffer that receives this line's 64-element window
//   *_bound flags    when set, the matching neighbour buffer is filled here
//                    from A0; when clear, nothing is copied into it here and
//                    it is only read
//   nx, ny           X and Y extents used for indexing; nz is not used here
//
// bsg_barrier_hw_tile_group_sync() is called once after the window has been
// staged and once after the outputs have been written.
// NOTE(review): presumably the first call makes every tile's a_self visible
// to its neighbours before it is read, and the second keeps a tile from
// overwriting a_self while neighbours still read it -- confirm.
void compute(int c0, int c1, float bsg_attr_remote * bsg_attr_noalias A0, float bsg_attr_remote * bsg_attr_noalias Anext, float bsg_attr_remote * bsg_attr_noalias a_left, float bsg_attr_remote * bsg_attr_noalias a_right, float bsg_attr_remote * bsg_attr_noalias a_up, float bsg_attr_remote * bsg_attr_noalias a_down, float bsg_attr_remote * bsg_attr_noalias a_self, bool x_l_bound, bool x_h_bound, bool y_l_bound, bool y_h_bound, const int nx, const int ny, const int nz, const int j, const int k){
    for (int x_out = 1; x_out < nx-1; x_out += 62) {
        // X index of the first element in the 64-wide input window.
        const int x_in = x_out - 1;

        // Initial load -- 64 inputs are staged to produce 62 outputs.
        if (x_l_bound) {
            copyXAxis64(&(A0[Index3D (nx, ny, x_in, j-1, k)]), a_left);
        }
        if (x_h_bound) {
            copyXAxis64(&(A0[Index3D (nx, ny, x_in, j+1, k)]), a_right);
        }
        if (y_l_bound) {
            copyXAxis64(&(A0[Index3D (nx, ny, x_in, j, k-1)]), a_up);
        }
        if (y_h_bound) {
            copyXAxis64(&(A0[Index3D (nx, ny, x_in, j, k+1)]), a_down);
        }

        copyXAxis64(&(A0[Index3D (nx, ny, x_in, j, k)]), a_self);
        bsg_barrier_hw_tile_group_sync();

        bsg_unroll(8)
        for (int i = 1; i < 63; i++) {
            // Neighbours along X come from a_self itself (i+1 and i-1); the
            // other four come from the neighbour buffers at the same offset.
            const float neighbours = a_self[i+1] + a_self[i-1] + a_left[i] + a_right[i] + a_up[i] + a_down[i];

            // Jacobi
            Anext[Index3D (nx, ny, x_in+i, j, k)] = neighbours * c1 - a_self[i] * c0;
        }
        bsg_barrier_hw_tile_group_sync();
    }
}
79 changes: 79 additions & 0 deletions examples/cuda/test_jacobi/kernel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// This kernel performs a Jacobi stencil update: it reads grid A0 and writes the result to Anext

#define BSG_TILE_GROUP_X_DIM bsg_tiles_X
#define BSG_TILE_GROUP_Y_DIM bsg_tiles_Y
#include <math.h>
#include "bsg_manycore.h"
#include "bsg_set_tile_x_y.h"
#include "bsg_group_strider.hpp"
#include "bsg_cuda_lite_barrier.h"

extern "C" void compute(int c0, int c1, float * bsg_attr_noalias A0, float * bsg_attr_noalias Anext, float * bsg_attr_noalias a_left, float * bsg_attr_noalias a_right, float * bsg_attr_noalias a_up, float * bsg_attr_noalias a_down, float * bsg_attr_noalias a_self, bool x_l_bound, bool x_h_bound, bool y_l_bound, bool y_h_bound, const int nx, const int ny, const int nz, const int j, const int k);

// Kernel entry point. Each tile handles the X-axis line at
// (j, k) = (__bsg_x + 1, __bsg_y + 1) and hands the work to compute().
//
//   c0, c1      weights forwarded unchanged to compute()
//   A0, Anext   input and output grids forwarded unchanged to compute()
//   nx, ny, nz  grid extents forwarded unchanged to compute()
//
// Always returns 0.
extern "C" __attribute__ ((noinline)) __attribute__((used))
int kernel_jacobi(int c0, int c1, float *A0, float * Anext,
                  const int nx, const int ny, const int nz) {

    // A tile is on a boundary of the tile group when it has no neighbour on
    // that side.
    const bool x_l_bound = (__bsg_x == 0);
    const bool x_h_bound = (__bsg_x == (bsg_tiles_X-1));
    const bool y_l_bound = (__bsg_y == 0);
    const bool y_h_bound = (__bsg_y == (bsg_tiles_Y-1));

    // Grid coordinates of the line owned by this tile.
    const int j = __bsg_x + 1;
    const int k = __bsg_y + 1;

    bsg_nonsynth_saif_start();
    bsg_barrier_hw_tile_group_init();
    bsg_fence();
    bsg_barrier_hw_tile_group_sync();
    bsg_cuda_print_stat_kernel_start();

    // Window of this tile's own line; compute() fills it from A0.
    float a_self[64] = {0.0f};

    // Auxiliary buffers: each is used only when the tile sits on the
    // corresponding boundary, in which case compute() fills it from A0.
    float aux_left[64];
    float aux_right[64];
    float aux_up[64];
    float aux_down[64];

    // Start with every neighbour pointer on its auxiliary buffer, then
    // redirect each side that has a neighbouring tile to the pointer the
    // strider builds from a_self and that neighbour's coordinates.
    float *a_left  = aux_left;
    float *a_right = aux_right;
    float *a_up    = aux_up;
    float *a_down  = aux_down;

    if (!x_l_bound) {
        bsg_tile_group_strider<BSG_TILE_GROUP_X_DIM, 0, BSG_TILE_GROUP_Y_DIM, 0, float> r_left(a_self, __bsg_x-1, __bsg_y);
        a_left = r_left.ptr;
    }
    if (!x_h_bound) {
        bsg_tile_group_strider<BSG_TILE_GROUP_X_DIM, 0, BSG_TILE_GROUP_Y_DIM, 0, float> r_right(a_self, __bsg_x+1, __bsg_y);
        a_right = r_right.ptr;
    }
    if (!y_l_bound) {
        bsg_tile_group_strider<BSG_TILE_GROUP_X_DIM, 0, BSG_TILE_GROUP_Y_DIM, 0, float> r_up(a_self, __bsg_x, __bsg_y-1);
        a_up = r_up.ptr;
    }
    if (!y_h_bound) {
        bsg_tile_group_strider<BSG_TILE_GROUP_X_DIM, 0, BSG_TILE_GROUP_Y_DIM, 0, float> r_down(a_self, __bsg_x, __bsg_y+1);
        a_down = r_down.ptr;
    }

    compute(c0, c1, A0, Anext, a_left, a_right, a_up, a_down, a_self, x_l_bound, x_h_bound, y_l_bound, y_h_bound, nx, ny, nz, j, k);

    bsg_cuda_print_stat_kernel_end();
    bsg_fence();
    bsg_barrier_hw_tile_group_sync();
    bsg_nonsynth_saif_end();
    return 0;
}
Loading