Skip to content

Commit

Permalink
gaussian
Browse files Browse the repository at this point in the history
  • Loading branch information
jk78346 committed Jul 29, 2021
1 parent 02350e2 commit b6ce613
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 82 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ simple.o:

run:
./app/pkgs/apps/blackscholes/src/blackscholes 1 ./app/pkgs/apps/blackscholes/src/input_1024x1024.txt ./app/pkgs/apps/blackscholes/src/out.txt
./app/gaussian-elimination-pthreads-openmp/openmp.out 1024 1 1
./app/gaussian-elimination-pthreads-openmp/openmp.out 16 1 1
./app/rodinia_3.1/openmp/lud/omp/lud_omp -s 1024
./app/rodinia_3.1/openmp/backprop/backprop 1024
./obj/openctpu
Expand Down
3 changes: 2 additions & 1 deletion app/gaussian-elimination-pthreads-openmp/Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
CC:=g++ --std=c++11 -O3 -g -lgptpu
LD=-lgptpu -ldl -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -lopenblas -lpthread

program: gauss.c threads_gauss.c chunk_threads_gauss.c openmp_gauss.c chunk_thpool_gauss.c thpool_gauss.c thpool.c

Expand All @@ -8,7 +9,7 @@ program: gauss.c threads_gauss.c chunk_threads_gauss.c openmp_gauss.c chunk_thpo

# $(CC) chunk_threads_gauss.c -pthread -o chunkthreads.out

$(CC) openmp_gauss.c -fopenmp -o openmp.out
$(CC) openmp_gauss.c -fopenmp $(LD) -o openmp.out

# $(CC) chunk_thpool_gauss.c thpool.c -pthread -o chunkpool.out

Expand Down
10 changes: 5 additions & 5 deletions app/gaussian-elimination-pthreads-openmp/gauss.c
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,10 @@ int main(int argc, char **argv) {
initialize_inputs();

/* Print input matrices */
// print_inputs();
print_inputs();

/* Start Clock */
// printf("\nStarting clock.\n");
printf("\nStarting clock.\n");
gettimeofday(&etstart, &tzdummy);
etstart2 = times(&cputstart);

Expand All @@ -143,7 +143,7 @@ int main(int argc, char **argv) {
/* Stop Clock */
gettimeofday(&etstop, &tzdummy);
etstop2 = times(&cputstop);
// printf("Stopped clock.\n");
printf("Stopped clock.\n");
usecstart = (unsigned long long)etstart.tv_sec * 1000000 + etstart.tv_usec;
usecstop = (unsigned long long)etstop.tv_sec * 1000000 + etstop.tv_usec;

Expand All @@ -168,7 +168,7 @@ int main(int argc, char **argv) {
(cputstart.tms_cutime + cputstart.tms_cstime) ) /
(float)CLOCKS_PER_SEC * 1000);
/* Contrary to the man pages, this appears not to include the parent */
// printf("--------------------------------------------\n");
printf("--------------------------------------------\n");

exit(0);
}
Expand All @@ -184,7 +184,7 @@ void gauss() {
* element row and col */
float multiplier;

// printf("Computing Serially.\n");
printf("Computing Serially.\n");

/* Gaussian elimination */
for (norm = 0; norm < N - 1; norm++) {
Expand Down
156 changes: 81 additions & 75 deletions app/gaussian-elimination-pthreads-openmp/openmp_gauss.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,12 @@ void parameters(int argc, char **argv) {
if (argc == 4) {
seed = atoi(argv[3]);
srand(seed);
// printf("Random seed = %i\n", seed);
printf("Random seed = %i\n", seed);
}
if (argc >= 3) {
seed = atoi(argv[2]);
NumThreads = seed;
// printf("# of open threads = %i\n", seed);
printf("# of open threads = %i\n", seed);
}
if (argc >= 2) {
N = atoi(argv[1]);
Expand All @@ -91,20 +91,6 @@ void parameters(int argc, char **argv) {
printf("\nMatrix dimension N = %i.\n", N);
}

/* Populate the CPU and TPU copies of A and B with identical pseudo-random
 * values and clear both X vectors.
 *
 * Each value is rand() scaled into the range [0, 65536]. A is filled one
 * column at a time (all rows of a column, then B for that index), so the
 * sequence of rand() draws is: N entries of column 0, B[0], N entries of
 * column 1, B[1], and so on.
 */
void initialize_inputs() {
  /* One pseudo-random draw, scaled to [0, 65536]. */
  auto next_value = []() { return (rand() / (float)RAND_MAX) * 65536; };

  // printf("\nInitializing...\n");
  for (int c = 0; c < N; ++c) {
    for (int r = 0; r < N; ++r) {
      /* Same value goes to both copies so CPU and TPU runs start equal. */
      cpu_A[r][c] = tpu_A[r][c] = next_value();
    }
    cpu_B[c] = tpu_B[c] = next_value();
    cpu_X[c] = tpu_X[c] = 0.0;
  }
}

/* Print input matrices */
void print_inputs(float** A, float* B) {
Expand All @@ -131,26 +117,43 @@ void print_X(float* X) {
printf("%5.2f%s", X[row], (row < N-1) ? "; " : "]\n");
}
}
/* Fill the CPU and TPU copies of A, B and X with a deterministic pattern.
 *
 * A: every entry in the first row or first column is 1; any other entry is
 *    the sum of its upper-left and upper neighbours, both read from cpu_A.
 * B: B[row] is a copy of the last column of A in that row (A[row][N-1]).
 * X: cleared to 0.0.
 *
 * Rows are processed top to bottom because each row depends on the one
 * above it.
 */
void initialize_inputs() {
  printf("\nInitializing...\n");
  for (int r = 0; r < N; ++r) {
    for (int c = 0; c < N; ++c) {
      const bool on_border = (r == 0) || (c == 0);
      /* The conditional is lazy, so row r-1 is only read when r > 0 and c > 0. */
      cpu_A[r][c] = tpu_A[r][c] =
          on_border ? 1 : cpu_A[r - 1][c - 1] + cpu_A[r - 1][c];
    }
    cpu_B[r] = tpu_B[r] = cpu_A[r][N - 1];
    cpu_X[r] = tpu_X[r] = 0.0;
  }
}

void compare_X(float* cpu_X, float* tpu_X){
int cnt = 0;
for(int i = 0 ; i < N ; i++){
if(abs(cpu_X[i] - tpu_X[i]) > 1e-5){
cnt += 1;
// if(cnt < 10){
// printf("wrong: cpu_X[%d]: %f | tpu_X[%d]: %f\n", i, cpu_X[i], i , tpu_X[i]);
// }
if(cnt < 10){
printf("wrong: cpu_X[%d]: %f | tpu_X[%d]: %f\n", i, cpu_X[i], i , tpu_X[i]);
}
}else{
// if(i < 10){
// printf("cpu_X[%d]: %f | tpu_X[%d]: %f\n", i, cpu_X[i], i , tpu_X[i]);
// }
if(i < 10){
printf("cpu_X[%d]: %f | tpu_X[%d]: %f\n", i, cpu_X[i], i , tpu_X[i]);
}
}
}
if(cnt == 0){
printf("Verify pass!\n");
}else{
printf("Verify fail, (%d/%d)\n", cnt, N);
}
// if(cnt == 0){
// printf("Verify pass!\n");
// }else{
// printf("Verify fail, (%d/%d)\n", cnt, N);
// }

double avg = 0;
double rate = 0;
Expand All @@ -164,6 +167,13 @@ void compare_X(float* cpu_X, float* tpu_X){
std::cout << "RMSE: " << RMSE << ", blas_c avg: " << avg << ", RMSE pecentage: " << (RMSE/avg)*100 << "%" << ", error rate: " << (rate/avg)*100 << "%" << std::endl;

}
/* Element-wise (Hadamard) product of two row-major A x B integer matrices.
 *
 *   rows_int  : output buffer, at least A*B ints
 *   mf_int    : first operand, A*B ints
 *   rows_int2 : second operand, A*B ints
 *   A, B      : number of rows / columns
 *
 * Each output element depends only on the inputs at the same index, and
 * elements are written in increasing index order, so the output buffer may
 * be the same buffer as either operand. Nothing is written when either
 * dimension is non-positive.
 */
void mul(int* rows_int, int* mf_int, int* rows_int2, int A, int B){
  if (A <= 0 || B <= 0) {
    return;
  }
  /* A row-major A x B matrix is just A*B contiguous elements. */
  const long long total = static_cast<long long>(A) * B;
  for (long long idx = 0; idx < total; ++idx) {
    rows_int[idx] = mf_int[idx] * rows_int2[idx];
  }
}

int main(int argc, char **argv) {
/* Timing variables */
Expand Down Expand Up @@ -191,25 +201,27 @@ int main(int argc, char **argv) {
initialize_inputs();

/* Print input matrices */
// print_inputs(cpu_A, cpu_B);
print_inputs(cpu_A, cpu_B);

/* Start Clock */
// printf("\nStarting clock.\n");
printf("\nStarting clock.\n");
gettimeofday(&etstart, &tzdummy);
etstart2 = times(&cputstart);

/* Gaussian Elimination */
gauss();
print_X(cpu_X);
gauss_tpu();
/* Stop Clock */
gettimeofday(&etstop, &tzdummy);
etstop2 = times(&cputstop);
// printf("Stopped clock.\n");
printf("Stopped clock.\n");
usecstart = (unsigned long long)etstart.tv_sec * 1000000 + etstart.tv_usec;
usecstop = (unsigned long long)etstop.tv_sec * 1000000 + etstop.tv_usec;

/* Compare output */
compare_X(cpu_X, tpu_X);
//compare_X(cpu_B, tpu_B);

/* Display timing results */
printf("\nElapsed time = %g ms.\n",
Expand Down Expand Up @@ -262,12 +274,15 @@ void gauss() {
// printf("B[%d] -= B[%d] * m (norm=%d)\n", row, norm, norm);
cpu_B[row] -= cpu_B[norm] * multiplier;
}
// for(int i = 0 ; i < N ; i++){
// printf("B[%d]= %f (norm=%d)\n", i, cpu_B[i], norm);
// }
}
timing f_e = clk::now();
/* (Diagonal elements are not normalized to 1. This is treated in back
* substitution.)
*/

print_inputs(cpu_A, cpu_B);
// printf("print A and B before back subsitution\n");
// print_inputs(cpu_A, cpu_B);

Expand Down Expand Up @@ -312,32 +327,33 @@ void gauss_tpu() {
float RMAX = FLT_MIN, RMIN = FLT_MAX;
int* rows_int = (int*) malloc((N+1)*(N+1)*sizeof(int));
int* mf_int = (int*) malloc((N+1)*(N+1)*sizeof(int));
// for (norm = 0; norm < N - 1; norm++) {
// for (row = norm + 1; row < N; row++) {
// multiplier = cpu_A[row][norm] / cpu_A[norm][norm];
// for (col = norm; col < N; col++) {
// cpu_A[row][col] -= cpu_A[norm][col] * multiplier;
// }
// cpu_B[row] -= cpu_B[norm] * multiplier;
// }
// }


for (norm = 0; norm < N - 1; norm++) {
// #pragma omp parallel for shared(A, B) private(multiplier,row,col)
// TODO: prepare matrix: [rows], and matrix: [mutipliers]
mfs = clk::now();
for(int i = 0 ; i < (N-norm+1); i++){
multiplier = tpu_A[i+norm+1][norm] / tpu_A[norm][norm];
for(int j = 0 ; j < (N-norm+1) ; j++){
mf[i*(N-norm+1)+j] = multiplier;// same for each row
}
}
mfe = clk::now();
mfus += std::chrono::duration_cast<std::chrono::nanoseconds>(mfe-mfs).count()/1000.0;
rs = clk::now();
for(int i = 0 ; i < (N-norm-1) ; i++){
for(int j = 0 ; j < (N-norm) ; j++){
rows[i*(N-norm+1)+j] = tpu_A[norm][j+norm];
}
rows[i*(N-norm+1)+N-norm] = tpu_B[norm];
}
re = clk::now();
rus += std::chrono::duration_cast<std::chrono::nanoseconds>(re-rs).count()/1000.0;
// ========== the gptpu_mul kernel ==========
// printf("for norm = %d, (%d)*(%d) = (%d)\n", norm, (N-norm-1), (N-norm+1), (N-norm-1)*(N-norm+1));
ks = clk::now();
// gptpu_mul(row, mf, row, (N-norm-1), (N-norm+1));
// printf("mul shape: %d, %d\n", N-norm-1, N-norm+1);
//gptpu_mul(rows_int, mf_int, rows_int, (N-norm-1), (N-norm+1));
// mul(rows_int, mf_int, rows_int, (N-norm-1), (N-norm+1));
// ====== quantize to uint8 =============================
MMAX = FLT_MIN, MMIN = FLT_MAX;
RMAX = FLT_MIN, RMIN = FLT_MAX;
Expand All @@ -350,64 +366,54 @@ void gauss_tpu() {
for(int i = 0 ; i < (N-norm-1)*(N-norm+1) ; i++){
rows_int[i] = (int)((rows[i] / RMAX) * 255);
mf_int[i] = (int)((mf[i] / MMAX) * 255);
// printf("rows_int: %d, mf_int: %d, ", rows_int[i], mf_int[i]);
// printf("RMAX: %f, RMIN: %f, MMAX: %f, MMIN: %f, norm: %d\n", RMAX, RMIN, MMAX, MMIN, norm);
// if(RMIN < 0){printf("i: %d, rows[i]: %f\n", i, rows[i]); getchar();}
}
for(int i = 0 ; i < (N-norm-1)*(N-norm+1) ; i++){
//rows[i] *= mf[i];
rows[i] = rows_int[i] * mf_int[i];
}
for(int i = 0 ; i < (N-norm-1)*(N-norm+1) ; i++){
rows[i] = rows[i] * (RMAX / 255) * (MMAX / 255);
// printf("i: %d, rows[i]: %f\n", i, rows[i]);
}
// mul(rows_int, mf_int, rows_int, (N-norm-1), (N-norm+1));
for(int i = 0 ; i < (N-norm-1); i++){
for(int j = 0 ; j < (N-norm+1); j++){
rows[i*(N-norm+1)+j] = mf[i*(N-norm+1)+j] * rows[i*(N-norm+1)+j];
}
}
// ====== end quantize ==================================
// printf("length = %4d * %4d = %8d\n", N-norm-1, N-norm+1, (N-norm-1)*(N-norm+1));
cnt += (N-norm-1)*(N+norm+1);
ke = clk::now();
kus += std::chrono::duration_cast<std::chrono::nanoseconds>(ke-ks).count()/1000.0;
// ========== write back ==========
ws = clk::now();
for(int i = 0 ; i < (N-norm-1) ; i++){
for(int j = 0 ; j < (N-norm) ; j++){
tpu_A[i+norm+1][j+norm] -= rows[i*(N-norm+1)+j];
}
tpu_B[i+norm+1] -= rows[i*(N-norm+1)+N-norm];
}
we = clk::now();
wus += std::chrono::duration_cast<std::chrono::nanoseconds>(we-ws).count()/1000.0;
// for (row = norm + 1; row < N; row++) {
// multiplier = A[row][norm] / A[norm][norm];
// for (col = norm; col < N; col++) {
// A[row][col] -= A[norm][col] * multiplier;
// }
// B[row] -= B[norm] * multiplier;
// }
}
timing f_e = clk::now();
// printf("print A and B before back subsitution\n");
// print_inputs(tpu_A, tpu_B);
/* Back substitution */
timing b_s = clk::now();
for (row = N - 1; row >= 0; row--) {
tpu_X[row] = tpu_B[row];
for (col = N-1; col > row; col--) {
tpu_X[row] -= tpu_A[row][col] * tpu_X[col];
}
tpu_X[row] /= tpu_A[row][row];
}
timing b_e = clk::now();
double f_us = std::chrono::duration_cast<std::chrono::nanoseconds>(f_e-f_s).count()/1000.0;
double b_us = std::chrono::duration_cast<std::chrono::nanoseconds>(b_e-b_s).count()/1000.0;
//double f_us = std::chrono::duration_cast<std::chrono::nanoseconds>(f_e-f_s).count()/1000.0;
//double b_us = std::chrono::duration_cast<std::chrono::nanoseconds>(b_e-b_s).count()/1000.0;
// print_X(tpu_X);
printf("cnt = %lld\n", cnt);
printf("forward time: %12.3f (us)\n", f_us);
printf("->mf time: %12.3f (us)\n", mfus);
printf("->rows time: %12.3f (us)\n", rus);
printf("->kernel time: %12.3f (us)\n", kus);
printf("->WB time: %12.3f (us)\n", wus);
printf("backward time: %12.3f (us)\n", b_us);
printf("total time: %12.3f (us)\n", f_us+b_us);

// printf("cnt = %lld\n", cnt);
// printf("forward time: %12.3f (us)\n", f_us);
// printf("->mf time: %12.3f (us)\n", mfus);
// printf("->rows time: %12.3f (us)\n", rus);
// printf("->kernel time: %12.3f (us)\n", kus);
// printf("->WB time: %12.3f (us)\n", wus);
// printf("backward time: %12.3f (us)\n", b_us);
// printf("total time: %12.3f (us)\n", f_us+b_us);
}






Binary file modified app/rodinia_3.1/cuda/hotspot3D/3D
Binary file not shown.

0 comments on commit b6ce613

Please sign in to comment.