-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfloyd_sm_row.cu
104 lines (94 loc) · 2.95 KB
/
floyd_sm_row.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes CUDA
#include <cuda_runtime.h>
extern "C" {
#include "MatUtil.h"
}
__device__
int Min(int a, int b) { return a < b ? a : b; }
__global__
void SharedMemoryFloydWarshall(int* mat, int k, int N) {
__shared__ int dist_i_k;
int i = threadIdx.x + blockIdx.x * blockDim.x;
int j = threadIdx.y + blockIdx.y * blockDim.y;
if (i < N && j < N) {
int dist_i_j = mat[i*N + j];
int dist_k_j = mat[k*N + j];
if (threadIdx.y == 0) {
dist_i_k = mat[i*N + k];
}
__syncthreads();
if (dist_i_k != -1 && dist_k_j != -1) {
int new_dist = dist_i_k + dist_k_j;
if (dist_i_j != -1) {
new_dist = Min(new_dist, dist_i_j);
}
mat[i*N + j] = new_dist;
}
}
}
void SharedMemoryFloydWarshallDriver(int* mat, int N, dim3 thread_per_block) {
int* cuda_mat;
int size = sizeof(int) * N * N;
cudaMalloc((void**) &cuda_mat, size);
cudaMemcpy(cuda_mat, mat, size, cudaMemcpyHostToDevice);
dim3 num_block(ceil(1.0*N/thread_per_block.x),
ceil(1.0*N/thread_per_block.y));
for (int k = 0; k < N; ++k) {
SharedMemoryFloydWarshall<<<num_block, thread_per_block>>>(cuda_mat, k, N);
}
cudaMemcpy(mat, cuda_mat, size, cudaMemcpyDeviceToHost);
cudaFree(cuda_mat);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
if(argc != 4) {
printf("Usage: test {N} {run_sequential_check: 'T' or 'F'} {segment_size}\n");
exit(-1);
}
char run_sequential_check = argv[2][0];
int segment_size = atoi(argv[3]);
dim3 thread_per_block(1, segment_size);
//generate a random matrix.
size_t N = atoi(argv[1]);
int *mat = (int*)malloc(sizeof(int)*N*N);
GenMatrix(mat, N);
//compute your results
int *result = (int*)malloc(sizeof(int)*N*N);
memcpy(result, mat, sizeof(int)*N*N);
//replace by parallel algorithm
SharedMemoryFloydWarshallDriver(result, N, thread_per_block);
//compare your result with reference result
if (run_sequential_check == 'T') {
int *ref = (int*)malloc(sizeof(int)*N*N);
memcpy(ref, mat, sizeof(int)*N*N);
ST_APSP(ref, N);
if(CmpArray(result, ref, N*N))
printf("Your result is correct.\n");
else
printf("Your result is wrong.\n");
#ifdef PRINT_MATRIX
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
printf("%d ", ref[i*N+j]);
}
printf("\n");
}
#endif
}
#ifdef PRINT_MATRIX
printf("==RESULT==\n");
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
printf("%d ", result[i*N+j]);
}
printf("\n");
}
#endif
}