-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrealtime_dada.cuh
285 lines (204 loc) · 10.1 KB
/
realtime_dada.cuh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
#include <cuda_runtime.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <thrust/system/cuda/execution_policy.h>
/* dada includes */
#include <algorithm>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <unistd.h>
#include <netdb.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <time.h>
#include "dada_client.h"
#include "dada_def.h"
#include "dada_hdu.h"
#include "multilog.h"
#include "ipcio.h"
#include "ipcbuf.h"
#include "dada_affinity.h"
#include "ascii_header.h"
#define IDX(t,f,n) (((t)*220*2048) + ((f)*220) + (n))
#define SZ 1024 //Size of the 2D FFT
#define N_ANTENNAS 10 //Number of Antennas (10)
#define N_BASELINES ((N_ANTENNAS - 1)*(N_ANTENNAS)/2) //Number of Baselines (45)
#define N_CORRELATION_PRODUCTS (N_ANTENNAS+N_BASELINES) //Number of Correlation Products (55)
#define N_STREAMS 5 //Number of Streams
#define N_TIMESTEPS_per_BLOCK 384 //How many timesteps in each block
#define N_BLOCKS_on_GPU 4 //Number of blocks on GPU
#define N_TIMESTEPS (N_TIMESTEPS_per_BLOCK * N_BLOCKS_on_GPU) //How much data to hold in GPU
#define N_DUMPPOINTS 1000 //How many points to identify in copy_if
// #define GPU 1 //Which GPU? 0-5
#define N_GPUS 5 //Total Number of GPUs
#define F_AVERAGING 2 //Number of channels averaged into each channel
#define ZERO_PT 300 //Start and end indexes are offset by 300 channels from absolute channel number
#define N_CHANNELS_per_GPU 250 //Number of physical (not averaged) channels on each GPU
// #define start_idx (GPU*N_CHANNELS_per_GPU) //Start index for GPU
// #define end_idx ((GPU+1)*N_CHANNELS_per_GPU) //Stop index for GPU
#define START_F 1.28 //Start frequency in GHz
#define END_F 1.53 //Stop frequency in GHz
#define TOT_CHANNELS 2048 //Total Number of channels
#define C_SPEED 299792458.0 //Speed of light
#define N_POL 2
#define N_COMPLEX 2
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
exit(code);
}
}
void gpuFFTchk(int errval){
if (errval != CUFFT_SUCCESS){
std::cerr << "Failed FFT call, error code " << errval << std::endl;
}
}
cudaEvent_t start;
cudaEvent_t stop;
#define START_TIMER() { \
gpuErrchk(cudaEventCreate(&start)); \
gpuErrchk(cudaEventCreate(&stop)); \
gpuErrchk(cudaEventRecord(start)); \
}
#define STOP_RECORD_TIMER(name) { \
gpuErrchk(cudaEventRecord(stop)); \
gpuErrchk(cudaEventSynchronize(stop)); \
gpuErrchk(cudaEventElapsedTime(&name, start, stop)); \
gpuErrchk(cudaEventDestroy(start)); \
gpuErrchk(cudaEventDestroy(stop)); \
}
/* Usage as defined by dada example code */
void usage()
{
fprintf (stdout,
"dsaX_imager [options]\n"
" -c core bind process to CPU core\n"
" -k key [default dada]\n"
" -h print usage\n");
}
/*cleanup as defined by dada example code */
void dsaX_dbgpu_cleanup (dada_hdu_t * in, multilog_t * log) {
if (dada_hdu_unlock_read (in) < 0){
multilog(log, LOG_ERR, "could not unlock read on hdu_in\n");
}
dada_hdu_destroy (in);
}
__device__
void cx_mult (cufftComplex &a, cufftComplex b){
cufftComplex temp;
temp.x = a.x*b.x-a.y*b.y;
temp.y = a.x*b.y + a.y*b.x;
a.x = temp.x; a.y = temp.y;
}
template<typename T>
struct greater_than_val : public thrust::unary_function<T, bool> {
T val;
greater_than_val(T val_) : val(val_) {}
inline __host__ __device__
bool operator()(T x) const {
return x.x > val.x;
}
};
using thrust::make_zip_iterator;
using thrust::make_tuple;
using thrust::make_counting_iterator;
__global__
void grid(cufftComplex* A, float* data, int time_step, int gpu, float DM){
int start_idx = ((gpu)*N_CHANNELS_per_GPU);
int end_idx = ((gpu+1)*N_CHANNELS_per_GPU);
int baseline_number = blockIdx.x;
// int tid = threadIdx.x;
// int baseline_number = blockDim.x * blockIdx.x + threadIdx.x; // since baseline is fastest index in data
// enables coalesced memory accesses
/*
Convert from Triangular linear index to (i,j) matrix indicies
Derived using polynomial fits
*/
// int i = floor((19.0 - sqrt(361.0 - 8.0*baseline_number))/2.0); // equvalent to sm
// int j = baseline_number - static_cast<int>(-i*i/2.0 + 8.5*i - 1); // larger number
int j = floor((1.0 + sqrt(8.0*baseline_number + 1.0))/2.0);
int i = baseline_number - (j * j - j)/2;
int baseline_idx = baseline_number + j; //How to get index for data matrix from baseline number
//Shifting triangular index up by 1
// printf("(i,j) = (%d, %d), baseline_number = %d, baseline_idx = %d\n", i,j,baseline_number, baseline_idx);
/* Antenna Locations, Generated by AntennaLocations.py */
float pos[N_ANTENNAS][3] = {{ 2.32700165e-05, 4.43329978e+02, 1.02765904e-05},
{ 2.21098953e-05, 4.36579978e+02, 9.64180626e-06},
{ 1.14039406e-05, 3.42179978e+02, 4.94604756e-06},
{-1.14039406e-05, 1.53679978e+02, -4.94604756e-06},
{ 1.90319971e+02, -4.48715200e-05, 1.72737512e-05},
{ 1.83569971e+02, -4.37387873e-05, 1.57832354e-05},
{-1.03180029e+02, -1.03154892e-05, -2.81970909e-05},
{-1.76180029e+02, -1.63615256e-06, -3.96178686e-05},
{-2.03330030e+02, 1.63615256e-06, -4.39237587e-05},
{-9.91080054e+02, -2.16759905e+02, -4.00017642e+00}};
// int ant_order[] = {1, 4, 5, 8, 6, 9, 2, 10, 3, 7};
int ant_order[] = {3, 7, 2, 10, 1, 4, 5, 8, 6, 9};
float pos_extent = 1299.54002797;
float wavelength;
int x_pos, y_pos;
// to be passed as arguments later
// int idx_usable_start = 300;// the start of the "usable" frequencies is the 300th total channel
// int idx_usable_end = 1800;
// float f_tot_start = 1.23;
// float f_tot_end = 1.58;
//int n_chunk_idx = idx_chunk_end - idx_chunk_start; // number of chunk frequency indicies
// int n_usable_idx = idx_usable_end - idx_usable_start; // number of usable frequency indicies
// int n_tot_idx = 2048; // number of total frequency indicies
int n_channels_in_data = (1.0*N_CHANNELS_per_GPU)/F_AVERAGING; // Number of channels present in data
int n_floats_per_baseline = N_POL*N_COMPLEX*n_channels_in_data; // Number of floats between baseline indicies
float bw_per_channel = (START_F - END_F)/static_cast<float>(TOT_CHANNELS); // Bandwidth in Each channel
float time_per_step = 1.048576; // ms
float f_low = (END_F - (ZERO_PT + end_idx) * bw_per_channel); // Lowest Frequency in dataset
float time_delay_low_freq = 4.148*DM/(f_low*f_low); // DM time delay for lowest frequency (max delay)
int px_per_res_elem = 3;
float UV_to_index = ((float) SZ)/(1.0*pos_extent)* C_SPEED/(END_F*1E9)/px_per_res_elem; //convert U,V to index
cufftComplex A_POL_DATUM, B_POL_DATUM; // Data variables for real, imag
float f_ghz; // Frequency of analysis
float avg_mid_pt = (F_AVERAGING - 1.0)/2.0; // Frequency Center of averaged band
int DM_offset_index, DM_offset_time_steps;
int f = threadIdx.x; //index of the frequency which this thread is analyzing
//float f_hig = (END_F - (ZERO_PT + start_idx) * bw_per_channel); // chunk start <=> high freq
// int n_floats_per_freq = (N_BASELINES + N_ANTENNAS)*n_pol*n_complex; // number of floats per frequency (220)
while (f < n_channels_in_data){ //we have data for N_CHANNELS_per_GPU/F_AVERAGING frequencies
/* Calculate Current Frequency from f, the frequency index,
Frequencies are calculated backwards from the highest frequency*/
f_ghz = (END_F - static_cast<float>(f*F_AVERAGING + avg_mid_pt + start_idx + ZERO_PT)*bw_per_channel);
/* Calculate Dispersion delay between current frequency, lowest frequency */
DM_offset_time_steps = floor( (time_delay_low_freq - 4.148*DM/(f_ghz*f_ghz)) / time_per_step);
/* Calculate the index offset */
DM_offset_index = ((time_step - DM_offset_time_steps)%N_TIMESTEPS) * n_floats_per_baseline * (N_BASELINES + N_ANTENNAS);
wavelength = C_SPEED/(1E9*f_ghz);
// load the A pol data into memory
A_POL_DATUM.x = data[DM_offset_index + (baseline_idx * n_floats_per_baseline) + (f*N_POL*N_COMPLEX) ];
A_POL_DATUM.y = data[DM_offset_index + (baseline_idx * n_floats_per_baseline) + (f*N_POL*N_COMPLEX) + 1];
// load the B pol data into memory
B_POL_DATUM.x = data[DM_offset_index + (baseline_idx * n_floats_per_baseline) + (f*N_POL*N_COMPLEX) + 2];
B_POL_DATUM.y = data[DM_offset_index + (baseline_idx * n_floats_per_baseline) + (f*N_POL*N_COMPLEX) + 3];
// correct each baseline
// cx_mult(A_POL_DATUM, A_cal[baseline_number*n_usable_idx + f]);
// cx_mult(B_POL_DATUM, B_cal[baseline_number*n_usable_idx + f]);
// printf("baseline_number*n_usable_idx + f = %d => %f + %fj vs. %f + %fj\n", baseline_number*n_usable_idx + f, A_POL_DATUM.x, A_POL_DATUM.y, tempc.x, tempc.y);
// printf("(%d, %d) => %f + %fj vs. %f + %fj\n", i, j , A_POL_DATUM.x, A_POL_DATUM.y, tempc.x, tempc.y);
//atomic add to baseline 1
x_pos = static_cast<int>(floor(UV_to_index*(pos[ant_order[i]-1][0]-pos[ant_order[j]-1][0])/wavelength)) % SZ;
y_pos = static_cast<int>(floor(UV_to_index*(pos[ant_order[i]-1][1]-pos[ant_order[j]-1][1])/wavelength)) % SZ;
atomicAdd(&(A[x_pos*SZ+y_pos].x), A_POL_DATUM.x + B_POL_DATUM.x);
atomicAdd(&(A[x_pos*SZ+y_pos].y), -A_POL_DATUM.y - B_POL_DATUM.y);
//atomic add to baseline 2
x_pos = static_cast<int>(floor(UV_to_index*(pos[ant_order[j]-1][0]-pos[ant_order[i]-1][0])/wavelength)) % SZ;
y_pos = static_cast<int>(floor(UV_to_index*(pos[ant_order[j]-1][1]-pos[ant_order[i]-1][1])/wavelength)) % SZ;
atomicAdd(&(A[x_pos*SZ+y_pos].x), A_POL_DATUM.x + B_POL_DATUM.x);
atomicAdd(&(A[x_pos*SZ+y_pos].y), A_POL_DATUM.y + B_POL_DATUM.y);
f += blockDim.x;
}
}