// newkernel2.cu -- forked from WarwickDumas/FocusFusion2D
// Version 0.51
// Went over some attempts to ameliorate local accesses -- not very successful, basically.
// Correction in "Get Lap phi" routine.
// Version 0.52:
// Change Lap A, Grad A routines to load CHAR4 p_tri_per_neigh instead of loading data
// to interrogate neighbour periodic status.
// Change major area calc in the INNERMOST/OUTERMOST case.
// Note that central area calc does not look right.
// Version 0.53:
// Made changes to Reladvect_nT because it was taking the wrong connection for OUTERMOST.
// Changed flag tests & treatment of Inner verts in preceding routines.
// Version 0.54:
// Adjusted area calculations as written in spec.
// We set insulator-crossing tri minor area = 0, centroid on the insulator;
// frill area = 0, centroid on the boundary.
// Version 0.6:
// Debugging and making corrections.
// ==
// Version 0.7:
// Debugging ... there is a kernel launch failure for Antiadvect Adot
// Version 0.8:
// Change to set phi at innermost and outermost instead of trying to handle
// insulator special conditions.
// Am I sure? yes.
// PLAN:
// Allow that on the GPU we can move outside the domain; that's fine, since we do not change PB data.
// PB data will be changed only on the CPU.
// Nonetheless we kept the PBCTri lists, which can be updated (unlike has_periodic alone),
// in case of moving something to its image within the domain.
// NOTES:
// Ensure that outside the domain, n_major is recorded as 0
// Ensure that outside the domain, resistive_heat is recorded as 0
extern real FRILL_CENTROID_OUTER_RADIUS, FRILL_CENTROID_INNER_RADIUS;
// NB: the device code below uses __constant__ copies named FRILL_CENTROID_OUTER_RADIUS_d
// and FRILL_CENTROID_INNER_RADIUS_d, assumed to be declared elsewhere in this file.
__global__ void Kernel_CalculateTriMinorAreas_AndCentroids
(structural * __restrict__ p_info_sharing, // for vertex positions
LONG3 * __restrict__ p_tri_corner_index,
CHAR4 * __restrict__ p_tri_perinfo,
// Output:
f64 * __restrict__ p_area_minor,
f64_vec2 * __restrict__ p_tri_centroid)
{
__shared__ f64_vec2 shared_vertex_pos[SIZE_OF_MAJOR_PER_TRI_TILE];
long StartMajor = blockIdx.x*SIZE_OF_MAJOR_PER_TRI_TILE;
long EndMajor = StartMajor + SIZE_OF_MAJOR_PER_TRI_TILE;
long tid = threadIdx.x + blockIdx.x * blockDim.x;
// Note that we only do a fetch with the first half of threads:
if (threadIdx.x < SIZE_OF_MAJOR_PER_TRI_TILE)
{
structural info = p_info_sharing[blockIdx.x*SIZE_OF_MAJOR_PER_TRI_TILE + threadIdx.x];
// Well here is a major problem.
// mti.StartMajor is a separate value for every thread.
// How can we make it do a contiguous access?
// Suppose a 1:1 correspondence between minor blocks and major blocks...
// that is ONE way.
shared_vertex_pos[threadIdx.x] = info.pos;
// shared_shorts[threadIdx.x].flag = info.flag;
// shared_shorts[threadIdx.x].neigh_len = info.neigh_len;
// these were never used.
};
// If we make an extended array then we can always go through that code.
__syncthreads();
// Triangle area * 2/3 is area of minor cell.
// if (tid < Ntris) { // redundant test if we do it right
LONG3 corner_index = p_tri_corner_index[tid];
CHAR4 perinfo = p_tri_perinfo[tid];
// Do we ever require those and not the neighbours?
// Yes - this time for instance.
f64_vec2 pos1, pos2, pos3;
if ((corner_index.i1 >= StartMajor) && (corner_index.i1 < EndMajor))
{
pos1 = shared_vertex_pos[corner_index.i1-StartMajor];
} else {
// have to load in from global memory:
structural info = p_info_sharing[corner_index.i1];
pos1 = info.pos;
}
if ((corner_index.i2 >= StartMajor) && (corner_index.i2 < EndMajor))
{
pos2 = shared_vertex_pos[corner_index.i2-StartMajor];
} else {
// have to load in from global memory:
structural info = p_info_sharing[corner_index.i2];
pos2 = info.pos;
}
if ((corner_index.i3 >= StartMajor) && (corner_index.i3 < EndMajor))
{
pos3 = shared_vertex_pos[corner_index.i3-StartMajor];
} else {
// have to load in from global memory:
structural info = p_info_sharing[corner_index.i3];
pos3 = info.pos;
}
if (perinfo.per0+perinfo.per1+perinfo.per2 == 0) {
} else {
// In this case which ones are periodic?
// Should we just store per flags?
// How it should work:
// CHAR4 perinfo: periodic, per0, per1, per2;
if (perinfo.per0 == NEEDS_ANTI)
pos1 = Anticlock_rotate2(pos1);
if (perinfo.per0 == NEEDS_CLOCK)
pos1 = Clockwise_rotate2(pos1);
if (perinfo.per1 == NEEDS_ANTI)
pos2 = Anticlock_rotate2(pos2);
if (perinfo.per1 == NEEDS_CLOCK)
pos2 = Clockwise_rotate2(pos2);
if (perinfo.per2 == NEEDS_ANTI)
pos3 = Anticlock_rotate2(pos3);
if (perinfo.per2 == NEEDS_CLOCK)
pos3 = Clockwise_rotate2(pos3);
};
// Now we've got to decide what to do about minor cells near the edges.
// Edge of memory: triangles should not continue to the edge.
// Ultimately the edge of memory will be mostly within cathode rods and suchlike things.
// So we don't need to connect tri mesh beyond outermost row of vertices, even if we could.
// Realise that this edge cell crosses into the insulator and so should be assigned nv_r = 0
// We do not know what order the corners are given in.
// So use fabs:
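// Shoelace (trapezoid form): the cyclic sum 0.5*(x_i + x_{i+1})*(y_{i+1} - y_i)
// equals the signed area 0.5*[(x1*y2 - x2*y1) + (x2*y3 - x3*y2) + (x3*y1 - x1*y3)];
// fabs() discards the sign since the corner ordering is unknown.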
f64 area = fabs(0.5*( (pos2.x+pos1.x)*(pos2.y-pos1.y)
+ (pos3.x+pos2.x)*(pos3.y-pos2.y)
+ (pos1.x+pos3.x)*(pos1.y-pos3.y)
) );
f64_vec2 centroid = THIRD*(pos1+pos2+pos3);
if (area > 1.0e-3) {
printf("tri %d area %1.3E pos_x %1.6E %1.6E %1.6E \n"
" pos_y %1.6E %1.6E %1.6E \n",tid,area,
pos1.x,pos2.x,pos3.x,
pos1.y,pos2.y,pos3.y);
}
if (perinfo.flag == OUTER_FRILL)
{
f64_vec2 temp = 0.5*(pos1+pos2);
temp.project_to_radius(centroid, FRILL_CENTROID_OUTER_RADIUS_d);
area = 1.0e-14; // == 0 but tiny is less likely to cause 1/0
}
if (perinfo.flag == INNER_FRILL)
{
f64_vec2 temp = 0.5*(pos1+pos2);
temp.project_to_radius(centroid, FRILL_CENTROID_INNER_RADIUS_d);
area = 1.0e-14; // == 0 but tiny is less likely to cause 1/0
}
if (perinfo.flag == CROSSING_INS) {
f64_vec2 centroid2;
centroid.project_to_ins(centroid2);
centroid = centroid2;
// The major cells will abut the insulator.
// Only count the % of the area that is in the domain.
//bool b1, b2, b3;
//b1 = (pos1.x*pos1.x+pos1.y*pos1.y > INSULATOR_OUTER_RADIUS*INSULATOR_OUTER_RADIUS);
//b2 = (pos2.x*pos2.x+pos2.y*pos2.y > INSULATOR_OUTER_RADIUS*INSULATOR_OUTER_RADIUS);
//b3 = (pos3.x*pos3.x+pos3.y*pos3.y > INSULATOR_OUTER_RADIUS*INSULATOR_OUTER_RADIUS);
// Save ourselves some bother for now by setting area to be near 0.
// area = 1.0e-14;
// FOR NOW, legislate v = 0 in insulator-crossing tris.
// And so avoid having to do an awkward area calculation.
// Stick with correct area for tri as area variable.
// Possibly we never use 'area' except for domain-related matters; if that can be
// verified, then it's best to change to 'domain_intersection_area', however tricky.
}
p_tri_centroid[tid] = centroid;
p_area_minor[tid] = 0.666666666666667*area;
if (p_area_minor[tid] < 0.0) {
printf("kernel -- tid %d flag %d area %1.8E \n",tid,perinfo.flag,area);
};
// Perhaps we need instead to read data from neighbours to create tri minor area.
// Note that we subsequently CHANGED the nodes of minor mesh to be at averages
// so that we could average neatly A to edges. However, this means TWOTHIRDS*tri area
// is not an exact estimate.
}
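// A minimal host-side launch sketch (not from the original file). It assumes the
// arrays are padded so that the triangle count is an exact multiple of
// threadsPerTileMinor (the kernel has no tail guard -- see the "redundant test"
// comment above), and that each block of triangles corresponds 1:1 to a tile of
// SIZE_OF_MAJOR_PER_TRI_TILE vertices, as the shared fetch requires. The function
// and parameter names here are hypothetical.
void CallCalculateTriMinorAreasAndCentroids(long numTriangles,
	structural *p_info, LONG3 *p_tri_corner_index, CHAR4 *p_tri_perinfo,
	f64 *p_area_minor, f64_vec2 *p_tri_centroid)
{
	Kernel_CalculateTriMinorAreas_AndCentroids
		<<< numTriangles/threadsPerTileMinor, threadsPerTileMinor >>>
		(p_info, p_tri_corner_index, p_tri_perinfo,
		 p_area_minor, p_tri_centroid);
	// Surface launch failures immediately (cf. the v0.7 note above about a
	// kernel launch failure):
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
		printf("launch failed: %s \n", cudaGetErrorString(err));
}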
// FOR OUTERMOST:
//
//      | 4 \/ 3 |
//   pt0| ------- |pt3
//      0         2
//   pt1|    1    |pt2
// If we have an outer point,
// then the number of neighs is not the number of tris;
// SO EXPLOIT THIS
// Make sure that the omitted edge is the one that would go between the frill tris.
// This has to go into the reconstructing code that will
// generate the mesh with frill tris.
// ---------------------------------------------------------------------------------
// We'll want to calculate areas for triangles AND for central cells.
// But they require different codes so might as well be different kernels.
// Central area = sum of 1/6 of each neighbouring tri minor area.
// So now let's write central area calc routine:
// We should be passed the pointer to the start of the central minor area array.
__global__ void Kernel_CalculateCentralMinorAreas (
structural * __restrict__ p_info_sharing,
long * __restrict__ p_IndexTri,
f64 * __restrict__ p_triminor_area,
// Output:
f64 * __restrict__ p_area_minor
// pass output array starting from the central array start
)
{
__shared__ f64 shared_area[SIZE_OF_TRI_TILE_FOR_MAJOR];
__shared__ long Indextri[MAXNEIGH_d*threadsPerTileMajor];
// 2*8+12*4 = 64 bytes => room for 768 threads in 48K
long index = threadIdx.x + blockIdx.x * blockDim.x;
// Load in minor data: how to manage this? fill in with 2 strides; rely on contiguity.
long StartMinor = blockIdx.x*SIZE_OF_TRI_TILE_FOR_MAJOR; // Have to do this way.
// will this be recognised as contiguous access?
shared_area[threadIdx.x] =
p_triminor_area[blockIdx.x*SIZE_OF_TRI_TILE_FOR_MAJOR + threadIdx.x];
shared_area[blockDim.x + threadIdx.x] =
p_triminor_area[blockIdx.x*SIZE_OF_TRI_TILE_FOR_MAJOR + blockDim.x + threadIdx.x];
// Loaded in 2 times as many areas as central cells
__syncthreads();
//if (index < Nverts)
{
structural info = p_info_sharing[index];
memcpy(Indextri + MAXNEIGH_d*threadIdx.x,
p_IndexTri + MAXNEIGH_d*index,
MAXNEIGH_d*sizeof(long)); // MAXNEIGH_d should be chosen to be 12, for 1 full bus.
f64 sum = 0.0;
#pragma unroll 12
for (short iNeigh = 0; iNeigh < info.neigh_len; iNeigh++)
{
long indextri = Indextri[MAXNEIGH_d*threadIdx.x + iNeigh];
if ((indextri < StartMinor) || (indextri >= StartMinor+SIZE_OF_TRI_TILE_FOR_MAJOR))
{
sum += p_triminor_area[indextri];
} else {
sum += shared_area[indextri-StartMinor];
}
}
// Place the separation of central from edge cell at 1/3 along the line.
// Then 1/9 of the tri area is in the central shard and 2/3 is in the edge minor,
// so each tri contributes (1/9)/(2/3) = 1/6 of its minor area to the central cell.
p_area_minor[index] = sum*SIXTH;
if (sum < 0.0) {
printf("kerncentral -- tid %d area %1.2E \n",index,p_area_minor[index]);
};
};
// This may give funny results at the edges of memory, where we have added
// areas only of shards that formed part of triangles. But that is the expected
// behaviour here.
// If we had frills with repeated corners, then we get area 0 from them.
// If we used a special vertex then we had to do something special in setting area - perhaps we want it to = 0.
// BUG:
// This 1/6 only holds as long as we position the minor joins on the lines
// between vertices. If we use (1/3)(vertex + centroid 1 + centroid 2)
// then we should not be doing this area sum. Rather, given each pair of
// triangles, we can infer the area of the triangle that is part of central cell.
}
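// A hedged host-side sanity check on the 1/6-shard accounting above: each tri
// minor area (2/3 of the triangle) is claimed at 1/6 by each of its three
// corners, so the central total should be half the tri minor total (and
// central + minor = total triangle area) -- exactly so only away from the
// edges of memory and the frills, per the comments above. Function and
// parameter names are hypothetical.
void CheckAreaAccounting(f64 *p_triminor_area_dev, long numTris,
	f64 *p_central_area_dev, long numVerts)
{
	f64 *h_tri = new f64[numTris];
	f64 *h_cen = new f64[numVerts];
	cudaMemcpy(h_tri, p_triminor_area_dev, numTris*sizeof(f64), cudaMemcpyDeviceToHost);
	cudaMemcpy(h_cen, p_central_area_dev, numVerts*sizeof(f64), cudaMemcpyDeviceToHost);
	f64 minor_total = 0.0, central_total = 0.0;
	for (long i = 0; i < numTris; i++) minor_total += h_tri[i];
	for (long i = 0; i < numVerts; i++) central_total += h_cen[i];
	printf("minor %1.8E central %1.8E ratio %1.8E (expect ~0.5 away from edges)\n",
		minor_total, central_total, central_total/minor_total);
	delete[] h_tri; delete[] h_cen;
}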
__global__ void Kernel_CalculateMajorAreas (
structural * __restrict__ p_info,
f64_vec2 * __restrict__ p_tri_centroid,
long * __restrict__ pIndexTri,
char * __restrict__ pPBCtri,
// Output:
f64 * __restrict__ p_area
)
{
__shared__ f64_vec2 shared_centroids[SIZE_OF_TRI_TILE_FOR_MAJOR];
__shared__ long Indextri[MAXNEIGH_d*threadsPerTileMajor];
__shared__ char PBCtri[MAXNEIGH_d*threadsPerTileMajor];
long StartMinor = blockIdx.x*SIZE_OF_TRI_TILE_FOR_MAJOR; // Have to do this way.
shared_centroids[threadIdx.x] =
p_tri_centroid[blockIdx.x*SIZE_OF_TRI_TILE_FOR_MAJOR + threadIdx.x];
shared_centroids[blockDim.x + threadIdx.x] =
p_tri_centroid[blockIdx.x*SIZE_OF_TRI_TILE_FOR_MAJOR + blockDim.x + threadIdx.x];
//
//if (shared_centroids[threadIdx.x].x*shared_centroids[threadIdx.x].x
// + shared_centroids[threadIdx.x].y*shared_centroids[threadIdx.x].y > DOMAIN_OUTER_RADIUS*DOMAIN_OUTER_RADIUS)
// shared_centroids[threadIdx.x].project_to_radius(shared_centroids[threadIdx.x],DOMAIN_OUTER_RADIUS);
//
//if (shared_centroids[threadIdx.x].x*shared_centroids[threadIdx.x].x
// + shared_centroids[threadIdx.x].y*shared_centroids[threadIdx.x].y < INNER_A_BOUNDARY*INNER_A_BOUNDARY)
// shared_centroids[threadIdx.x].project_to_radius(shared_centroids[threadIdx.x],INNER_A_BOUNDARY);
//
//if (shared_centroids[blockDim.x + threadIdx.x].x*shared_centroids[blockDim.x + threadIdx.x].x
// + shared_centroids[blockDim.x + threadIdx.x].y*shared_centroids[blockDim.x + threadIdx.x].y > DOMAIN_OUTER_RADIUS*DOMAIN_OUTER_RADIUS)
// shared_centroids[blockDim.x + threadIdx.x].project_to_radius(shared_centroids[blockDim.x + threadIdx.x],DOMAIN_OUTER_RADIUS);
//
//if (shared_centroids[blockDim.x + threadIdx.x].x*shared_centroids[blockDim.x + threadIdx.x].x
// + shared_centroids[blockDim.x + threadIdx.x].y*shared_centroids[blockDim.x + threadIdx.x].y < INNER_A_BOUNDARY*INNER_A_BOUNDARY)
// shared_centroids[blockDim.x + threadIdx.x].project_to_radius(shared_centroids[blockDim.x + threadIdx.x],INNER_A_BOUNDARY);
//
__syncthreads();
long index = threadIdx.x + blockIdx.x * blockDim.x;
f64_vec2 uprev, unext;
//if (index < Nverts) { // redundant test, should be
structural info = p_info[index];
memcpy(Indextri + MAXNEIGH_d*threadIdx.x,
pIndexTri + MAXNEIGH_d*index,
MAXNEIGH_d*sizeof(long)); // MAXNEIGH_d should be chosen to be 12, for 1 full bus.
memcpy(PBCtri + MAXNEIGH_d*threadIdx.x,
pPBCtri + MAXNEIGH_d*index,
MAXNEIGH_d*sizeof(char)); // MAXNEIGH_d should be chosen to be 12, for 1 full bus.
f64 grad_x_integrated_x = 0.0; // shoelace accumulator: sum of 0.5*(x_next+x_prev)*(y_next-y_prev) = cell area
// Going to do shoelace on tri centroids which must be sorted anticlockwise.
// If we have a frilled e.g.OUTERMOST vertex, we shall find that
// info.neigh_len = 4 whereas tri_len = 5. Bear in mind...
if ((info.flag != OUTERMOST) && (info.flag != INNERMOST))
{
long indextri = Indextri[MAXNEIGH_d*threadIdx.x + info.neigh_len-1];
if ((indextri >= StartMinor) && (indextri < StartMinor + SIZE_OF_TRI_TILE_FOR_MAJOR)) {
uprev = shared_centroids[indextri-StartMinor];
} else {
uprev = p_tri_centroid[indextri];
}
char PBC = PBCtri[threadIdx.x*MAXNEIGH_d + info.neigh_len-1];
if (PBC == NEEDS_CLOCK) {
uprev = Clockwise_rotate2(uprev);
}
if (PBC == NEEDS_ANTI) {
uprev = Anticlock_rotate2(uprev);
}
short iNeigh;
for (iNeigh = 0; iNeigh < info.neigh_len; iNeigh++) // iNeigh is the anticlockwise one
{
indextri = Indextri[MAXNEIGH_d*threadIdx.x + iNeigh];
if ((indextri >= StartMinor) && (indextri < StartMinor + SIZE_OF_TRI_TILE_FOR_MAJOR)) {
unext = shared_centroids[indextri-StartMinor];
} else {
unext = p_tri_centroid[indextri];
}
char PBC = PBCtri[threadIdx.x*MAXNEIGH_d + iNeigh];
if (PBC == NEEDS_CLOCK) {
unext = Clockwise_rotate2(unext);
}
if (PBC == NEEDS_ANTI) {
unext = Anticlock_rotate2(unext);
}
// Get edge_normal.x and average x on edge
grad_x_integrated_x += //0.5*(unext.x+uprev.x)*edge_normal.x
0.5*(unext.x+uprev.x)*(unext.y-uprev.y);
uprev = unext;
};
} else {
// FOR THE OUTERMOST / INNERMOST CELLS :
// In this case we basically substituted tri_len for neigh_len:
// Also we project frill centroid on to the inner/outer radius.
long indextri = Indextri[MAXNEIGH_d*threadIdx.x + info.neigh_len];
if ((indextri >= StartMinor) && (indextri < StartMinor + SIZE_OF_TRI_TILE_FOR_MAJOR)) {
uprev = shared_centroids[indextri-StartMinor];
} else {
uprev = p_tri_centroid[indextri];
}
if (uprev.x*uprev.x + uprev.y*uprev.y > DOMAIN_OUTER_RADIUS*DOMAIN_OUTER_RADIUS)
uprev.project_to_radius(uprev,DOMAIN_OUTER_RADIUS);
if (uprev.x*uprev.x + uprev.y*uprev.y < INNER_A_BOUNDARY*INNER_A_BOUNDARY)
uprev.project_to_radius(uprev,INNER_A_BOUNDARY);
char PBC = PBCtri[threadIdx.x*MAXNEIGH_d + info.neigh_len];
if (PBC == NEEDS_CLOCK) uprev = Clockwise_rotate2(uprev);
if (PBC == NEEDS_ANTI) uprev = Anticlock_rotate2(uprev);
short iNeigh;
for (iNeigh = 0; iNeigh < info.neigh_len+1; iNeigh++) // iNeigh is the anticlockwise one
{
indextri = Indextri[MAXNEIGH_d*threadIdx.x + iNeigh];
if ((indextri >= StartMinor) && (indextri < StartMinor + SIZE_OF_TRI_TILE_FOR_MAJOR)) {
unext = shared_centroids[indextri-StartMinor];
} else {
unext = p_tri_centroid[indextri];
}
if (unext.x*unext.x + unext.y*unext.y > DOMAIN_OUTER_RADIUS*DOMAIN_OUTER_RADIUS)
unext.project_to_radius(unext,DOMAIN_OUTER_RADIUS);
if (unext.x*unext.x + unext.y*unext.y < INNER_A_BOUNDARY*INNER_A_BOUNDARY)
unext.project_to_radius(unext,INNER_A_BOUNDARY);
char PBC = PBCtri[threadIdx.x*MAXNEIGH_d + iNeigh];
if (PBC == NEEDS_CLOCK) unext = Clockwise_rotate2(unext);
if (PBC == NEEDS_ANTI) unext = Anticlock_rotate2(unext);
grad_x_integrated_x += 0.5*(unext.x+uprev.x)*(unext.y-uprev.y);
// We do have to count even the edge looking into the frills, or the polygon
// area would not be right.
uprev = unext;
};
};
p_area[index] = grad_x_integrated_x;
/*if ((index == 36685)) {
printf("index %d flag %d area %1.3E \n",
index, info.flag, grad_x_integrated_x);
long indextri = Indextri[MAXNEIGH_d*threadIdx.x + info.neigh_len-1];
if ((indextri >= StartMinor) && (indextri < StartMinor + SIZE_OF_TRI_TILE_FOR_MAJOR)) {
uprev = shared_centroids[indextri-StartMinor];
} else {
uprev = p_tri_centroid[indextri];
}
char PBC = PBCtri[threadIdx.x*MAXNEIGH_d + info.neigh_len-1];
if (PBC == NEEDS_CLOCK) {
uprev = Clockwise_rotate2(uprev);
}
if (PBC == NEEDS_ANTI) {
uprev = Anticlock_rotate2(uprev);
}
//printf("uprev %1.5E %1.5E ... %1.5E\n",uprev.x,uprev.y,uprev.modulus());
short iNeigh;
for (iNeigh = 0; iNeigh < info.neigh_len; iNeigh++) // iNeigh is the anticlockwise one
{
indextri = Indextri[MAXNEIGH_d*threadIdx.x + iNeigh];
if ((indextri >= StartMinor) && (indextri < StartMinor + SIZE_OF_TRI_TILE_FOR_MAJOR)) {
unext = shared_centroids[indextri-StartMinor];
} else {
unext = p_tri_centroid[indextri];
}
char PBC = PBCtri[threadIdx.x*MAXNEIGH_d + iNeigh];
if (PBC == NEEDS_CLOCK) {
unext = Clockwise_rotate2(unext);
}
if (PBC == NEEDS_ANTI) {
unext = Anticlock_rotate2(unext);
}
// printf("unext %1.5E %1.5E ... %1.5E \n",unext.x,unext.y,unext.modulus());
// Get edge_normal.x and average x on edge
grad_x_integrated_x += //0.5*(unext.x+uprev.x)*edge_normal.x
0.5*(unext.x+uprev.x)*(unext.y-uprev.y);
uprev = unext;
};
};*/
}
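// A possible refactor sketch (not applied above): the fetch-and-rotate pattern
// repeats four times in the kernel just finished. A __device__ helper would keep
// the shared-tile fast path and the PBC rotation in one place. The helper name
// is hypothetical; types and rotation functions are those used above.
__device__ __forceinline__ f64_vec2 FetchCentroidRotated(
	long indextri, long StartMinor, char PBC,
	const f64_vec2 *shared_centroids, const f64_vec2 *p_tri_centroid)
{
	f64_vec2 u;
	if ((indextri >= StartMinor) &&
		(indextri < StartMinor + SIZE_OF_TRI_TILE_FOR_MAJOR)) {
		u = shared_centroids[indextri - StartMinor]; // tile-local: shared memory
	} else {
		u = p_tri_centroid[indextri];                // off-tile: global memory
	}
	if (PBC == NEEDS_CLOCK) u = Clockwise_rotate2(u);
	if (PBC == NEEDS_ANTI)  u = Anticlock_rotate2(u);
	return u;
}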
__global__ void Kernel_Average_nT_to_tri_minors (
LONG3 * __restrict__ p_tri_corner_index,
CHAR4 * __restrict__ p_tri_perinfo,
nT * __restrict__ p_nT_neut,
nT * __restrict__ p_nT_ion,
nT * __restrict__ p_nT_elec,
// Output:
nT * __restrict__ p_minor_nT_neut,
nT * __restrict__ p_minor_nT_ion,
nT * __restrict__ p_minor_nT_elec
)
{
// Average by area so that we get the same total mass on minor mesh as on major.
// We would have to know the intersection area -- it is not always 1/3 of the triangle.
// Even the corner positions do not determine the intersection; we'd also need the
// neighbouring centroid.
__shared__ nT shared_nT[SIZE_OF_MAJOR_PER_TRI_TILE];
if (threadIdx.x < SIZE_OF_MAJOR_PER_TRI_TILE)
{
shared_nT[threadIdx.x] = p_nT_neut[blockIdx.x*SIZE_OF_MAJOR_PER_TRI_TILE + threadIdx.x];
}
__syncthreads();
long StartMajor = blockIdx.x*SIZE_OF_MAJOR_PER_TRI_TILE;
long EndMajor = StartMajor + SIZE_OF_MAJOR_PER_TRI_TILE;
long tid = threadIdx.x + blockIdx.x * blockDim.x;
nT nT1, nT2, nT3, nT_out;
LONG3 corner_index;
CHAR4 per_info = p_tri_perinfo[tid];
corner_index = p_tri_corner_index[tid];
// Do we ever require those and not the neighbours? Yes - this time for instance.
if ((corner_index.i1 >= StartMajor) && (corner_index.i1 < EndMajor))
{
nT1 = shared_nT[corner_index.i1-StartMajor];
} else {
// have to load in from global memory:
nT1 = p_nT_neut[corner_index.i1];
}
if ((corner_index.i2 >= StartMajor) && (corner_index.i2 < EndMajor))
{
nT2 = shared_nT[corner_index.i2-StartMajor];
} else {
nT2 = p_nT_neut[corner_index.i2];
}
if ((corner_index.i3 >= StartMajor) && (corner_index.i3 < EndMajor))
{
nT3 = shared_nT[corner_index.i3-StartMajor];
} else {
nT3 = p_nT_neut[corner_index.i3];
}
if (per_info.flag == CROSSING_INS) {
// An idea: Ensure that outside the domain, n is recorded as 0
int divide = 0; // counts corners with n > 0; a CROSSING_INS tri is assumed to have at least one
nT_out.n = 0.0;
nT_out.T = 0.0;
if (nT1.n > 0.0) {
nT_out.n += nT1.n;
nT_out.T += nT1.T;
divide++;
}
if (nT2.n > 0.0) {
nT_out.n += nT2.n;
nT_out.T += nT2.T;
divide++;
}
if (nT3.n > 0.0) {
nT_out.n += nT3.n;
nT_out.T += nT3.T;
divide++;
}
nT_out.n /= (real)divide;
nT_out.T /= (real)divide;
} else {
nT_out.n = THIRD*(nT1.n+nT2.n+nT3.n);
nT_out.T = THIRD*(nT1.T+nT2.T+nT3.T);
};
// SO THIS IS JUST ROUGH FOR NOW? What we wanted to do:
// Sum (Area_intersection * nT) / Sum(Area_intersection)
// You cannot get the intersection area just from knowing the corner positions.
// But do note that since centroid = (1/3)(sum of positions), (1/3) represents linear interpolation on a plane.
p_minor_nT_neut[tid] = nT_out;
__syncthreads();
// Now repeat same thing for each species
if (threadIdx.x < SIZE_OF_MAJOR_PER_TRI_TILE)
{
shared_nT[threadIdx.x] = p_nT_ion[blockIdx.x*SIZE_OF_MAJOR_PER_TRI_TILE + threadIdx.x];
}
__syncthreads();
//if (tid < Ntris) {
if ((corner_index.i1 >= StartMajor) && (corner_index.i1 < EndMajor))
{
nT1 = shared_nT[corner_index.i1-StartMajor];
} else {
// have to load in from global memory:
nT1 = p_nT_ion[corner_index.i1];
}
if ((corner_index.i2 >= StartMajor) && (corner_index.i2 < EndMajor))
{
nT2 = shared_nT[corner_index.i2-StartMajor];
} else {
nT2 = p_nT_ion[corner_index.i2];
}
if ((corner_index.i3 >= StartMajor) && (corner_index.i3 < EndMajor))
{
nT3 = shared_nT[corner_index.i3-StartMajor];
} else {
nT3 = p_nT_ion[corner_index.i3];
}
if (per_info.flag == CROSSING_INS) {
// An idea: Ensure that outside the domain, n is recorded as 0
int divide = 0;
nT_out.n = 0.0;
nT_out.T = 0.0;
if (nT1.n > 0.0) {
nT_out.n += nT1.n;
nT_out.T += nT1.T;
divide++;
}
if (nT2.n > 0.0) {
nT_out.n += nT2.n;
nT_out.T += nT2.T;
divide++;
}
if (nT3.n > 0.0) {
nT_out.n += nT3.n;
nT_out.T += nT3.T;
divide++;
}
nT_out.n /= (real)divide;
nT_out.T /= (real)divide;
} else {
nT_out.n = THIRD*(nT1.n+nT2.n+nT3.n);
nT_out.T = THIRD*(nT1.T+nT2.T+nT3.T);
};
p_minor_nT_ion[tid] = nT_out;
//};
__syncthreads();
if (threadIdx.x < SIZE_OF_MAJOR_PER_TRI_TILE)
{
shared_nT[threadIdx.x] = p_nT_elec[blockIdx.x*SIZE_OF_MAJOR_PER_TRI_TILE + threadIdx.x];
}
__syncthreads();
//if (tid < Ntris) {
if ((corner_index.i1 >= StartMajor) && (corner_index.i1 < EndMajor))
{
nT1 = shared_nT[corner_index.i1-StartMajor];
} else {
// have to load in from global memory:
nT1 = p_nT_elec[corner_index.i1];
}
if ((corner_index.i2 >= StartMajor) && (corner_index.i2 < EndMajor))
{
nT2 = shared_nT[corner_index.i2-StartMajor];
} else {
// have to load in from global memory:
nT2 = p_nT_elec[corner_index.i2];
}
if ((corner_index.i3 >= StartMajor) && (corner_index.i3 < EndMajor))
{
nT3 = shared_nT[corner_index.i3-StartMajor];
} else {
// have to load in from global memory:
nT3 = p_nT_elec[corner_index.i3];
}
if (per_info.flag == CROSSING_INS) {
// An idea: Ensure that outside the domain, n is recorded as 0
int divide = 0;
nT_out.n = 0.0;
nT_out.T = 0.0;
if (nT1.n > 0.0) {
nT_out.n += nT1.n;
nT_out.T += nT1.T;
divide++;
}
if (nT2.n > 0.0) {
nT_out.n += nT2.n;
nT_out.T += nT2.T;
divide++;
}
if (nT3.n > 0.0) {
nT_out.n += nT3.n;
nT_out.T += nT3.T;
divide++;
}
nT_out.n /= (real)divide;
nT_out.T /= (real)divide;
} else {
nT_out.n = THIRD*(nT1.n+nT2.n+nT3.n);
nT_out.T = THIRD*(nT1.T+nT2.T+nT3.T);
};
p_minor_nT_elec[tid] = nT_out;
// if frills have corners repeated, we end up with 1/3+2/3 --- should never matter.
// If special vertex, probably we set nT at special vertex to 0 so 1/3+1/3.
// nT should not be important at frills, as outermost points and innermost points
// do not need to know pressure.
}
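// Sketch of a helper that would collapse the three per-species blocks above into
// one call (illustrative, not the author's code; it assumes the same CROSSING_INS
// convention that n == 0 marks a corner outside the domain).
__device__ nT AverageCornernT(nT nT1, nT nT2, nT nT3, char flag)
{
	nT out;
	if (flag == CROSSING_INS) {
		int divide = 0;
		out.n = 0.0; out.T = 0.0;
		if (nT1.n > 0.0) { out.n += nT1.n; out.T += nT1.T; divide++; }
		if (nT2.n > 0.0) { out.n += nT2.n; out.T += nT2.T; divide++; }
		if (nT3.n > 0.0) { out.n += nT3.n; out.T += nT3.T; divide++; }
		if (divide > 0) { out.n /= (real)divide; out.T /= (real)divide; }
	} else {
		out.n = THIRD*(nT1.n + nT2.n + nT3.n);
		out.T = THIRD*(nT1.T + nT2.T + nT3.T);
	};
	return out;
}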
__global__ void Kernel_GetZCurrent(
CHAR4 * __restrict__ p_minor_info,
nT * __restrict__ p_minor_nT_ion,
nT * __restrict__ p_minor_nT_elec,
f64_vec3 * __restrict__ p_minor_v_ion,
f64_vec3 * __restrict__ p_minor_v_elec, // Not clear whether this should be nv or {n,v}; we use {n,v}.
f64 * __restrict__ p_area_minor,
f64 * __restrict__ p_summands )
{
__shared__ f64 intrablock[threadsPerTileMinor];
long tid = threadIdx.x + blockIdx.x * blockDim.x;
CHAR4 minor_info = p_minor_info[tid];
// This is called for all minor cells.
if ((minor_info.flag == DOMAIN_MINOR) || (minor_info.flag == OUTERMOST)) {
// Let DOMAIN_MINOR == DOMAIN_TRIANGLE ...
// And if you are DOMAIN_MINOR then n,v should be meaningful.
// Other possible values:
// OUTERMOST_CENTRAL == OUTERMOST, OUTER_FRILL, INNERMOST_CENTRAL, INNER_FRILL, INNER_TRIANGLE,
// CROSSING_INS, INNER_CENTRAL --
f64 n_ion = p_minor_nT_ion[tid].n;
f64 n_e = p_minor_nT_elec[tid].n;
f64_vec3 v_ion = p_minor_v_ion[tid];
f64_vec3 v_e = p_minor_v_elec[tid];
f64 Iz = q*(n_ion*v_ion.z - n_e*v_e.z)*p_area_minor[tid];
// Lots of bus loads, hopefully all contig.
intrablock[threadIdx.x] = Iz;
// HERE ASSUMED that area is calculated as DOMAIN INTERSECTION AREA
// if we start including nv in insulator-crossing tris.
} else {
intrablock[threadIdx.x] = 0.0;
};
__syncthreads();
// Now it's the aggregation:
int s = blockDim.x;
int k = s/2;
while (s != 1) {
if (threadIdx.x < k)
{
intrablock[threadIdx.x] += intrablock[threadIdx.x + k];
};
__syncthreads();
// Odd-size fixup: when s is odd, element [s-1] has no partner in the fold above,
// so thread k-1 adds it in.
if ((s % 2 == 1) && (threadIdx.x == k-1)){
intrablock[threadIdx.x] += intrablock[s-1];
};
// e.g. s == 81, k == 40: [39] += [80];
// otherwise we would only fold elements up to [79].
s = k;
k = s/2;
__syncthreads();
};
if (threadIdx.x == 0)
{
p_summands[blockIdx.x] = intrablock[0];
};
} // Doesn't matter much if this function is slow; I think it is only called for debugging anyway.
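// A hedged alternative for the block reduction above: if blockDim.x is a power
// of two, every fold is an exact halving and the odd-size fixup disappears.
// Sketch only; the in-kernel version above also copes with some non-power-of-two
// block sizes.
__device__ void BlockReduceSumPow2(f64 *intrablock)
{
	// intrablock points to shared memory, one f64 per thread;
	// blockDim.x is assumed to be a power of two.
	for (int k = blockDim.x >> 1; k > 0; k >>= 1) {
		if (threadIdx.x < k)
			intrablock[threadIdx.x] += intrablock[threadIdx.x + k];
		__syncthreads();
	}
	// intrablock[0] now holds the block total.
}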
__global__ void Kernel_Create_v_overall_and_newpos(
structural * __restrict__ p_info,
f64 const h,
nT * __restrict__ p_nT_neut,
nT * __restrict__ p_nT_ion,
nT * __restrict__ p_nT_elec,
f64_vec3 * __restrict__ p_v_neut,
f64_vec3 * __restrict__ p_v_ion,
f64_vec3 * __restrict__ p_v_elec,
// Output:
structural * __restrict__ p_info_out,
f64_vec2 * __restrict__ p_v_overall
)
{
long tid = threadIdx.x + blockIdx.x * blockDim.x;
//if (tid < Nverts)
structural info = p_info[tid];
f64_vec2 v_save;
if (info.flag == DOMAIN_VERTEX)
{
nT nT_neut, nT_ion, nT_elec;
f64_vec3 v_n, v_i, v_e;
nT_neut = p_nT_neut[tid];
nT_ion = p_nT_ion[tid];
nT_elec = p_nT_elec[tid];
v_n = p_v_neut[tid];
v_i = p_v_ion[tid];
v_e = p_v_elec[tid]; // expensive loads; can we avoid this function by combining it with something else?
f64_vec3 v_overall = (m_n*nT_neut.n*v_n + m_ion*nT_ion.n*v_i + m_e*nT_elec.n*v_e)/
(m_n*nT_neut.n + m_ion*nT_ion.n + m_e*nT_elec.n);
v_save.x = v_overall.x;
v_save.y = v_overall.y;
info.pos += h*v_save;
} else {
v_save.x = 0.0; v_save.y = 0.0;
}
p_v_overall[tid] = v_save;
p_info_out[tid] = info; // safer to do unnecessary write of whole object to get contiguity.
// can we do anything else with the data?
// We could transfer it to shared and do something with it. But there isn't anything.
}
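// For reference, the bulk velocity computed above is the mass-weighted mean
//   v_overall = (m_n n_n v_n + m_ion n_ion v_ion + m_e n_e v_e)
//             / (m_n n_n + m_ion n_ion + m_e n_e),
// i.e. total momentum density over total mass density; each DOMAIN_VERTEX is
// then advected by pos += h * (xy part of v_overall).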
__global__ void Kernel_Average_v_overall_to_tris (
LONG3 * __restrict__ p_tri_corner_index,
CHAR4 * __restrict__ p_tri_perinfo,
f64_vec2 * __restrict__ p_v_overall,
f64_vec2 * __restrict__ p_tri_centroid,
// Output:
f64_vec2 * __restrict__ p_minor_v_overall
)
{
__shared__ f64_vec2 shared_v[SIZE_OF_MAJOR_PER_TRI_TILE];
// Averaging as 1/3 to tris.
// Even corner positions do not tell us intersection. We'd have to know the neighbouring
// centroid also.
// Load to shared:
if (threadIdx.x < SIZE_OF_MAJOR_PER_TRI_TILE)
{
shared_v[threadIdx.x] = p_v_overall[blockIdx.x*SIZE_OF_MAJOR_PER_TRI_TILE + threadIdx.x];
}
// Let's hope it works with that sort of index. If it doesn't we're in a tough situation.
long StartMajor = blockIdx.x*SIZE_OF_MAJOR_PER_TRI_TILE;
long EndMajor = StartMajor + SIZE_OF_MAJOR_PER_TRI_TILE;
long tid = threadIdx.x + blockIdx.x * blockDim.x;
__syncthreads();
f64_vec2 v0, v1, v2, v_out;
LONG3 corner_index;
CHAR4 perinfo;
//if (tid < Ntris) { // redundant check
corner_index = p_tri_corner_index[tid];
perinfo = p_tri_perinfo[tid];
if ((perinfo.flag == DOMAIN_TRIANGLE) ||
(perinfo.flag == CROSSING_INS))
{
if ((corner_index.i1 >= StartMajor) && (corner_index.i1 < EndMajor))
{
v0 = shared_v[corner_index.i1-StartMajor];
} else {
// have to load in from global memory:
v0 = p_v_overall[corner_index.i1];
}
if ((corner_index.i2 >= StartMajor) && (corner_index.i2 < EndMajor))
{
v1 = shared_v[corner_index.i2-StartMajor];
} else {
v1 = p_v_overall[corner_index.i2];
}
if ((corner_index.i3 >= StartMajor) && (corner_index.i3 < EndMajor))
{
v2 = shared_v[corner_index.i3-StartMajor];
} else {
v2 = p_v_overall[corner_index.i3];
}
if (perinfo.per0+perinfo.per1+perinfo.per2 == 0) {
} else {
// In this case which ones are periodic?
// Should we just store per flags?
// How it should work:
// CHAR4 perinfo: periodic, per0, per1, per2;
if (perinfo.per0 == NEEDS_ANTI)
v0 = Anticlock_rotate2(v0);
if (perinfo.per0 == NEEDS_CLOCK)
v0 = Clockwise_rotate2(v0);
if (perinfo.per1 == NEEDS_ANTI)
v1 = Anticlock_rotate2(v1);
if (perinfo.per1 == NEEDS_CLOCK)
v1 = Clockwise_rotate2(v1);
if (perinfo.per2 == NEEDS_ANTI)
v2 = Anticlock_rotate2(v2);
if (perinfo.per2 == NEEDS_CLOCK)
v2 = Clockwise_rotate2(v2);
};
v_out = THIRD*(v0+v1+v2);
// For insulator triangle,
// we should take v_overall_r = 0
// because this tri centroid will remain on the insulator.
// It is OK to average with places that should have v_overall = 0.
if (perinfo.flag == CROSSING_INS)
{
f64_vec2 r = p_tri_centroid[tid]; // random accesses??
//f64_vec2 rhat = r/r.modulus();
// v_out = v_out - rhat*v_out.dot(rhat);
v_out = v_out - r*v_out.dot(r)/(r.x*r.x+r.y*r.y);
// Well this is kinda wrong.
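// What the line above does: subtract the radial component,
//   v_out -= r (v_out . r) / |r|^2,
// leaving only the tangential part, so the centroid of an insulator-crossing
// tri stays at the insulator radius.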
}
} else {
v_out.x = 0.0; v_out.y = 0.0;
}
p_minor_v_overall[tid] = v_out;
}
__global__ void Kernel_Average_nnionrec_to_tris
(
CHAR4 * __restrict__ p_tri_perinfo,
LONG3 * __restrict__ p_tri_corner_index,
nn * __restrict__ p_nn_ionrec,
nn * __restrict__ p_nn_ionrec_minor
)
{
__shared__ nn shared_nn[SIZE_OF_MAJOR_PER_TRI_TILE];
if (threadIdx.x < SIZE_OF_MAJOR_PER_TRI_TILE)
{
shared_nn[threadIdx.x] = p_nn_ionrec[blockIdx.x*SIZE_OF_MAJOR_PER_TRI_TILE + threadIdx.x];
}
// Let's hope it works with that sort of index. If it doesn't we're in a tough situation.
long StartMajor = blockIdx.x*SIZE_OF_MAJOR_PER_TRI_TILE;
long EndMajor = StartMajor + SIZE_OF_MAJOR_PER_TRI_TILE;
long tid = threadIdx.x + blockIdx.x * blockDim.x;
__syncthreads();
nn nn0, nn1, nn2;
LONG3 corner_index;
nn nn_out;
//if (tid < Ntris) { // redundant check - ?
corner_index = p_tri_corner_index[tid];
CHAR4 perinfo = p_tri_perinfo[tid];
if ((perinfo.flag == DOMAIN_TRIANGLE) ||
(perinfo.flag == CROSSING_INS))
{
if ((corner_index.i1 >= StartMajor) && (corner_index.i1 < EndMajor))
{
nn0 = shared_nn[corner_index.i1-StartMajor];
} else {
// have to load in from global memory:
nn0 = p_nn_ionrec[corner_index.i1];
}
if ((corner_index.i2 >= StartMajor) && (corner_index.i2 < EndMajor))
{
nn1 = shared_nn[corner_index.i2-StartMajor];
} else {
nn1 = p_nn_ionrec[corner_index.i2];
}
if ((corner_index.i3 >= StartMajor) && (corner_index.i3 < EndMajor))
{
nn2 = shared_nn[corner_index.i3-StartMajor];
} else {
nn2 = p_nn_ionrec[corner_index.i3];
}
nn_out.n_ionise = THIRD*(nn0.n_ionise+nn1.n_ionise+nn2.n_ionise);
nn_out.n_recombine = THIRD*(nn0.n_recombine+nn1.n_recombine+nn2.n_recombine);
if (perinfo.flag == CROSSING_INS)
{
// Ensure that we are not using silly data:
// assume n_ionise = 0 outside the domain, and that exactly one corner lies
// outside, so the 0.5 averages the two in-domain corners.
nn_out.n_ionise = 0.5*(nn0.n_ionise+nn1.n_ionise+nn2.n_ionise);
nn_out.n_recombine = 0.5*(nn0.n_recombine+nn1.n_recombine+nn2.n_recombine);
}
} else {
nn_out.n_ionise = 0.0;
nn_out.n_recombine = 0.0;
}
p_nn_ionrec_minor[tid] = nn_out;
}