Skip to content

Commit

Permalink
fallback sync fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
hannorein committed Aug 22, 2024
1 parent fcee788 commit 20d7676
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 48 deletions.
8 changes: 5 additions & 3 deletions examples/whfast512_unittests/problem.c
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ int test_synchronization_fallback(){
reb_simulation_free(r512);

struct reb_simulation* r1 = reb_simulation_create_from_file("test.bin", 1);
struct reb_simulation* r2 = reb_simulation_create_from_file("test.bin", 2);
struct reb_simulation* r2 = reb_simulation_create_from_file("test.bin", 1);
assert(r1->ri_whfast512.is_synchronized == 0);
assert(r2->ri_whfast512.is_synchronized == 0);
reb_simulation_synchronize(r1);
Expand All @@ -322,9 +322,11 @@ int test_synchronization_fallback(){
assert(r2->ri_whfast512.is_synchronized == 1);

for (int i=0;i<r1->N;i++){
if (fabs(r1->particles[i].x - r2->particles[i].x)>1e-14){
double dx = fabs(r1->particles[i].x - r2->particles[i].x);
double dvx = fabs(r1->particles[i].x - r2->particles[i].x);
if (dx>1e-16 || dvx>1e-16){
printf("Accuracy not met in synchronization fallback test.\n");
printf("i=%i diff_x=%.16e\n",i,fabs(r1->particles[i].x - r2->particles[i].x));
printf("i=%i diff_x=%.16e diff_vx=%.16e\n",i,dx, dvx);
return 0;
}
}
Expand Down
152 changes: 107 additions & 45 deletions src/integrator_whfast512.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@
#include "integrator_whfast.h"
#include "integrator_whfast512.h"

#ifdef AVX512

#ifdef PROF
// Profiling counters
double walltime_interaction=0;;
Expand All @@ -48,6 +46,27 @@ double walltime_jump=0;
double walltime_com=0;
#endif

// Performs one full center of mass step (H_0)
static void reb_whfast512_com_step(struct reb_simulation* r, const double _dt){
#ifdef PROF
struct reb_timeval time_beginning;
gettimeofday(&time_beginning,NULL);
#endif
const unsigned int N_systems = r->ri_whfast512.N_systems;
for (int s=0; s<N_systems; s++){
r->ri_whfast512.p_jh0[s].x += _dt*r->ri_whfast512.p_jh0[s].vx;
r->ri_whfast512.p_jh0[s].y += _dt*r->ri_whfast512.p_jh0[s].vy;
r->ri_whfast512.p_jh0[s].z += _dt*r->ri_whfast512.p_jh0[s].vz;
}
#ifdef PROF
struct reb_timeval time_end;
gettimeofday(&time_end,NULL);
walltime_com += time_end.tv_sec-time_beginning.tv_sec+(time_end.tv_usec-time_beginning.tv_usec)/1e6;
#endif
}

#ifdef AVX512

// Fast inverse factorial lookup table
static const double invfactorial[35] = {1., 1., 1./2., 1./6., 1./24., 1./120., 1./720., 1./5040., 1./40320., 1./362880., 1./3628800., 1./39916800., 1./479001600., 1./6227020800., 1./87178291200., 1./1307674368000., 1./20922789888000., 1./355687428096000., 1./6402373705728000., 1./121645100408832000., 1./2432902008176640000., 1./51090942171709440000., 1./1124000727777607680000., 1./25852016738884976640000., 1./620448401733239439360000., 1./15511210043330985984000000., 1./403291461126605635584000000., 1./10888869450418352160768000000., 1./304888344611713860501504000000., 1./8841761993739701954543616000000., 1./265252859812191058636308480000000., 1./8222838654177922817725562880000000., 1./263130836933693530167218012160000000., 1./8683317618811886495518194401280000000., 1./295232799039604140847618609643520000000.};

Expand Down Expand Up @@ -830,25 +849,6 @@ static void reb_whfast512_jump_step(struct reb_simulation* r, const double _dt){
#endif
}

// Performs one full center of mass step (H_0)
static void reb_whfast512_com_step(struct reb_simulation* r, const double _dt){
#ifdef PROF
struct reb_timeval time_beginning;
gettimeofday(&time_beginning,NULL);
#endif
const unsigned int N_systems = r->ri_whfast512.N_systems;
for (int s=0; s<N_systems; s++){
r->ri_whfast512.p_jh0[s].x += _dt*r->ri_whfast512.p_jh0[s].vx;
r->ri_whfast512.p_jh0[s].y += _dt*r->ri_whfast512.p_jh0[s].vy;
r->ri_whfast512.p_jh0[s].z += _dt*r->ri_whfast512.p_jh0[s].vz;
}
#ifdef PROF
struct reb_timeval time_end;
gettimeofday(&time_end,NULL);
walltime_com += time_end.tv_sec-time_beginning.tv_sec+(time_end.tv_usec-time_beginning.tv_usec)/1e6;
#endif
}

// Precalculate various constants and put them in 512 bit vectors.
void static recalculate_constants(struct reb_simulation* r){
struct reb_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512);
Expand Down Expand Up @@ -1060,6 +1060,67 @@ void reb_integrator_whfast512_synchronize(struct reb_simulation* const r){
#endif // AVX512
}

// Convert democratic heliocentric coordinates to inertial coordinates
// Fallback for synchronization for when AVX512 is not available
static void democraticheliocentric_to_inertial_posvel_fallback(struct reb_simulation* r){
struct reb_integrator_whfast512* const ri_whfast512 = &(r->ri_whfast512);
struct reb_particle* particles = r->particles;
struct reb_particle_avx512* p_jh = ri_whfast512->p_jh;
const unsigned int N_systems = ri_whfast512->N_systems;
const unsigned int p_per_system = 8/N_systems;
const unsigned int N_per_system = r->N/N_systems;

double m[8];
double x[8];
double y[8];
double z[8];
double vx[8];
double vy[8];
double vz[8];
memcpy(&m, &p_jh->m, sizeof(double)*8);
memcpy(&x, &p_jh->x, sizeof(double)*8);
memcpy(&y, &p_jh->y, sizeof(double)*8);
memcpy(&z, &p_jh->z, sizeof(double)*8);
memcpy(&vx, &p_jh->vx, sizeof(double)*8);
memcpy(&vy, &p_jh->vy, sizeof(double)*8);
memcpy(&vz, &p_jh->vz, sizeof(double)*8);

for (unsigned s=0;s<N_systems;s++){
double x0s = 0;
double y0s = 0;
double z0s = 0;
double vx0s = 0;
double vy0s = 0;
double vz0s = 0;
for (unsigned int i=1; i<N_per_system; i++){
x0s += x[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
y0s += y[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
z0s += z[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
vx0s += vx[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
vy0s += vy[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
vz0s += vz[s*p_per_system+(i-1)] * m[s*p_per_system+(i-1)];
particles[s*N_per_system+i].vx = vx[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vx;
particles[s*N_per_system+i].vy = vy[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vy;
particles[s*N_per_system+i].vz = vz[s*p_per_system+(i-1)] + ri_whfast512->p_jh0[s].vz;
}
x0s /= ri_whfast512->p_jh0[s].m;
y0s /= ri_whfast512->p_jh0[s].m;
z0s /= ri_whfast512->p_jh0[s].m;
particles[s*N_per_system].x = ri_whfast512->p_jh0[s].x - x0s;
particles[s*N_per_system].y = ri_whfast512->p_jh0[s].y - y0s;
particles[s*N_per_system].z = ri_whfast512->p_jh0[s].z - z0s;
particles[s*N_per_system].vx = ri_whfast512->p_jh0[s].vx - vx0s;
particles[s*N_per_system].vy = ri_whfast512->p_jh0[s].vy - vy0s;
particles[s*N_per_system].vz = ri_whfast512->p_jh0[s].vz - vz0s;
for (unsigned int i=1; i<N_per_system; i++){
particles[s*N_per_system+i].x = x[s*p_per_system+(i-1)] + particles[s*N_per_system].x;
particles[s*N_per_system+i].y = y[s*p_per_system+(i-1)] + particles[s*N_per_system].y;
particles[s*N_per_system+i].z = z[s*p_per_system+(i-1)] + particles[s*N_per_system].z;
}
}
}


void reb_integrator_whfast512_synchronize_fallback(struct reb_simulation* const r){
// No AVX512 available
// Using WHFast as a workaround.
Expand All @@ -1070,34 +1131,35 @@ void reb_integrator_whfast512_synchronize_fallback(struct reb_simulation* const
const unsigned int N_systems = ri_whfast512->N_systems;
const unsigned int p_per_system = 8/N_systems;
const unsigned int N_per_system = r->N/N_systems;
double dt = r->dt;
for (int s=0; s<N_systems; s++){
struct reb_simulation* rs = reb_simulation_create();
rs->G = r->G;
for (unsigned int i=0;i<=N_per_system;i++){
reb_simulation_add(rs, r->particles[s*N_per_system+i]);
}
struct reb_integrator_whfast* const ri_whfast = &(rs->ri_whfast);
reb_integrator_whfast_init(rs);
ri_whfast->coordinates = REB_WHFAST_COORDINATES_DEMOCRATICHELIOCENTRIC;
ri_whfast->is_synchronized = 0;
ri_whfast->p_jh[0] = ri_whfast512->p_jh0[s];
double m0 = r->particles[s*N_per_system].m;
// 1/2 Kepler
for (unsigned int i=1;i<=N_per_system;i++){
ri_whfast->p_jh[i].m = ri_whfast512->p_jh->m[s*p_per_system+i-1];
ri_whfast->p_jh[i].x = ri_whfast512->p_jh->x[s*p_per_system+i-1];
ri_whfast->p_jh[i].y = ri_whfast512->p_jh->y[s*p_per_system+i-1];
ri_whfast->p_jh[i].z = ri_whfast512->p_jh->z[s*p_per_system+i-1];
ri_whfast->p_jh[i].vx = ri_whfast512->p_jh->vx[s*p_per_system+i-1];
ri_whfast->p_jh[i].vy = ri_whfast512->p_jh->vy[s*p_per_system+i-1];
ri_whfast->p_jh[i].vz = ri_whfast512->p_jh->vz[s*p_per_system+i-1];
struct reb_particle p = {0};
p.m = ri_whfast512->p_jh->m[s*p_per_system+i-1];
p.x = ri_whfast512->p_jh->x[s*p_per_system+i-1];
p.y = ri_whfast512->p_jh->y[s*p_per_system+i-1];
p.z = ri_whfast512->p_jh->z[s*p_per_system+i-1];
p.vx = ri_whfast512->p_jh->vx[s*p_per_system+i-1];
p.vy = ri_whfast512->p_jh->vy[s*p_per_system+i-1];
p.vz = ri_whfast512->p_jh->vz[s*p_per_system+i-1];
p.sim = r;
reb_whfast_kepler_solver(r, &p, m0, 0, dt/2.0);
ri_whfast512->p_jh->x[s*p_per_system+i-1] = p.x;
ri_whfast512->p_jh->y[s*p_per_system+i-1] = p.y;
ri_whfast512->p_jh->z[s*p_per_system+i-1] = p.z;
ri_whfast512->p_jh->vx[s*p_per_system+i-1] = p.vx;
ri_whfast512->p_jh->vy[s*p_per_system+i-1] = p.vy;
ri_whfast512->p_jh->vz[s*p_per_system+i-1] = p.vz;
}
reb_integrator_whfast_synchronize(rs);
for (unsigned int i=0;i<N_per_system;i++){
r->particles[s*N_per_system+i] = rs->particles[i];
r->particles[s*N_per_system+i].sim = r;
}
reb_simulation_free(rs);
}

if (ri_whfast512->recalculate_constants){
// Needed if no step has ever been done before (like SA)
recalculate_constants(r);
}
reb_whfast512_com_step(r, dt/2.0); // does extra work if N_systems>1, but not important.
democraticheliocentric_to_inertial_posvel_fallback(r);
ri_whfast512->is_synchronized = 1;
}
}
Expand Down

0 comments on commit 20d7676

Please sign in to comment.