-
Notifications
You must be signed in to change notification settings - Fork 0
/
direct_invsqrt.cpp
425 lines (339 loc) · 13.6 KB
/
direct_invsqrt.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
// The DIRECT-1 algorithm for minimizing a multidimensional function.
// (searching for parameters of the inverse sqrt hack).
// Implementation 2010-02-13 by Jan "Rrrola" Kadlec. Public Domain.
#include <cmath>
#include <vector>
#include <queue>
#include <algorithm>
using std::max;
using std::min;
using std::sort;
// A point with fixed dimension.
template<int DIM>
struct Point
{ double x[DIM];
double& operator[](int i) { return x[i]; }
double operator[](int i) const { return x[i]; }
// Construct a Point from DIM doubles (hack!).
Point(){}
Point(double x0) { x[0] = x0; }
Point(double x0, double x1) { x[0] = x0; x[1] = x1; }
Point(double x0, double x1, double x2) { x[0] = x0; x[1] = x1; x[2] = x2; }
Point(double x0, double x1, double x2, double x3) { x[0] = x0; x[1] = x1; x[2] = x2; x[3] = x3; }
Point(double x0, double x1, double x2, double x3, double x4) { x[0] = x0; x[1] = x1; x[2] = x2; x[3] = x3; x[4] = x4; }
};
// A boolean point with fixed dimension.
template<int DIM>
struct BoolPoint
{ bool x[DIM];
bool& operator[](int i) { return x[i]; }
bool operator[](int i) const { return x[i]; }
};
// A rectangle with an evaluated center - the basis of DIRECT.
// Allowed side lengths are (1/3)^level and (1/3)^(level+1).
template<int DIM>
struct Rectangle
{ double f; // value at x[]
Point<DIM> x; // center coords
BoolPoint<DIM> l; // true = longer side, false = shorter side
};
template<int DIM>
bool operator<(Rectangle<DIM> const& a, Rectangle<DIM> const& b) { return a.f>b.f; }
// For sorting coordinates.
struct Index { double f; int index; };
bool operator<(Index const& a, Index const& b) { return a.f<b.f; }
//////////////////////////////////////////////////////////////////////////////
// Aggressive DIRECT-1.
// We have a pool of (hyper-)rectangles with evaluated f() in the center.
// Rectangles have a "level" according to their size, every side can be
// "long" or "short" (long = 3*short).
// Initial rectangle = whole area.
// 1. Choose rectangles to subdivide. For each:
// 2. Evaluate f(center +- side_length[d]/3) for all "long" dimensions.
// 3. Split central rectangle along all dimensions, starting from dims with
// the best f().
// 4. Repeat.
// "-1" means that level = log(longest side). The original version
// with level = ||center-vertex|| is too slow in aggressive mode.
// "Aggressive" means that we split the top rectangle in every level
// without regard to its f(). This hurts only speed and simplifies
// everything (the original version needs to compute the convex hull).
// - My improvement 1: just use a priority queue ;-)
// - My improvement 2: split only levels<threshold. Threshold changes when
// solution didn't improve for a long time (Iterative Deepening FTW)
template<int DIM>
struct Direct
{ std::vector< Point<DIM> > scale; // prec tables of length[d] * (1/3)^x
std::vector< std::priority_queue< Rectangle<DIM> > > rectangle_set;
double (*eval)(Point<DIM>&); int evals;
double best_f; Point<DIM> best; int best_level;
int maxlevel;
void call_eval(Rectangle<DIM>& r, int level)
{ r.f = eval(r.x);
if (best_f > r.f) best_f = r.f, best = r.x, best_level = level;
evals++;
}
// Add a new rectangle into the pool.
void add(Rectangle<DIM> const& r, int level)
{ if (rectangle_set.size() < level+1) rectangle_set.resize(level+1);
rectangle_set[level].push(r);
}
// Initialize DIRECT with a rectangle covering the whole range.
Direct(Point<DIM> const& l, Point<DIM> const& u, double (*eval)(Point<DIM>&))
: best_f(HUGE_VAL), evals(0), eval(eval), rectangle_set(1), scale(1), maxlevel(4)
{ Rectangle<DIM> init;
for (int i=0; i<DIM; i++) init.x[i] = 0.5 * (l[i]+u[i]); // initial point
for (int i=0; i<DIM; i++) init.l[i] = true;
for (int i=0; i<DIM; i++) scale[0][i] = (u[i]-l[i]) * (1.0/3.0);
call_eval(init, 0);
add(init, 0);
}
// Pick the best rectangle in a given level and split it
void split_top(int level)
{ if (level > maxlevel) return;
Rectangle<DIM> r = rectangle_set[level].top();
rectangle_set[level].pop();
// Do shift tables need precomputation?
if (scale.size() < level+1)
{ int size = scale.size();
scale.resize(level+1);
for ( ; size < scale.size(); ++size)
for (int i=0; i<DIM; ++i) scale[size][i] = scale[size-1][i] * (1.0/3.0);
}
Rectangle<DIM> pos[DIM], neg[DIM];
Index w[DIM];
// For every long dimension of the rectangle:
// evaluate f(center+e_d) and f(center-e_d)
int count=0;
for (int i=0; i<DIM; i++) if (r.l[i])
{ pos[i].x = r.x; pos[i].x[i] += scale[level][i]; call_eval(pos[i], level);
neg[i].x = r.x; neg[i].x[i] -= scale[level][i]; call_eval(neg[i], level);
w[count].f = min(pos[i].f, neg[i].f);
w[count].index = i;
count++;
}
// Repeat:
// Split the rectangle containing the center into thirds along
// the dimension with lowest f(center+-e_d).
// Add the resulting rectangles into the pool
// (level = old level except for the last split, which gets level+1).
std::sort(w, w+count);
int i,j;
BoolPoint<DIM> l = r.l;
for (j=0; j<count-1; j++)
{ i = w[j].index;
l[i] = false;
pos[i].l = l; add(pos[i], level);
neg[i].l = l; add(neg[i], level);
}
for (i=0; i<DIM; i++) l[i] = true;
i = w[j].index;
pos[i].l = l; add(pos[i], level+1);
neg[i].l = l; add(neg[i], level+1);
r.l = l; add(r, level+1);
}
// Aggressive split: split the best rectangle in every level.
void iter(void)
{ for (int i=min(rectangle_set.size()-1, (unsigned)maxlevel); i>=0; --i)
if (!rectangle_set[i].empty())
split_top(i);
}
};
//////////////////////////////////////////////////////////////////////////////
#include "common.h"
union Float { float f; uint32 u; };
uint32 c1;
float c2, c3;
float inv_sqrt(float x)
{ Float y = {x};
y.u = c1 - (y.u >> 1);
return c2 * y.f * (c3 - x * y.f * y.f);
}
double relative_error(float f)
{
return fabs(1.0 - (inv_sqrt(f) * sqrt(f)));
}
/////////////////////////////////////////
#define INITIAL_STEP 512
#define THOROUGH_PART 48
// Find the max relative error of a parameter set.
// Uses rough seeding at first, then scans around the highest seeds.
double max_relative_error(uint32 _c1, float _c2, float _c3)
{ c1 = _c1; c2 = _c2; c3 = _c3;
Index errors[16777216/INITIAL_STEP];
Float x;
double maxerror = -HUGE_VAL;
// initial seeding
int i,j;
for (i=0, x.f=1.0, x.u+=INITIAL_STEP/2; x.f<4.0f; x.u+=INITIAL_STEP, i++)
{ double error = relative_error(x.f);
errors[i].f = -error;
errors[i].index = x.u;
}
// find the seeds belonging to the highest (1/THOROUGH_PART)
std::partial_sort(errors, errors+16777216/INITIAL_STEP/THOROUGH_PART, errors+16777216/INITIAL_STEP);
// and thoroughly scan their surroundings
for (i=0; i<16777216/INITIAL_STEP/THOROUGH_PART; i++)
{ for (x.u = errors[i].index-INITIAL_STEP/2; x.u < errors[i].index+INITIAL_STEP/2; x.u++)
{ double error = relative_error(x.f);
maxerror = max(maxerror, error);
}
}
return maxerror;
}
// Find the max relative error of a parameter set. Thorough scan (slow!).
double max_relative_error_exact(uint32 _c1, float _c2, float _c3)
{ c1 = _c1; c2 = _c2; c3 = _c3;
Float x;
double maxerror = -HUGE_VAL;
for (x.f=1.0; x.f<4.0f; x.u++)
{ double error = relative_error(x.f);
maxerror = max(maxerror, error);
}
return maxerror;
}
// Find the avg squared relative error of a parameter set.
// Simple rough seeding works well.
double avg_relative_error(uint32 _c1, float _c2, float _c3)
{ c1 = _c1; c2 = _c2; c3 = _c3;
Float x;
double sumerror = 0, sumerror2 = 0;
int cnt = 0;
for (x.f=1.0; x.f<4.0f; x.u+=79)
{ double error = relative_error(x.f);
sumerror += error*error;
cnt++;
if ((cnt & 0xfff) == 0) sumerror2 += sumerror, sumerror = 0;
}
return (sumerror2 + sumerror) / cnt;
}
// Find the avg squared relative error of a parameter set. Thorough scan (slow!).
double avg_relative_error_exact(uint32 _c1, float _c2, float _c3)
{ c1 = _c1; c2 = _c2; c3 = _c3;
Float x;
double sumerror = 0, sumerror2 = 0;
int cnt = 0;
for (x.f=1.0; x.f<4.0f; x.u++)
{ double error = relative_error(x.f);
sumerror += error*error;
cnt++;
if ((cnt & 0xfff) == 0) sumerror2 += sumerror, sumerror = 0;
}
return (sumerror2 + sumerror) / cnt;
}
#define DIMENSION 3
double eval(Point<DIMENSION>& p)
{
return max_relative_error((uint32)p[0], p[1], p[2]);
// return avg_relative_error((uint32)p[0], p[1], p[2]);
}
#include "random.c"
int main(void)
{
// min avg (relative_error^2)
// Point<DIMENSION> low(0x5F1ACF80+randomAB(-10,10), 0.75589+random11()*0.000005, 2.27823+random11()*0.000005),
// high(0x5F1AD100+randomAB(-10,10), 0.75592+random11()*0.000005, 2.27833+random11()*0.000005);
// min max relative_error
// Point<DIMENSION> low(0x5F1FF800+randomAB(-100,100), 0.7036+random11()*0.00002, 2.3901+random11()*0.00002),
// high(0x5F200800+randomAB(-100,100), 0.7043+random11()*0.00002, 2.3884+random11()*0.00002);
Point<DIMENSION> low(0x5F601000+randomAB(-100,100), 0.24845+random11()*0.00002, 4.7825+random11()*0.00002),
high(0x5F602800+randomAB(-100,100), 0.24855+random11()*0.00002, 4.7840+random11()*0.00002);
Direct<DIMENSION> D(low, high, eval);
FILE* fw = fopen("direct_invsqrt.log.txt", "a"); if (!fw) return -1;
int no_changes = 0, solution_age = 0, last_chars = 0, solution_chars = 0;
double last_best = HUGE_VAL;
for (int i=1; ; i++)
{
char s[128];
snprintf(s, sizeof(s), "%d:%d:%d%n", i, D.evals, D.maxlevel, &last_chars);
printf("\r%s", s);
last_best = D.best_f;
D.iter();
// No improvement?
if (last_best == D.best_f)
{
// If a few iters were unsuccessful, eval exactly and save current parameter set.
if (++solution_age == 4)
// { double error = avg_relative_error_exact((uint32)D.best[0], D.best[1], D.best[2]);
{ double error = max_relative_error_exact((uint32)D.best[0], D.best[1], D.best[2]);
printf("\r"); for (int j=0; j<solution_chars-5; j++) printf(" ");
printf("exact f=%.17g\n", error);
fprintf(fw, "0x%8X %.9g %.9g %.17g\n", (uint32)D.best[0], D.best[1], D.best[2], error);
}
// Random search: permute params with gaussian noise in several variance scales.
// If better solution found, restart DIRECT centered on it (merged).
// Beware: this can crawl out of the original bounds! I liked that, but
// you might want to all a test to prevent that.
if (solution_age >= 16 && solution_age % 16 == 0)
{ uint k1 = D.best[0]; Float k2 = {D.best[1]}, k3 = {D.best[2]};
uint l1; Float l2, l3;
double min_e = D.best_f;
bool changed_h = false;
for (int h=256; h!=0; )
{ bool changed = false;
printf("$");
for (int i=0; i<8; i++)
{ l1 = k1 + randomNormal(0, h);
l2.u = k2.u + randomNormal(0, h);
l3.u = k3.u + randomNormal(0, h);
double e = max_relative_error(l1, l2.f, l3.f);
if (min_e > e)
{ D.best_f = min_e = e, changed = changed_h = true;
D.best[0] = k1 = l1, D.best[1] = l2.f, k2.u = l2.u, D.best[2] = l3.f, k3.u = l3.u;
printf("\r Rf=%.12g x=[0x%8X %.9g %.9g]\n", min_e, k1, k2.f, k3.f);
}
}
if (!changed) h>>=1;
}
if (changed_h)
{ Rectangle<3> r = {min_e};
r.x[0] = k1; r.x[1] = k2.f; r.x[2] = k3.f;
r.l[0] = r.l[1] = r.l[2] = true;
D.add(r, 1);
solution_age = no_changes = 0;
}
}
// If not improved in two random searches, try local deterministic grid search.
if (solution_age == 33)
{ uint k1 = D.best[0]; Float k2 = {D.best[1]}, k3 = {D.best[2]};
uint l1; Float l2, l3;
double min_e = D.best_f;
bool changed_h = false;
for (int h=256; h!=0; )
{ bool changed = false;
printf("#");
for (int i=-h; i<=h; i+=h)
for (int j=-h; j<=h; j+=h)
for (int k=-h; k<=h; k+=h)
{ l1 = k1 + i;
l2.u = k2.u + j;
l3.u = k3.u + k;
double e = max_relative_error(l1, l2.f, l3.f);
if (min_e > e)
{ D.best_f = min_e = e, changed = changed_h = true;
D.best[0] = k1 = l1, D.best[1] = l2.f, k2.u = l2.u, D.best[2] = l3.f, k3.u = l3.u;
printf("\r Lf=%.12g x=[0x%8X %.9g %.9g]\n", min_e, k1, k2.f, k3.f);
}
}
if (!changed) h>>=1;
}
if (changed_h)
{ Rectangle<3> r = {min_e};
r.x[0] = k1; r.x[1] = k2.f; r.x[2] = k3.f;
r.l[0] = r.l[1] = r.l[2] = true;
D.add(r, 1);
solution_age = no_changes = 0;
}
}
// If no improvement for a long time, increase max rectangle level to split.
if (++no_changes >= 30 && D.maxlevel < 10) D.maxlevel++, no_changes = 0;
}
// Found better solution! Joy!
else
{ solution_age = no_changes = 0;
solution_chars = last_chars;
printf(" f=%.12g x=[0x%8X %.9g %.9g] l=%d\n", D.best_f, (uint32)D.best[0], D.best[1], D.best[2], D.best_level);
}
}
return 0;
}