-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathnfa.cpp
260 lines (225 loc) · 8.97 KB
/
nfa.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
#include <math.h>
#include <float.h>
/*----------------------------------------------------------------------------*/
/** Doubles relative error factor
*/
#define RELATIVE_ERROR_FACTOR 100.0
/*----------------------------------------------------------------------------*/
/** Compare doubles by relative error.
The resulting rounding error after floating point computations
depend on the specific operations done. The same number computed by
different algorithms could present different rounding errors. For a
useful comparison, an estimation of the relative rounding error
should be considered and compared to a factor times EPS. The factor
should be related to the cumulated rounding error in the chain of
computation. Here, as a simplification, a fixed factor is used.
*/
static int double_equal(double a, double b)
{
double abs_diff,aa,bb,abs_max;
/* trivial case */
if( a == b ) return true;
abs_diff = fabs(a-b);
aa = fabs(a);
bb = fabs(b);
abs_max = aa > bb ? aa : bb;
/* DBL_MIN is the smallest normalized number, thus, the smallest
number whose relative error is bounded by DBL_EPSILON. For
smaller numbers, the same quantization steps as for DBL_MIN
are used. Then, for smaller numbers, a meaningful "relative"
error should be computed by dividing the difference by DBL_MIN. */
if( abs_max < DBL_MIN ) abs_max = DBL_MIN;
/* equal if relative error <= factor x eps */
return (abs_diff / abs_max) <= (RELATIVE_ERROR_FACTOR * DBL_EPSILON);
}
/*----------------------------------------------------------------------------*/
/*----------------------------- NFA computation ------------------------------*/
/*----------------------------------------------------------------------------*/
/*----------------------------------------------------------------------------*/
/** Computes the natural logarithm of the absolute value of
the gamma function of x using the Lanczos approximation.
See http://www.rskey.org/gamma.htm
The formula used is
@f[
\Gamma(x) = \frac{ \sum_{n=0}^{N} q_n x^n }{ \Pi_{n=0}^{N} (x+n) }
(x+5.5)^{x+0.5} e^{-(x+5.5)}
@f]
so
@f[
\log\Gamma(x) = \log\left( \sum_{n=0}^{N} q_n x^n \right)
+ (x+0.5) \log(x+5.5) - (x+5.5) - \sum_{n=0}^{N} \log(x+n)
@f]
and
q0 = 75122.6331530,
q1 = 80916.6278952,
q2 = 36308.2951477,
q3 = 8687.24529705,
q4 = 1168.92649479,
q5 = 83.8676043424,
q6 = 2.50662827511.
*/
static double log_gamma_lanczos(double x)
{
static double q[7] = { 75122.6331530, 80916.6278952, 36308.2951477,
8687.24529705, 1168.92649479, 83.8676043424,
2.50662827511 };
double a = (x+0.5) * log(x+5.5) - (x+5.5);
double b = 0.0;
int n;
for(n=0;n<7;n++)
{
a -= log( x + (double) n );
b += q[n] * pow( x, (double) n );
}
return a + log(b);
}
/*----------------------------------------------------------------------------*/
/** Computes the natural logarithm of the absolute value of
the gamma function of x using Windschitl method.
See http://www.rskey.org/gamma.htm
The formula used is
@f[
\Gamma(x) = \sqrt{\frac{2\pi}{x}} \left( \frac{x}{e}
\sqrt{ x\sinh(1/x) + \frac{1}{810x^6} } \right)^x
@f]
so
@f[
\log\Gamma(x) = 0.5\log(2\pi) + (x-0.5)\log(x) - x
+ 0.5x\log\left( x\sinh(1/x) + \frac{1}{810x^6} \right).
@f]
This formula is a good approximation when x > 15.
*/
static double log_gamma_windschitl(double x)
{
return 0.918938533204673 + (x-0.5)*log(x) - x
+ 0.5*x*log( x*sinh(1/x) + 1/(810.0*pow(x,6.0)) );
}
/*----------------------------------------------------------------------------*/
/** Computes the natural logarithm of the absolute value of
the gamma function of x. When x>15 use log_gamma_windschitl(),
otherwise use log_gamma_lanczos().
*/
#define log_gamma(x) ((x)>15.0?log_gamma_windschitl(x):log_gamma_lanczos(x))
/*----------------------------------------------------------------------------*/
/** Size of the table to store already computed inverse values.
*/
#define TABSIZE 100000
/*----------------------------------------------------------------------------*/
/** Computes -log10(NFA).
NFA stands for Number of False Alarms:
@f[
\mathrm{NFA} = NT \cdot B(n,k,p)
@f]
- NT - number of tests
- B(n,k,p) - tail of binomial distribution with parameters n,k and p:
@f[
B(n,k,p) = \sum_{j=k}^n
\left(\begin{array}{c}n\\j\end{array}\right)
p^{j} (1-p)^{n-j}
@f]
The value -log10(NFA) is equivalent but more intuitive than NFA:
- -1 corresponds to 10 mean false alarms
- 0 corresponds to 1 mean false alarm
- 1 corresponds to 0.1 mean false alarms
- 2 corresponds to 0.01 mean false alarms
- ...
Used this way, the bigger the value, better the detection,
and a logarithmic scale is used.
@param n,k,p binomial parameters.
@param logNT logarithm of Number of Tests
The computation is based in the gamma function by the following
relation:
@f[
\left(\begin{array}{c}n\\k\end{array}\right)
= \frac{ \Gamma(n+1) }{ \Gamma(k+1) \cdot \Gamma(n-k+1) }.
@f]
We use efficient algorithms to compute the logarithm of
the gamma function.
To make the computation faster, not all the sum is computed, part
of the terms are neglected based on a bound to the error obtained
(an error of 10% in the result is accepted).
*/
static double NFA(int n, int k, double p, double logNT)
{
static double inv[TABSIZE]; /* table to keep computed inverse values */
double tolerance = 0.1; /* an error of 10% in the result is accepted */
double log1term,term,bin_term,mult_term,bin_tail,err,p_term;
int i;
if (p<=0)
p=0.000000000000000000000000000001;
if (p>=1)
p=0.999999999999999999999999999999;
/* check parameters */
if( n<0 || k<0 || k>n || p<=0.0 || p>=1.0 ) {
//fprintf(stderr,"nfa: wrong n, k or p values. (%d , %d , %f)",n,k,p);
//exit(-1);
}
/* trivial cases */
if( n==0 || k==0 ) return -logNT;
if( n==k ) return -logNT - (double) n * log10(p);
/* probability term */
p_term = p / (1.0-p);
/* compute the first term of the series */
/*
binomial_tail(n,k,p) = sum_{i=k}^n bincoef(n,i) * p^i * (1-p)^{n-i}
where bincoef(n,i) are the binomial coefficients.
But
bincoef(n,k) = gamma(n+1) / ( gamma(k+1) * gamma(n-k+1) ).
We use this to compute the first term. Actually the log of it.
*/
log1term = log_gamma( (double) n + 1.0 ) - log_gamma( (double) k + 1.0 )
- log_gamma( (double) (n-k) + 1.0 )
+ (double) k * log(p) + (double) (n-k) * log(1.0-p);
term = exp(log1term);
/* in some cases no more computations are needed */
if( double_equal(term,0.0) ) /* the first term is almost zero */
{
if( (double) k > (double) n * p ) /* at begin or end of the tail? */
return -log1term / M_LN10 - logNT; /* end: use just the first term */
else
return -logNT; /* begin: the tail is roughly 1 */
}
/* compute more terms if needed */
bin_tail = term;
for(i=k+1;i<=n;i++)
{
/*
As
term_i = bincoef(n,i) * p^i * (1-p)^(n-i)
and
bincoef(n,i)/bincoef(n,i-1) = n-1+1 / i,
then,
term_i / term_i-1 = (n-i+1)/i * p/(1-p)
and
term_i = term_i-1 * (n-i+1)/i * p/(1-p).
1/i is stored in a table as they are computed,
because divisions are expensive.
p/(1-p) is computed only once and stored in 'p_term'.
*/
bin_term = (double) (n-i+1) * ( i<TABSIZE ?
( inv[i]!=0.0 ? inv[i] : ( inv[i] = 1.0 / (double) i ) ) :
1.0 / (double) i );
mult_term = bin_term * p_term;
term *= mult_term;
bin_tail += term;
if(bin_term<1.0)
{
/* When bin_term<1 then mult_term_j<mult_term_i for j>i.
Then, the error on the binomial tail when truncated at
the i term can be bounded by a geometric series of form
term_i * sum mult_term_i^j. */
err = term * ( ( 1.0 - pow( mult_term, (double) (n-i+1) ) ) /
(1.0-mult_term) - 1.0 );
/* One wants an error at most of tolerance*final_result, or:
tolerance * abs(-log10(bin_tail)-logNT).
Now, the error that can be accepted on bin_tail is
given by tolerance*final_result divided by the derivative
of -log10(x) when x=bin_tail. that is:
tolerance * abs(-log10(bin_tail)-logNT) / (1/bin_tail)
Finally, we truncate the tail if the error is less than:
tolerance * abs(-log10(bin_tail)-logNT) * bin_tail */
if( err < tolerance * fabs(-log10(bin_tail)-logNT) * bin_tail ) break;
}
}
return -log10(bin_tail) - logNT;
}