-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsesame.c
3851 lines (3331 loc) · 109 KB
/
sesame.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// REFERENCES:
// [1] Calibrating seed-based heuristics to map short DNA reads
// https://doi.org/10.1101/619155
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/types.h>
// SECTION 1. MACROS //
#define LIBNAME "sesame"
#define VERSION "0.9 05-12-2019"
// Overwrite at compile time for other values.
#ifndef DEFAULT_EPSILON
// Tolerance on the estimates (execpt MCMC).
#define DEFAULT_EPSILON 0.01
#endif
#ifndef DEFAULT_SAMPLINGS
// Number of MCMC samples.
#define DEFAULT_SAMPLINGS 1000000
#endif
#ifndef HSIZE
// Number of MCMC samples.
#define HSIZE 4096
#endif
#define SUCCESS 1
#define FAILURE 0
// Modulo operation that returns a positive number.
#define modulo(x, y) ((((int)x) % ((int)y) + ((int)y)) % ((int)y))
// Check if a polynomial is null.
#define iszero(poly) \
((poly) == NULL || ((poly)->monodeg == 0 && (poly)->coeff[0] == 0))
// NOTE 1.1. Error handling. //
//
// All the functions that may fail at runtime -- mostly the function that
// directly or indirectly call 'malloc()' -- have a 'in_case_of_failure'
// section after the final 'return' statement. This section contains the
// instructions for handling errors and exceptions without crashing. In
// general, this means freeing the allocated memory and returning a
// special value that will inform the caller that something went wrong,
// so that the error can be propagated.
//
// The macro below simplifies error handling for memory errors. See
// function 'warning()' defined in section 4.1.
#define handle_memory_error(x) \
do { \
if ((x) == NULL) { \
warning("memory_error", __func__, __LINE__); \
goto in_case_of_failure; \
} \
} while (0)
// SECTION 2. DECLARATIONS //
// SECTION 2.1 TYPE DECLARATIONS //
// NOTE 2.1.1 'trunc_pol_t' and monomials //
//
// The most important type declared here is the truncated polynomial
// 'trunc_pol_t'. It consists of a 'monodeg' member (of type 'size_t')
// and a 'coeff' member (variable-size array of type 'double').
//
// The member 'monodeg' encodes whether the polynomial is a monomial and
// if so, what its degree is. This is important to gain speed while
// multiplying truncated polynomials. If 'monodeg' is a number from 0 to
// 'K' (the maximum degree of every truncated polynomial), the polynomial
// is a monomial of degree 'monodeg'. If the value is greater than 'K',
// the polynomial is not a monomial.
//
// The member 'coeff' contains the coefficients of the truncated
// polynomial. The encoding is natural in the sense that 'coeff[i]' is
// the coefficient of degree 'i'. If the polynomial is a monomial (i.e.
// 'monodeg' is less than or equal to 'K'), then only the entry
// 'coeff[monodeg]' may be non-zero. If this is not the case (which
// should not happen for the sake of consistency), the terms of degree
// different from 'monodeg' are ignored.
//
// It will not cause problems to have a truncated polynomial with only
// one non-zero coefficient and where 'monodeg' is set to a value higher
// than 'K'. The multiplications are somewhat slower, but the results
// will be consistent.
//
// The product of two monomials is a monomial. The addition of two
// monomials is a monomial only if the monomials have the same degree.
//
// The null struct of type 'trunc_pol_t' (i.e. with all members set to
// 0) is a monomial of degree 0 whose coefficient is 0 (it is thus a
// proper definition of the null polynomial). For addition and
// multiplication of polynomials, a pointer of 'trunc_pol_t' that is set
// to 'NULL' is considered to be a pointer to the null polynomial.
typedef struct matrix_t matrix_t; // Transfer matrices.
typedef struct rec_t rec_t; // Hash table records.
typedef struct trunc_pol_t trunc_pol_t; // Truncated polynomials.
struct rec_t {
int n; // Skipping.
double u; // Divergence rate.
int N; // Number of duplicates (log2).
double* prob; // False positive probabilities.
rec_t* next; // Next record if any.
};
struct matrix_t {
const int dim; // Column / row number.
trunc_pol_t* term[]; // Terms of the matrix.
};
struct trunc_pol_t {
int degree; // Degree of the polynomial.
int monodeg; // Monomial (see NOTE 2.1.1).
double coeff[]; // Terms of the polynomial.
};
// SECTION 2.2 FUNCTION DECLARATIONS //
// Mersenne twister functions (see license below) //
void seedMT(unsigned long);
double runifMT(void);
// Function from the R code (see license below) //
int rbinom(int, double);
// SECTION 3. GLOBALS //
// Precision options.
static double EPSILON = DEFAULT_EPSILON;
static long int MCMC_SAMPLINGS = DEFAULT_SAMPLINGS;
// NOTE 3.1 Static parameters. //
//
// The static parameters of the seeding problem are 'G' (a.k.a gamma, the
// minimum seed size), 'K' (the maximum degree of all truncated
// polynomial, standing for the longest possible sequencing read) and 'P'
// (the probability of a sequencing error). They define the properties of
// the sequencing instrument and the choice of a minimum size by the user.
// The other parameters (actual read size, number of duplicates and rate
// of divergence) can depend on the read under consideration and are
// called "dynamic". Static parameters are set by initializing the library
// with 'sesame_set_static_params()'.
static int G = 0; // Minimum size of MEM seeds.
static int K = 0; // Max degree of 'trunc_pol_t' (read size).
static double P = 0.0; // Probability of a read error.
static int KSZ = 0; // Memory size of the 'trunc_pol_t' struct.
// Define 6 generic hash tables to store the probabilities. Hashes 'H'
// are private; hash 'Y1' is public.
static rec_t* H1N[HSIZE] = {0}; // 'auto_exact_seed_nullp()'
static rec_t* H1O[HSIZE] = {0}; // 'auto_exact_seed_offp()'
static rec_t* H2N[HSIZE] = {0}; // 'auto_skip_seed_nullp()'
static rec_t* H2O[HSIZE] = {0}; // 'auto_skip_seed_offp()'
static rec_t* H3N[HSIZE] = {0}; // 'auto_mem_seed_nullp()'
static rec_t* H3O[HSIZE] = {0}; // 'auto_mem_seed_offp()'
static rec_t* Y1[HSIZE] = {0}; // generic "prob_store" hash
static trunc_pol_t* TEMP = NULL; // For matrix multipliciation.
static double* XI; // Precomputed values of 'xi'.
static double* XIc; // Precomputed values of '1-xi'.
static double* ETA; // Precomputed values of 'eta'.
static double* ETAc; // Precomputed values of '1-eta'.
static int PARAMS_INITIALIZED = 0; // Parameters bookkeeping
// NOTE 3.2 Internal errors. //
//
// The following should never appear. However, if the code is run in an
// environment that is very different from the ones where it was tested,
// unpredictable things can happen and we want to give the users a handle
// to contact us so that we can fix the problem.
const char internal_error[] =
"internal error (please contact [email protected])";
// SECTION 4. FUNCTION DEFINITIONS //
// SECTION 4.1. GET AND SET FUNCTIONS (VISIBLE) //
void
sesame_set_epsilon(double p) {
EPSILON = p < 0 ? 0.0 : p;
}
void
sesame_set_mcmcsamplings(long int n) {
MCMC_SAMPLINGS = n;
}
double*
to_array_of_double(trunc_pol_t* x)
// Transforms a 'trunc_pol_t' to an array of double.
{
return (double*)memmove(x, x->coeff, (K + 1) * sizeof(double));
}
// SECTION 4.2. ERROR HANDLING FUNCTIONS //
void
warning( // PRIVATE
const char* msg, // Message to display
const char* function, // Calling function
const int line // Line in the function
)
// SYNOPSIS:
// Print warning message to stderr. This function is used to report
// errors using the error macros (see macros defined in section 1 and
// see NOTE 1.1 about error handling).
{
fprintf(stderr, "[%s] error in function `%s' (line %d): %s\n", LIBNAME,
function, line, msg);
}
// SECTION 4.3. MATHEMATICAL FUNCTIONS //
double
omega( // PRIVATE
const int m, // See equation (10)
const double u, // Sequence divergence
const int N // Number of duplicates
)
// Convenience function described in equation (10) of reference [1].
{
if (m > N) {
return 0.0;
}
double log_N_choose_m =
lgamma(N + 1) - lgamma(m + 1) - lgamma(N - m + 1);
return exp(log_N_choose_m + (N - m) * log(1 - u / 3) + m * log(u / 3));
}
double
xi( // PRIVATE
const int j, // See equation (13)
const double u // Sequence divergence
)
// Convenience function described in equation (13) of reference [1].
{
return 1 - pow(1 - u, j);
}
double
psi( // PRIVATE
const int j, // See equation (19)
const int m, // See equation (19)
const int n, // See equation (19)
const int r, // See equation (19)
const double u, // Sequence divergence
const int N // Number of duplicates
)
// Convenience function described in equation (19) of reference [1].
{
if (N < (m + r))
return 0.0;
// Take out the terms of the sum that do not depend on 'q'.
const double lC = lgamma(m + 1) + lgamma(N - m - r + 1);
const double lx = log(xi(j, u));
double val = 0.0;
int topq = n - r < m ? n - r : m;
for (int q = 0; q <= topq; q++) {
val += exp((n - r - q) * lx - lgamma(q + 1) - lgamma(m - q + 1) -
lgamma(n - r - q + 1) - lgamma(N - m - n + q + 1) + lC);
}
return val;
}
double
zeta( // PRIVATE
const int j, // See equation (19)
const int m, // See equation (19)
const int n, // See equation (19)
const double u, // Sequence divergence
const int N // Number of duplicates
)
// Convenience function described in equation (19) of reference [1].
{
if (N < m)
return 0.0;
// Take out the terms of the sum that do not depend on 'r'.
const double lC = lgamma(N - m + 1) + lgamma(n + 1) + lgamma(N - n + 1) -
lgamma(N + 1);
const double lu = j * log(1 - u);
double val = 0.0;
int topr = N - m < n ? N - m : n;
for (int r = 1; r <= topr; r++) {
val += psi(j, m, n, r, u, N) *
exp(r * lu - lgamma(r + 1) - lgamma(N - m - r + 1) + lC);
}
return val;
}
// SECTION 4.4. INITIALIZATION, BOOKKEEPING AND CLEAN UP FUNCTIONS //
// SECTION 4.4.1. DESTRUCTORS //
void
destroy_mat( // PRIVATE
matrix_t* mat // Matrix to destroy
)
// SYNOPSIS:
// Free the memory for all entries of the matrix passed as argument.
{
// Do nothing if 'mat' is the NULL pointer.
if (mat == NULL)
return;
size_t nterms = (mat->dim) * (mat->dim);
for (size_t i = 0; i < nterms; i++)
free(mat->term[i]);
free(mat);
}
void
clean_hash( // PRIVATE
rec_t** hash // Table to empty
)
// SYNOPSIS:
// Free the memory for all entries of the hash.
{
for (int i = 0; i < HSIZE; i++) {
for (rec_t* rec = hash[i]; rec != NULL;) {
rec_t* next = rec->next;
free(rec->prob);
free(rec);
rec = next;
}
hash[i] = NULL;
}
}
// SECTION 4.4.2. CONSTRUCTORS //
trunc_pol_t*
new_zero_trunc_pol( // PRIVATE
void // No argument
)
// SYNOPSIS:
// Allocate memory for a new struct of type 'trunc_pol_t', initialize
// it to 0 (i.e. null polynomial).
//
// RETURN:
// A pointer to the new struct of type 'trunc_pol_t' or 'NULL' in case
// of failure.
//
// FAILURE:
// Fails if parameters are not initialized (if 'K' is unknown, we
// cannot call 'malloc()' with the proper size) or if there is a
// memory error.
{
// Cannot be called before 'sesame_set_static_params()'.
if (!PARAMS_INITIALIZED) {
warning("parameters unset: call `sesame_set_static_params'", __func__,
__LINE__);
goto in_case_of_failure;
}
trunc_pol_t* new = calloc(1, KSZ);
handle_memory_error(new);
return new;
in_case_of_failure:
return NULL;
}
matrix_t*
new_null_matrix( // PRIVATE
const int dim // Number of rows / columns
)
// SYNOPSIS:
// Allocate memory for a new struct of type 'matrix_t', initialize all
// entries to 0 (i.e. all are NULL pointers). Since the entries of
// matrix are pointers to 'trunc_pol_t', each of them corresponds to a
// null polynomial. Still, the entries are not writable because no
// memory has been allocatd for them (see 'new_zero_matrix()').
//
// RETURN:
// A pointer to the new struct of type 'matrix_t' or 'NULL' in case
// of failure.
//
// FAILURE:
// Fails if there is a memory error.
{
// Initialize to zero.
size_t sz = sizeof(matrix_t) + dim * dim * sizeof(trunc_pol_t*);
matrix_t* new = calloc(1, sz);
handle_memory_error(new);
// The dimension is set upon creation and never changes afterwards.
// In the declaration of the struct, the member 'dim' is a constant,
// the syntax below is way to go around the 'const' qualifier.
*(int*)&new->dim = dim;
return new;
in_case_of_failure:
return NULL;
}
matrix_t*
new_zero_matrix( // PRIVATE
const int dim // Number of rows / columns
)
// SYNOPSIS:
// Allocate memory for a new struct of type 'matrix_t', initialize all
// entries to different null polynomials (i.e. all are non NULL
// pointers to struct of type 'trunc_pol_t' with all their coefficients
// set to 0). Unlike 'new_null_matrix()', the entries are writable.
//
// RETURN:
// A pointer to the new struct of type 'matrix_t' or 'NULL' in case
// of failure.
//
// FAILURE:
// Fails if parameters are not initialized (if 'K' is unknown, we
// cannot call 'malloc()' with the proper size for the truncated
// polynomials) or if there is a memory error. Checking that parameters
// are set is done indirectly in the call to 'new_zero_trunc_pol()'.
{
matrix_t* new = new_null_matrix(dim);
handle_memory_error(new);
for (int i = 0; i < dim * dim; i++) {
handle_memory_error(new->term[i] = new_zero_trunc_pol());
}
return new;
in_case_of_failure:
destroy_mat(new);
return NULL;
}
// SECTION 4.4.3 HASH MANIPULATION FUNCTIONS //
int
squish( // PRIVATE
int N // Number of duplicates
)
// SYNOPSIS:
// Assign 'N' to predefined buckets.
//
// RETURN:
// The reference value for 'N'.
{
// NOTE 4.4.3.1. Shared buckets for the values of N. //
//
// The values are approximately the mid-points of consecutive powers of
// two. 0 has its own bucket, 1, 2 and 3 are in the same bucket, so are
// 3, 4, 5, 6, 7 8 and 9 etc. The normalized values of 'N' are the
// mid-points of the intervals.
const int coarse_values[] = {0, 1, 4, 10, 22,
46, 94, 190, 382, 766,
1534, 3070, 6142, 12286, 24574};
int lgN = N < 16383 ? floor(log2(N + 1)) : 14;
return coarse_values[lgN];
}
rec_t*
lookup( // PRIVATE
rec_t** hash, // Table to perform the lookup in
const int n, // Amount of skipping
const double u, // Sequence divergence
const int N // Number of duplicates
)
// SYNOPSIS:
// Look for the entry of 'hash', associated with the key (n, u, N)
// where 'n' can be thought of the amount of skipping (use 0 for
// exact or MEM seeds), 'u' can be thought of the sequence divergence
// "coarse-grained" to 0.01 and 'N' can be thought of the number of
// duplicates.
//
// RETURN:
// A pointer to the struct of type 'rec_t' that corresponds to the key
// (or its coarse-grained variants) if present, or NULL otherwise.
{
size_t coarse_u = (100 * u);
size_t addr = (739 * n + 37 * N + coarse_u) % HSIZE;
for (rec_t* rec = hash[addr]; rec != NULL; rec = rec->next) {
if (rec->n == n && rec->u == coarse_u && rec->N == N)
return rec;
}
// Entry not found.
return NULL;
}
rec_t*
insert( // PRIVATE
rec_t** hash, // Table to insert entry into
const int n, // Amount of skipping
const double u, // Sequence divergence
const int N, // Number of duplicates
double* prob // Array of probabilities
)
// SYNOPSIS:
// Create an entry in 'hash', with value 'prob', associated with the
// key (n, u, N) where 'n' can be thought of the amount of skipping
// (use 0 for exact or MEM seeds), 'u' can be thought of the sequence
// divergence "coarse-grained" to 0.01 and 'N' can be thought of the
// number of duplicates.
//
// RETURN:
// A pointer to the struct of type 'rec_t' that corresponds to the
// inserted key, or NULL in case of failure.
//
// FAILURE:
// Fails if 'malloc()' fails.
{
// See if entry was already created.
rec_t *rec = lookup(hash, n, u, N);
if (rec != NULL) {
// WARNING: this may cause memory leaks.
rec->prob = prob;
return rec;
}
size_t coarse_u = (100 * u);
size_t addr = (739 * n + 37 * N + coarse_u) % HSIZE;
rec_t* new = malloc(sizeof(rec_t));
handle_memory_error(new);
new->n = n;
new->u = coarse_u;
new->N = N;
new->prob = prob;
new->next = hash[addr];
hash[addr] = new;
return new;
in_case_of_failure:
return NULL;
}
double*
fetch_prob( // PUBLIC
const int n, // Amount of skipping
const double u, // Sequence divergence
const int N // Number of duplicates
)
// SYNOPSIS:
// Look for the entry of the hash associated with the key (N,u).
// 'u' is "coarse-grained" to 0.01.
//
// RETURN:
// An array of numbers that corresponds to the key (or its
// coarse-grained variants) if present, or NULL otherwise.
{
rec_t* record = lookup(Y1, n, u, N);
return record == NULL ? NULL : record->prob;
}
int
store_prob( // PUBLIC
const int n, // Amount of skipping
const double u, // Sequence divergence
const int N, // Number of duplicates
double* prob // Array of probabilities
)
// SYNOPSIS:
// Create an entry in the hash associated with the key
// 'u' is "coarse-grained" to 0.01.
//
// RETURN:
// 'SUCCESS' (1) in case of success, 'FAILURE' (0) in case of
// failure.
//
// FAILURE:
// Fails if 'insert()' fails.
{
rec_t* record = insert(Y1, n, u, N, prob);
return record == NULL ? FAILURE : SUCCESS;
}
// SECTION 4.4.4. LIBRARY INITIALIATION AND CLEAN UP //
int
dynamic_params_OK( // PRIVATE
const int k, // Segment or read size
const double u, // Sequence divergence
const int N // Number of duplicates
)
// SYNOPSIS:
// Check if dynamic parameters are valid.
//
// RETURN:
// SUCCESS (1) upon success or FAILURE (0) upon failure.
//
// FAILURE:
// Fails if dynamic parameters do not conform specifications or if
// static parameters are not initialized.
{
// Check if sequencing parameters were set. Otherwise
// warn user and fail gracefully (return nan).
if (!PARAMS_INITIALIZED) {
warning("parameters unset: call `sesame_set_static_params'", __func__,
__LINE__);
return FAILURE;
}
if (k > K) {
char msg[128];
snprintf(msg, 128, "parameter k (%d) greater than set value (%d)", k,
K);
warning(msg, __func__, __LINE__);
return FAILURE;
}
if (k < 1) {
warning("parameter k must be positive", __func__, __LINE__);
return FAILURE;
}
if (u <= 0.0 || u >= 1.0) {
warning("parameter u must be between 0 and 1", __func__, __LINE__);
return FAILURE;
}
if (N < 0) {
warning("parameter N must be nonnegative", __func__, __LINE__);
return FAILURE;
}
return SUCCESS;
}
void
sesame_clean( // PUBLIC
void // No argument
)
// SYNOPSIS:
// Rest global parameters to their uninitialized state and free
// allocated memory. The function must be called after using the
// library to prevent memory leaks.
//
// SIDE-EFFETS:
// Change the values of the global constants 'G', 'K', 'P', 'KSZ'
// and 'PARAMS_INITIALIZED'. Free global variables 'TEMP', 'XI',
// 'XIc', 'ETA' and 'ETAc'. Destroy global hashes.
{
PARAMS_INITIALIZED = 0;
// Set global variables to "fail" values.
G = 0;
K = 0;
P = 0.0;
KSZ = 0;
PARAMS_INITIALIZED = 0;
// Free everything.
free(TEMP);
TEMP = NULL;
free(XI);
XI = NULL;
free(XIc);
XIc = NULL;
free(ETA);
ETA = NULL;
free(ETAc);
ETAc = NULL;
// Clean hash tables.
clean_hash(H1N);
clean_hash(H1O);
clean_hash(H2N);
clean_hash(H2O);
clean_hash(H3N);
clean_hash(H3O);
clean_hash(Y1);
return;
}
int
sesame_set_static_params( // PRIVATE
int g, // Minimum seed size
int k, // Segment or read size
double p // Error rate
)
// SYNOPSIS:
// Initialize static parameters from user-defined values.
//
// RETURN:
// SUCCESS (1) upon success or FAILURE (0) upon failure.
//
// SIDE-EFFETS:
// Change the values of the global constants 'G', 'K', 'P', 'KSZ'
// and 'PARAMS_INITIALIZED'. Destroy global hashes. Reset the
// global 'TEMP'.
//
// FAILURE:
// Fails if static parameters do not conform specifications or if
// 'malloc()' fails (the truncated polynomial 'TEMP' must be
// allocated).
{
// Check input
if (g <= 0 || k <= 0) {
warning("parameters g and k must greater than 0", __func__, __LINE__);
goto in_case_of_failure;
}
if (p <= 0.0 || p >= 1.0) {
warning("parameter p must be between 0 and 1", __func__, __LINE__);
goto in_case_of_failure;
}
G = g; // MEM size.
K = k; // Read size.
P = p; // Sequencing error.
// All 'trunc_pol_t' must be congruent.
KSZ = sizeof(trunc_pol_t) + (K + 1) * sizeof(double);
PARAMS_INITIALIZED = 1;
// Clean previous values (if any).
clean_hash(H1N);
clean_hash(H1O);
clean_hash(H2N);
clean_hash(H2O);
clean_hash(H3N);
clean_hash(H3O);
clean_hash(Y1);
free(TEMP); // Nothing happens if 'TEMP' is NULL.
handle_memory_error(TEMP = new_zero_trunc_pol());
return SUCCESS;
in_case_of_failure:
sesame_clean();
return FAILURE;
}
// SECTION 4.5. WEIGHTED GENERATING FUNCTIONS //
// NOTE 4.5.1. Internal errors in weighted generating functions //
//
// The functions below do not have detailed synopsis because they all
// follow the same pattern of allocating a particular polynomial defined
// in the theory of computing MEM seeding probabilities by the symbolic
// method. Some functions check the value of the input paramters and can
// trigger an "internal error" (see NOTE 3.2), whereas others do not. The
// logic is that the polynomials can be mathematically undefined (when
// the expression involves a division by zero for instance) or can be not
// used in the present theory. The former case triggers an internal
// error, whereas the second does not. This is to allow anyone to reuse
// the code for their own purpose, without "hurting themselves" by
// triggering undefined, unpredictable or unwanted behaviors. Calling the
// visible functions of this library should never trigger an internal
// error.
trunc_pol_t*
new_trunc_pol_A( // PRIVATE
const size_t m, // See equation (20)
const size_t n, // See equation (20)
const double u, // Divergence rate
const int N // Number of duplicates
)
// Convenience function described in equation (20) of reference [1].
{
// REMINDER: 'u' must be in (0,1), we assume the caller checked.
trunc_pol_t* new = new_zero_trunc_pol();
handle_memory_error(new);
// When N is 0, these polynomials are non-null
// only if 'm' and 'n' are zero.
if (N == 0) {
if (m == 0 && n == 0) {
new->degree = 1;
new->monodeg = 1;
new->coeff[1] = P;
}
return new;
}
// This is not a monomial.
new->degree = K;
new->monodeg = K + 1;
double omega_p_pow_of_q = omega(n, u, N) * P;
for (int i = 1; i <= G; i++) {
new->coeff[i] = (1 - pow(xi(i - 1, u), N)) * omega_p_pow_of_q;
omega_p_pow_of_q *= (1.0 - P);
}
for (int i = G + 1; i <= K; i++) {
new->coeff[i] =
(1 - pow(xi(i - 1, u), m) * (1 - zeta(i - 1, m, n, u, N))) *
omega_p_pow_of_q;
omega_p_pow_of_q *= (1.0 - P);
}
return new;
in_case_of_failure:
return NULL;
}
trunc_pol_t*
new_trunc_pol_B( // PRIVATE
const size_t i, // See equation (14)
const double u, // Divergence rate
const int N // Number of duplicates
)
// Convenience function described in equation (14) of reference [1].
{
// REMINDER: 'u' must be in (0,1), we assume the caller checked.
if (i < 1) {
warning(internal_error, __func__, __LINE__);
goto in_case_of_failure;
}
trunc_pol_t* new = new_zero_trunc_pol();
handle_memory_error(new);
if (N == 0) {
// Monomial of degree 1 if i is 1, zero polynomial otherwise.
if (i == 1) {
new->degree = 1;
new->monodeg = 1;
new->coeff[1] = 1 - P;
}
return new;
}
// This is a monomial.
new->degree = i;
new->monodeg = i;
new->coeff[i] =
(pow(xi(i, u), N) - pow(xi(i - 1, u), N)) * pow(1 - P, i);
return new;
in_case_of_failure:
return NULL;
}
trunc_pol_t*
new_trunc_pol_C( // PRIVATE
const size_t m, // See equation (15)
const double u, // Divergence rate
const int N // Number of duplicates
)
// Convenience function described in equation (15) of reference [1].
// Note: the polynomials are defined in the case m > N, but
// they have no meaning in the present context. Here they are
// simply not forbidden, but also not used (and not tested).
{
// REMINDER: 'u' must be in (0,1), we assume the caller checked.
trunc_pol_t* new = new_zero_trunc_pol();
handle_memory_error(new);
// Special case that 'N' is 0. Assuming that 'm' is 0 (see
// remark above), this is a monomial equal to 1.
if (N == 0) {
new->degree = 0;
new->monodeg = 0;
new->coeff[0] = 1.0;
return new;
}
// This is not a monomial (except in the case that
// 'G' is 1 and 'm' is 1, but this is too rare to justify
// the extra code (it won't trigger a mistake anyway).
new->degree = K;
new->monodeg = K + 1;
double pow_of_q = 1.0;
for (int i = 0; i <= G - 1; i++) {
new->coeff[i] = (1 - pow(xi(i, u), N)) * pow_of_q;
pow_of_q *= (1.0 - P);
}
for (int i = G; i <= K; i++) {
new->coeff[i] = (1 - pow(xi(i, u), m)) * pow_of_q;
pow_of_q *= (1.0 - P);
}
return new;
in_case_of_failure:
return NULL;
}
trunc_pol_t*
new_trunc_pol_D( // PRIVATE
const int j, // See equation (11)
const int m, // See equation (11)
const double u, // Divergence rate
const int N // Number of duplicates
)
// Convenience function described in equation (11) of reference [1].
{
// REMINDER: 'u' must be in (0,1), we assume the caller checked.
if (j > G - 1) {
warning(internal_error, __func__, __LINE__);
goto in_case_of_failure;
}
trunc_pol_t* new = new_zero_trunc_pol();
handle_memory_error(new);
// In the case that 'N' is 0, the only polynomial that is
// defined is D(1,0,0) -- the others are set to 0. In the
// case that 'm' is greater than 'N' the polynomial is 0.
if ((N == 0 && !(j == 1 && m == 0)) || m > N) {
return new;
}
// This is a monomial only if j is equal to G-1.
new->monodeg = j == G - 1 ? 1 : K + 1;
new->degree = G - j;
double omega_p_pow_of_q = omega(m, u, N) * P;
for (int i = 1; i <= G - j; i++) {
new->coeff[i] = omega_p_pow_of_q;
omega_p_pow_of_q *= (1.0 - P);
}
return new;
in_case_of_failure:
return NULL;
}
trunc_pol_t*
new_trunc_pol_E( // PRIVATE
const int j // See equation (12)
)
// Convenience function described in equation (12) of reference [1].
{
if (j > G - 1) {
warning(internal_error, __func__, __LINE__);
goto in_case_of_failure;
}
trunc_pol_t* new = new_zero_trunc_pol();
handle_memory_error(new);
// This is a monomial only if j is equal to G-1
new->monodeg = j == G - 1 ? 0 : K + 1;
new->degree = G - j - 1;
double pow_of_q = 1.0;
for (int i = 0; i <= G - j - 1; i++) {
new->coeff[i] = pow_of_q;
pow_of_q *= (1.0 - P);
}
return new;
in_case_of_failure:
return NULL;
}
trunc_pol_t*
new_trunc_pol_F( // PRIVATE
const int j, // See definition on page 18
const double u // Divergence rate
)
// Convenience function described on page 18 of reference [1].
{
// REMINDER: 'u' must be in (0,1), we assume the caller checked.
if (j > G - 1) {
warning(internal_error, __func__, __LINE__);
goto in_case_of_failure;
}
trunc_pol_t* new = new_zero_trunc_pol();
handle_memory_error(new);
// This is a monomial only when 'j' is 0.
new->monodeg = j == 0 ? 0 : K + 1;
new->degree = j;
const double a = (1 - P) * (1 - u);