-
Notifications
You must be signed in to change notification settings - Fork 8
/
FastTree.c
10304 lines (9484 loc) · 386 KB
/
FastTree.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* FastTree -- inferring approximately-maximum-likelihood trees for large
* multiple sequence alignments.
*
* Morgan N. Price
* http://www.microbesonline.org/fasttree/
*
* Thanks to Jim Hester of the Cleveland Clinic Foundation for
* providing the first parallel (OpenMP) code, Siavash Mirarab of
* UT Austin for implementing the WAG option, Samuel Shepard
* at the CDC for suggesting and helping with the -quote option, and
* Aaron Darling (University of Technology, Sydney) for numerical changes
* for wide alignments of closely-related sequences.
*
* Copyright (C) 2008-2015 The Regents of the University of California
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
* or visit http://www.gnu.org/copyleft/gpl.html
*
* Disclaimer
*
* NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY,
* NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED,
* OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
* COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
* OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE
* PRIVATELY OWNED RIGHTS.
*/
/*
* To compile FastTree, do:
* gcc -Wall -O3 -finline-functions -funroll-loops -o FastTree -lm FastTree.c
* Use -DNO_SSE to turn off use of SSE3 instructions
* (should not be necessary because compiler should not set __SSE__ if
* not available, and modern mallocs should return 16-byte-aligned values)
* Use -DOPENMP -fopenmp to use multiple threads (note, old versions of gcc
* may not support -fopenmp)
* Use -DTRACK_MEMORY if you want detailed reports of memory usage,
* but results are not correct above 4GB because mallinfo stores int values.
* It also makes FastTree run significantly slower.
*
* To get usage guidance, do:
* FastTree -help
*
* FastTree uses profiles instead of a distance matrix, and computes
* support values for each split from the profiles of the 4 nodes
* around the split. It stores a profile for each node and a average
* profile over all active nodes (the "out-profile" for computing the
* total sum of distance to other nodes). The neighbor joining phase
* requires O(N*L*a) space, where N is the number of sequences, L is
* the alignment width, and a is the alphabet size. The top-hits
* heuristic requires an additional O(N sqrt(N)) memory. After
* neighbor-joining, FastTree improves the topology with
* nearest-neighbor interchanges (NNIs) and subtree-prune-regraft
* moves (SPRs), which does not have a significant additional memory
* requirement. (We need only store "up-profiles" on the path from our
* current traversal point to the root.) These take O(NLa) time per
* round, and with default settings, O(N log(N) L a) time total.
* FastTree further improves the topology with maximum-likelihood
* NNIs, using similar data structures and complexity, but with a
* higher constant factor, and now the "profiles" are actually
* posterior distributions for that subtree. Finally, FastTree
* resamples the site likelihoods around each NNI and uses
* the Shimodaira Hasegawa test to estimate the reliability of each split.
*
* Overview of the neighbor-joining phase:
*
* Although FastTree uses a log correction on profile distances to
* account for multiple substitutions when doing NNIs and SPRs, the
* operations on the profiles themselves involve "additive" distances
* -- either %different (for nucleotide) or by using an amino acid
* similarity matrix (for proteins). If we are using %different as
* our distance matrix then
*
* Profile_distance(A,B) = 1 - sum over characters of freq(A)*freq(B)
*
* and we can average this value over positions. Positions with gaps
* are weighted by %ungapped(A) * %ungapped(B).
*
* If we are using an amino acid dissimilarity matrix D(i,j) then at
* each position
*
* Profile_distance(A,B) = sum(i,j) freq(A==i) * freq(B==j) * D(i,j)
* = sum(k) Ak * Bk * Lambda(k)
*
* where k iterates over 20 eigenvectors, Lambda(k) is the eigenvalue,
* and if A==i, then Ak is the kth column of the inverse of the
* eigenvector matrix.
*
* The exhaustive approach (-slow) takes O(N**3*L*a) time, but
* this can be reduced to as little as O(N**(3/2)*log(N)*L*a) time
* by using heuristics.
*
* It uses a combination of three heuristics: a visible set similar to
* that of FastTree (Elias & Lagergren 2005), a local hill-climbing
* search for a better join (as in relaxed neighbor-joining, Evans et
* al. 2006), and a top-hit list to reduce the search space (see
* below).
*
* The "visible" set stores, for each node, the best join for that
* node, as identified at some point in the past
*
* If top-hits are not being used, then the neighbor-joining phase can
* be summarized as:
*
* Compute the out-profile by averaging the leaves
* Compute the out-distance of each leaf quickly, using the out-profile
* Compute the visible set (or approximate it using top-hits, see below)
* Until we're down to 3 active nodes:
* Find the best join in the visible set
* (This involves recomputing the neighbor-joining criterion,
* as out-distances and #active nodes may have changed)
* Follow a chain of best hits (again recomputing the criterion)
* until we find a locally best join, as in relaxed neighbor joining
* Create a profile of the parent node, either using simple averages (default)
* or using weighted joining as in BIONJ (if -bionj was specified)
* Update the out-profile and the out-distances
* Update the visible set:
* find the best join for the new joined node
* replace hits to the joined children with hits to the parent
* if we stumble across a join for the new node that is better
* than the corresponding entry in the visible set, "reset"
* that entry.
*
* For each iteration, this method does
* O(N) work to find the best hit in the visible set
* O(L*N*a*log(N)) work to do the local search, where log(N)
* is a pessimistic estimate of the number of iterations. In
* practice, we average <1 iteration for 2,000 sequences.
* With -fastest, this step is omitted.
* O(N*a) work to compute the joined profile and update the out-profile
* O(L*N*a) work to update the out-distances
* O(L*N*a) work to compare the joined profile to the other nodes
* (to find the new entry in the visible set)
*
* and there are N-3 iterations, so it takes O(N**2 * L * log(N) * a) time.
*
* The profile distances give exactly the same result as matrix
* distances in neighbor-joining or BIONJ would if there are no gaps
* in the alignment. If there are gaps, then it is an
* approximation. To get the same result we also store a "diameter"
* for each node (diameter is 0 for leaves).
*
* In the simpler case (NJ rather than BIONJ), when we join A and B to
* give a new node AB,
*
* Profile(AB) = (A+B)/2
* Profile_distance(AB,C) = (Profile_distance(A,C)+Profile_distance(B,C))/2
* because the formulas above are linear
*
* And according to the neighor-joining rule,
* d(AB,C) = (d(A,C)+d(B,C)-d(A,B))/2
*
* and we can achieve the same value by writing
* diameter(AB) = pd(A,B)/2
* diameter(leaf) = 0
* d(A,B) = pd(A,B) - diameter(A) - diameter(B)
*
* because
* d(AB,C) = (d(A,C)+d(B,C)-d(A,B))/2
* = (pd(A,C)-diam(A)-diam(C)+pd(B,C)-diam(B)-diam(C)-d(A,B)+diam(A)+diam(B))/2
* = (pd(A,C)+pd(B,C))/2 - diam(C) - pd(A,B)
* = pd(AB,C) - diam(AB) - diam(C)
*
* If we are using BIONJ, with weight lambda for the join:
* Profile(AB) = lambda*A + (1-lambda)*B
* then a similar argument gives
* diam(AB) = lambda*diam(A) + (1-lambda)*diam(B) + lambda*d(A,AB) + (1-lambda)*d(B,AB),
*
* where, as in neighbor joining,
* d(A,AB) = d(A,B) + (total out_distance(A) - total out_distance(B))/(n-2)
*
* A similar recursion formula works for the "variance" matrix of BIONJ,
* var(AB,C) = lambda*var(A,C) + (1-lambda)*var(B,C) - lambda*(1-lambda)*var(A,B)
* is equivalent to
* var(A,B) = pv(A,B) - vd(A) - vd(B), where
* pv(A,B) = pd(A,B)
* vd(A) = 0 for leaves
* vd(AB) = lambda*vd(A) + (1-lambda)*vd(B) + lambda*(1-lambda)*var(A,B)
*
* The top-hist heuristic to reduce the work below O(N**2*L) stores a top-hit
* list of size m=sqrt(N) for each active node.
*
* The list can be initialized for all the leaves in sub (N**2 * L) time as follows:
* Pick a "seed" sequence and compare it to all others
* Store the top m hits of the seed as its top-hit list
* Take "close" hits of the seed(within the top m, and see the "close" parameter),
* and assume that their top m hits lie within the top 2*m hits of the seed.
* So, compare them to the seed's neighors (if they do not already
* have a top hit list) and set their top hits.
*
* This method does O(N*L) work for each seed, or O(N**(3/2)*L) work total.
*
* To avoid doing O(N*L) work at each iteration, we need to avoid
* updating the visible set and the out-distances. So, we use "stale"
* out-distances, and when searching the visible set for the best hit,
* we only inspect the top m=sqrt(N) entries. We then update those
* out-distances (up to 2*m*L*a work) and then find the best hit.
*
* To avoid searching the entire visible set, FastTree keeps
* and updates a list of the top sqrt(N) entries in the visible set.
* This costs O(sqrt(N)) time per join to find the best entry and to
* update, or (N sqrt(N)) time overall.
*
* Similarly, when doing the local hill-climbing, we avoid O(N*L) work
* by only considering the top-hits for the current node. So this adds
* O(m*a*log(N)) work per iteration.
*
* When we join two nodes, we compute profiles and update the
* out-profile as before. We need to compute the best hits of the node
* -- we merge the lists for the children and select the best up-to-m
* hits. If the top hit list contains a stale node we replace it with
* its parent. If we still have <m/2 entries, we do a "refresh".
*
* In a "refresh", similar to the fast top-hit computation above, we
* compare the "seed", in this case the new joined node, to all other
* nodes. We compare its close neighbors (the top m hits) to all
* neighbors (the top 2*m hits) and update the top-hit lists of all
* neighbors (by merging to give a list of 3*m entries and then
* selecting the best m entries).
*
* Finally, during these processes we update the visible sets for
* other nodes with better hits if we find them, and we set the
* visible entry for the new joined node to the best entry in its
* top-hit list. (And whenever we update a visible entry, we
* do O(sqrt(N)) work to update the top-visible list.)
* These udpates are not common so they do not alter the
* O(N sqrt(N) log(N) L a) total running time for the joining phase.
*
* Second-level top hits
*
* With -fastest or with -2nd, FastTree uses an additional "2nd-level" top hits
* heuristic to reduce the running time for the top-hits phase to
* O(N**1.25 L) and for the neighbor-joining phase to O(N**1.25 L a).
* This also reduces the memory usage for the top-hits lists to
* O(N**1.25), which is important for alignments with a million
* sequences. The key idea is to store just q = sqrt(m) top hits for
* most sequences.
*
* Given the neighbors of A -- either for a seed or for a neighbor
* from the top-hits heuristic, if B is within the top q hits of A, we
* set top-hits(B) from the top 3*q top-hits of A. And, we record that
* A is the "source" of the hits for B, so if we run low on hits for
* B, instead of doing a full refresh, we can do top-hits(B) :=
* top-hits(B) union top-hits(active_ancestor(A)).
* During a refresh, these "2nd-level" top hits are updated just as
* normal, but the source is maintained and only q entries are stored,
* until we near the end of the neighbor joining phase (until the
* root as 2*m children or less).
*
* Parallel execution with OpenMP
*
* If you compile FastTree with OpenMP support, it will take
* advantage of multiple CPUs on one machine. It will parallelize:
*
* The top hits phase
* Comparing one node to many others during the NJ phase (the simplest kind of join)
* The refresh phase
* Optimizing likelihoods for 3 alternate topologies during ML NNIs and ML supports
* (only 3 threads can be used)
*
* This accounts for most of the O(N L a) or slower steps except for
* minimum-evolution NNIs (which are fast anyway), minimum-evolution SPRs,
* selecting per-site rates, and optimizing branch lengths outside of ML NNIs.
*
* Parallelizing the top hits phase may lead to a slight change in the tree,
* as some top hits are computed from different (and potentially less optimal source).
* This means that results on repeated runs may not be 100% identical.
* However, this should not have any significant effect on tree quality
* after the NNIs and SPRs.
*
* The OpenMP code also turns off the star-topology test during ML
* NNIs, which may lead to slight improvements in likelihood.
*/
#include <stdio.h>
#include <stdbool.h>
#include <string.h>
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <sys/time.h>
#include <ctype.h>
#include <unistd.h>
#ifdef TRACK_MEMORY
/* malloc.h apparently doesn't exist on MacOS */
#include <malloc.h>
#endif
/* Compile with -DOPENMP to turn on multithreading */
#ifdef OPENMP
#include <omp.h>
#endif
/* By default, tries to compile with SSE instructions for greater speed.
But if compiled with -DUSE_DOUBLE, uses double precision instead of single-precision
floating point (2x memory required), does not use SSE, and allows much shorter
branch lengths.
*/
#ifdef __SSE__
#if !defined(NO_SSE) && !defined(USE_DOUBLE)
#define USE_SSE3
#endif
#endif
#ifdef USE_DOUBLE
#define SSE_STRING "Double precision (No SSE3)"
typedef double numeric_t;
#define ScanNumericSpec "%lf"
#else
typedef float numeric_t;
#define ScanNumericSpec "%f"
#endif
#ifdef USE_SSE3
#define SSE_STRING "SSE3"
#define ALIGNED __attribute__((aligned(16)))
#define IS_ALIGNED(X) ((((unsigned long) new) & 15L) == 0L)
#include <xmmintrin.h>
#else
#define ALIGNED
#define IS_ALIGNED(X) 1
#ifndef USE_DOUBLE
#define SSE_STRING "No SSE3"
#endif
#endif /* USE_SSE3 */
#define FT_VERSION "2.1.11"
char *usage =
" FastTree protein_alignment > tree\n"
" FastTree < protein_alignment > tree\n"
" FastTree -out tree protein_alignment\n"
" FastTree -nt nucleotide_alignment > tree\n"
" FastTree -nt -gtr < nucleotide_alignment > tree\n"
" FastTree < nucleotide_alignment > tree\n"
"FastTree accepts alignments in fasta or phylip interleaved formats\n"
"\n"
"Common options (must be before the alignment file):\n"
" -quiet to suppress reporting information\n"
" -nopr to suppress progress indicator\n"
" -log logfile -- save intermediate trees, settings, and model details\n"
" -fastest -- speed up the neighbor joining phase & reduce memory usage\n"
" (recommended for >50,000 sequences)\n"
" -n <number> to analyze multiple alignments (phylip format only)\n"
" (use for global bootstrap, with seqboot and CompareToBootstrap.pl)\n"
" -nosupport to not compute support values\n"
" -intree newick_file to set the starting tree(s)\n"
" -intree1 newick_file to use this starting tree for all the alignments\n"
" (for faster global bootstrap on huge alignments)\n"
" -pseudo to use pseudocounts (recommended for highly gapped sequences)\n"
" -gtr -- generalized time-reversible model (nucleotide alignments only)\n"
" -lg -- Le-Gascuel 2008 model (amino acid alignments only)\n"
" -wag -- Whelan-And-Goldman 2001 model (amino acid alignments only)\n"
" -quote -- allow spaces and other restricted characters (but not ' ) in\n"
" sequence names and quote names in the output tree (fasta input only;\n"
" FastTree will not be able to read these trees back in)\n"
" -noml to turn off maximum-likelihood\n"
" -nome to turn off minimum-evolution NNIs and SPRs\n"
" (recommended if running additional ML NNIs with -intree)\n"
" -nome -mllen with -intree to optimize branch lengths for a fixed topology\n"
" -cat # to specify the number of rate categories of sites (default 20)\n"
" or -nocat to use constant rates\n"
" -gamma -- after optimizing the tree under the CAT approximation,\n"
" rescale the lengths to optimize the Gamma20 likelihood\n"
" -constraints constraintAlignment to constrain the topology search\n"
" constraintAlignment should have 1s or 0s to indicates splits\n"
" -expert -- see more options\n"
"For more information, see http://www.microbesonline.org/fasttree/\n";
char *expertUsage =
"FastTree [-nt] [-n 100] [-quote] [-pseudo | -pseudo 1.0]\n"
" [-boot 1000 | -nosupport]\n"
" [-intree starting_trees_file | -intree1 starting_tree_file]\n"
" [-quiet | -nopr]\n"
" [-nni 10] [-spr 2] [-noml | -mllen | -mlnni 10]\n"
" [-mlacc 2] [-cat 20 | -nocat] [-gamma]\n"
" [-slow | -fastest] [-2nd | -no2nd] [-slownni] [-seed 1253] \n"
" [-top | -notop] [-topm 1.0 [-close 0.75] [-refresh 0.8]]\n"
" [-gtr] [-gtrrates ac ag at cg ct gt] [-gtrfreq A C G T]\n"
" [ -lg | -wag | -trans transitionmatrixfile ]\n"
" [-matrix Matrix | -nomatrix] [-nj | -bionj]\n"
" [ -constraints constraintAlignment [ -constraintWeight 100.0 ] ]\n"
" [-log logfile]\n"
" [ alignment_file ]\n"
" [ -out output_newick_file | > newick_tree]\n"
"\n"
"or\n"
"\n"
"FastTree [-nt] [-matrix Matrix | -nomatrix] [-rawdist] -makematrix [alignment]\n"
" [-n 100] > phylip_distance_matrix\n"
"\n"
" FastTree supports fasta or phylip interleaved alignments\n"
" By default FastTree expects protein alignments, use -nt for nucleotides\n"
" FastTree reads standard input if no alignment file is given\n"
"\n"
"Input/output options:\n"
" -n -- read in multiple alignments in. This only\n"
" works with phylip interleaved format. For example, you can\n"
" use it with the output from phylip's seqboot. If you use -n, FastTree\n"
" will write 1 tree per line to standard output.\n"
" -intree newickfile -- read the starting tree in from newickfile.\n"
" Any branch lengths in the starting trees are ignored.\n"
" -intree with -n will read a separate starting tree for each alignment.\n"
" -intree1 newickfile -- read the same starting tree for each alignment\n"
" -quiet -- do not write to standard error during normal operation (no progress\n"
" indicator, no options summary, no likelihood values, etc.)\n"
" -nopr -- do not write the progress indicator to stderr\n"
" -log logfile -- save intermediate trees so you can extract\n"
" the trees and restart long-running jobs if they crash\n"
" -log also reports the per-site rates (1 means slowest category)\n"
" -quote -- quote sequence names in the output and allow spaces, commas,\n"
" parentheses, and colons in them but not ' characters (fasta files only)\n"
"\n"
"Distances:\n"
" Default: For protein sequences, log-corrected distances and an\n"
" amino acid dissimilarity matrix derived from BLOSUM45\n"
" or for nucleotide sequences, Jukes-Cantor distances\n"
" To specify a different matrix, use -matrix FilePrefix or -nomatrix\n"
" Use -rawdist to turn the log-correction off\n"
" or to use %different instead of Jukes-Cantor\n"
" (These options affect minimum-evolution computations only;\n"
" use -trans to affect maximum-likelihoood computations)\n"
"\n"
" -pseudo [weight] -- Use pseudocounts to estimate distances between\n"
" sequences with little or no overlap. (Off by default.) Recommended\n"
" if analyzing the alignment has sequences with little or no overlap.\n"
" If the weight is not specified, it is 1.0\n"
"\n"
"Topology refinement:\n"
" By default, FastTree tries to improve the tree with up to 4*log2(N)\n"
" rounds of minimum-evolution nearest-neighbor interchanges (NNI),\n"
" where N is the number of unique sequences, 2 rounds of\n"
" subtree-prune-regraft (SPR) moves (also min. evo.), and\n"
" up to 2*log(N) rounds of maximum-likelihood NNIs.\n"
" Use -nni to set the number of rounds of min. evo. NNIs,\n"
" and -spr to set the rounds of SPRs.\n"
" Use -noml to turn off both min-evo NNIs and SPRs (useful if refining\n"
" an approximately maximum-likelihood tree with further NNIs)\n"
" Use -sprlength set the maximum length of a SPR move (default 10)\n"
" Use -mlnni to set the number of rounds of maximum-likelihood NNIs\n"
" Use -mlacc 2 or -mlacc 3 to always optimize all 5 branches at each NNI,\n"
" and to optimize all 5 branches in 2 or 3 rounds\n"
" Use -mllen to optimize branch lengths without ML NNIs\n"
" Use -mllen -nome with -intree to optimize branch lengths on a fixed topology\n"
" Use -slownni to turn off heuristics to avoid constant subtrees (affects both\n"
" ML and ME NNIs)\n"
"\n"
"Maximum likelihood model options:\n"
" -lg -- Le-Gascuel 2008 model instead of (default) Jones-Taylor-Thorton 1992 model (a.a. only)\n"
" -wag -- Whelan-And-Goldman 2001 model instead of (default) Jones-Taylor-Thorton 1992 model (a.a. only)\n"
" -gtr -- generalized time-reversible instead of (default) Jukes-Cantor (nt only)\n"
" -cat # -- specify the number of rate categories of sites (default 20)\n"
" -nocat -- no CAT model (just 1 category)\n"
" - trans filename -- use the transition matrix from filename\n"
" This is supported for amino acid alignments only\n"
" The file must be tab-delimited with columns in the order ARNDCQEGHILKMFPSTWYV*\n"
" The additional column named * is for the stationary distribution\n"
" Each row must have a row name in the same order ARNDCQEGHILKMFPSTWYV\n"
" -gamma -- after the final round of optimizing branch lengths with the CAT model,\n"
" report the likelihood under the discrete gamma model with the same\n"
" number of categories. FastTree uses the same branch lengths but\n"
" optimizes the gamma shape parameter and the scale of the lengths.\n"
" The final tree will have rescaled lengths. Used with -log, this\n"
" also generates per-site likelihoods for use with CONSEL, see\n"
" GammaLogToPaup.pl and documentation on the FastTree web site.\n"
"\n"
"Support value options:\n"
" By default, FastTree computes local support values by resampling the site\n"
" likelihoods 1,000 times and the Shimodaira Hasegawa test. If you specify -nome,\n"
" it will compute minimum-evolution bootstrap supports instead\n"
" In either case, the support values are proportions ranging from 0 to 1\n"
"\n"
" Use -nosupport to turn off support values or -boot 100 to use just 100 resamples\n"
" Use -seed to initialize the random number generator\n"
"\n"
"Searching for the best join:\n"
" By default, FastTree combines the 'visible set' of fast neighbor-joining with\n"
" local hill-climbing as in relaxed neighbor-joining\n"
" -slow -- exhaustive search (like NJ or BIONJ, but different gap handling)\n"
" -slow takes half an hour instead of 8 seconds for 1,250 proteins\n"
" -fastest -- search the visible set (the top hit for each node) only\n"
" Unlike the original fast neighbor-joining, -fastest updates visible(C)\n"
" after joining A and B if join(AB,C) is better than join(C,visible(C))\n"
" -fastest also updates out-distances in a very lazy way,\n"
" -fastest sets -2nd on as well, use -fastest -no2nd to avoid this\n"
"\n"
"Top-hit heuristics:\n"
" By default, FastTree uses a top-hit list to speed up search\n"
" Use -notop (or -slow) to turn this feature off\n"
" and compare all leaves to each other,\n"
" and all new joined nodes to each other\n"
" -topm 1.0 -- set the top-hit list size to parameter*sqrt(N)\n"
" FastTree estimates the top m hits of a leaf from the\n"
" top 2*m hits of a 'close' neighbor, where close is\n"
" defined as d(seed,close) < 0.75 * d(seed, hit of rank 2*m),\n"
" and updates the top-hits as joins proceed\n"
" -close 0.75 -- modify the close heuristic, lower is more conservative\n"
" -refresh 0.8 -- compare a joined node to all other nodes if its\n"
" top-hit list is less than 80% of the desired length,\n"
" or if the age of the top-hit list is log2(m) or greater\n"
" -2nd or -no2nd to turn 2nd-level top hits heuristic on or off\n"
" This reduces memory usage and running time but may lead to\n"
" marginal reductions in tree quality.\n"
" (By default, -fastest turns on -2nd.)\n"
"\n"
"Join options:\n"
" -nj: regular (unweighted) neighbor-joining (default)\n"
" -bionj: weighted joins as in BIONJ\n"
" FastTree will also weight joins during NNIs\n"
"\n"
"Constrained topology search options:\n"
" -constraints alignmentfile -- an alignment with values of 0, 1, and -\n"
" Not all sequences need be present. A column of 0s and 1s defines a\n"
" constrained split. Some constraints may be violated\n"
" (see 'violating constraints:' in standard error).\n"
" -constraintWeight -- how strongly to weight the constraints. A value of 1\n"
" means a penalty of 1 in tree length for violating a constraint\n"
" Default: 100.0\n"
"\n"
"For more information, see http://www.microbesonline.org/fasttree/\n"
" or the comments in the source code\n";
;
#define MAXCODES 20
#define NOCODE 127
/* Note -- sequence lines longer than BUFFER_SIZE are
allowed, but FASTA header lines must be within this limit */
#define BUFFER_SIZE 5000
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
typedef struct {
int nPos;
int nSeq;
char **names;
char **seqs;
int nSaved; /* actual allocated size of names and seqs */
} alignment_t;
/* For each position in a profile, we have a weight (% non-gapped) and a
frequency vector. (If using a matrix, the frequency vector is in eigenspace).
We also store codes for simple profile positions (all gaps or only 1 value)
If weight[pos] > 0 && codes[pos] == NOCODE then we store the vector
vectors itself is sets of nCodes long, so the vector for the ith nonconstant position
starts at &vectors[nCodes*i]
To speed up comparison of outprofile to a sequence or other simple profile, we also
(for outprofiles) store codeDist[iPos*nCodes+k] = dist(k,profile[iPos])
For constraints, we store a vector of nOn and nOff
If not using constraints, those will be NULL
*/
typedef struct {
/* alignment profile */
numeric_t *weights;
unsigned char *codes;
numeric_t *vectors; /* NULL if no non-constant positions, e.g. for leaves */
int nVectors;
numeric_t *codeDist; /* Optional -- distance to each code at each position */
/* constraint profile */
int *nOn;
int *nOff;
} profile_t;
/* A visible node is a pair of nodes i, j such that j is the best hit of i,
using the neighbor-joining criterion, at the time the comparison was made,
or approximately so since then.
Note that variance = dist because in BIONJ, constant factors of variance do not matter,
and because we weight ungapped sequences higher naturally when averaging profiles,
so we do not take this into account in the computation of "lambda" for BIONJ.
For the top-hit list heuristic, if the top hit list becomes "too short",
we store invalid entries with i=j=-1 and dist/criterion very high.
*/
typedef struct {
int i, j;
numeric_t weight; /* Total product of weights (maximum value is nPos)
This is needed for weighted joins and for pseudocounts,
but not in most other places.
For example, it is not maintained by the top hits code */
numeric_t dist; /* The uncorrected distance (includes diameter correction) */
numeric_t criterion; /* changes when we update the out-profile or change nActive */
} besthit_t;
typedef struct {
int nChild;
int child[3];
} children_t;
typedef struct {
/* Distances between amino acids */
numeric_t distances[MAXCODES][MAXCODES];
/* Inverse of the eigenvalue matrix, for rotating a frequency vector
into eigenspace so that profile similarity computations are
O(alphabet) not O(alphabet*alphabet) time.
*/
numeric_t eigeninv[MAXCODES][MAXCODES];
numeric_t eigenval[MAXCODES]; /* eigenvalues */
/* eigentot=eigeninv times the all-1s frequency vector
useful for normalizing rotated frequency vectors
*/
numeric_t eigentot[MAXCODES];
/* codeFreq is the transpose of the eigeninv matrix is
the rotated frequency vector for each code */
numeric_t codeFreq[MAXCODES][MAXCODES];
numeric_t gapFreq[MAXCODES];
} distance_matrix_t;
/* A transition matrix gives the instantaneous rate of change of frequencies
df/dt = M . f
which is solved by
f(t) = exp(M) . f(0)
and which is not a symmetric matrix because of
non-uniform stationary frequencies stat, so that
M stat = 0
M(i,j) is instantaneous rate of j -> i, not of i -> j
S = diag(sqrt(stat)) is a correction so that
M' = S**-1 M S is symmetric
Let W L W**-1 = M' be an eigendecomposition of M'
Because M' is symmetric, W can be a rotation, and W**-1 = t(W)
Set V = S*W
M = V L V**-1 is an eigendecomposition of M
Note V**-1 = W**-1 S**-1 = t(W) S**-1
Evolution by time t is given by
exp(M*t) = V exp(L*t) V**-1
P(A & B | t) = B . exp(M*t) . (A * stat)
note this is *not* the same as P(A->B | t)
and we can reduce some of the computations from O(a**2) to O(a) time,
where a is the alphabet size, by storing frequency vectors as
t(V) . f = t(W) . t(S) . f
Then
P(f0 & f1 | t) = f1 . exp(M*t) . f0 * (f0 . stat) = sum(r0j * r1j * exp(l_j*t))
where r0 and r1 are the transformed vectors
Posterior distribution of P given children f0 and f1 is given by
P(i | f0, f1, t0, t1) = stat * P(i->f0 | t0) * P(i->f1 | t1)
= P(i & f0 | t0) * P(i & f1 | t1) / stat
~ (V . exp(t0*L) . r0) * (V . exp(t1*L) . r1) / stat
When normalize this posterior distribution (to sum to 1), divide by stat,
and transform by t(V) -- this is the "profile" of internal nodes
To eliminate the O(N**2) step of transforming by t(V), if the posterior
distribution of an amino acid is near 1 then we can approximate it by
P(i) ~= (i==A) * w + nearP(i) * (1-w), where
w is fit so that P(i==A) is correct
nearP = Posterior(i | i, i, 0.1, 0.1) [0.1 is an arbitrary choice]
and we confirm that the approximation works well before we use it.
Given this parameter w we can set
rotated_posterior = rotation(w * (i==A)/stat + (1-w) * nearP/stat)
= codeFreq(A) * w/stat(A) + nearFreq(A) * (1-w)
*/
typedef struct {
numeric_t stat[MAXCODES]; /* The stationary distribution */
numeric_t statinv[MAXCODES]; /* 1/stat */
/* the eigenmatrix, with the eigenvectors as columns and rotations of individual
characters as rows. Also includes a NOCODE entry for gaps */
numeric_t codeFreq[NOCODE+1][MAXCODES];
numeric_t eigeninv[MAXCODES][MAXCODES]; /* Inverse of eigenmatrix */
numeric_t eigeninvT[MAXCODES][MAXCODES]; /* transpose of eigeninv */
numeric_t eigenval[MAXCODES]; /* Eigenvalues */
/* These are for approximate posteriors (off by default) */
numeric_t nearP[MAXCODES][MAXCODES]; /* nearP[i][j] = P(parent=j | both children are i, both lengths are 0.1 */
numeric_t nearFreq[MAXCODES][MAXCODES]; /* rotation of nearP/stat */
} transition_matrix_t;
typedef struct {
int nRateCategories;
numeric_t *rates; /* 1 per rate category */
unsigned int *ratecat; /* 1 category per position */
} rates_t;
typedef struct {
/* The input */
int nSeq;
int nPos;
char **seqs; /* the aligment sequences array (not reallocated) */
distance_matrix_t *distance_matrix; /* a pointer (not reallocated), or NULL if using %identity distance */
transition_matrix_t *transmat; /* a pointer (is allocated), or NULL for Jukes-Cantor */
/* Topological constraints are represented for each sequence as binary characters
with values of '0', '1', or '-' (for missing data)
Sequences that have no constraint may have a NULL string
*/
int nConstraints;
char **constraintSeqs;
/* The profile data structures */
int maxnode; /* The next index to allocate */
int maxnodes; /* Space allocated in data structures below */
profile_t **profiles; /* Profiles of leaves and intermediate nodes */
numeric_t *diameter; /* To correct for distance "up" from children (if any) */
numeric_t *varDiameter; /* To correct variances for distance "up" */
numeric_t *selfdist; /* Saved for use in some formulas */
numeric_t *selfweight; /* Saved for use in some formulas */
/* Average profile of all active nodes, the "outprofile"
* If all inputs are ungapped, this has weight 1 (not nSequences) at each position
* The frequencies all sum to one (or that is implied by the eigen-representation)
*/
profile_t *outprofile;
double totdiam;
/* We sometimes use stale out-distances, so we remember what nActive was */
numeric_t *outDistances; /* Sum of distances to other active (parent==-1) nodes */
int *nOutDistActive; /* What nActive was when this outDistance was computed */
/* the inferred tree */
int root; /* index of the root. Unlike other internal nodes, it has 3 children */
int *parent; /* -1 or index of parent */
children_t *child;
numeric_t *branchlength; /* Distance to parent */
numeric_t *support; /* 1 for high-confidence nodes */
/* auxilliary data for maximum likelihood (defaults to 1 category of rate=1.0) */
rates_t rates;
} NJ_t;
/* Uniquify sequences in an alignment -- map from indices
in the alignment to unique indicies in a NJ_t
*/
typedef struct {
int nSeq;
int nUnique;
int *uniqueFirst; /* iUnique -> iAln */
int *alnNext; /* iAln -> next, or -1 */
int *alnToUniq; /* iAln -> iUnique, or -1 if another was the exemplar */
char **uniqueSeq; /* indexed by iUniq -- points to strings allocated elsewhere */
} uniquify_t;
/* Describes which switch to do */
typedef enum {ABvsCD,ACvsBD,ADvsBC} nni_t;
/* A list of these describes a chain of NNI moves in a rooted tree,
making up, in total, an SPR move
*/
typedef struct {
int nodes[2];
double deltaLength; /* change in tree length for this step (lower is better) */
} spr_step_t;
/* Keep track of hits for the top-hits heuristic without wasting memory
j = -1 means empty
If j is an inactive node, this may be replaced by that node's parent (and dist recomputed)
*/
typedef struct {
int j;
numeric_t dist;
} hit_t;
typedef struct {
int nHits; /* the allocated and desired size; some of them may be empty */
hit_t *hits;
int hitSource; /* where to refresh hits from if a 2nd-level top-hit list, or -1 */
int age; /* number of joins since a refresh */
} top_hits_list_t;
typedef struct {
int m; /* size of a full top hits list, usually sqrt(N) */
int q; /* size of a 2nd-level top hits, usually sqrt(m) */
int maxnodes;
top_hits_list_t *top_hits_lists; /* one per node */
hit_t *visible; /* the "visible" (very best) hit for each node */
/* The top-visible set is a subset, usually of size m, of the visible set --
it is the set of joins to select from
Each entry is either a node whose visible set entry has a good (low) criterion,
or -1 for empty, or is an obsolete node (which is effectively the same).
Whenever we update the visible set, should also call UpdateTopVisible()
which ensures that none of the topvisible set are stale (that is, they
all point to an active node).
*/
int nTopVisible; /* nTopVisible = m * topvisibleMult */
int *topvisible;
int topvisibleAge; /* joins since the top-visible list was recomputed */
#ifdef OPENMP
/* 1 lock to read or write any top hits list, no thread grabs more than one */
omp_lock_t *locks;
#endif
} top_hits_t;
/* Global variables */
/* Options */
int verbose = 1;
int showProgress = 1;
int slow = 0;
int fastest = 0;
bool useTopHits2nd = false; /* use the second-level top hits heuristic? */
int bionj = 0;
double tophitsMult = 1.0; /* 0 means compare nodes to all other nodes */
double tophitsClose = -1.0; /* Parameter for how close is close; also used as a coverage req. */
double topvisibleMult = 1.5; /* nTopVisible = m * topvisibleMult; 1 or 2 did not make much difference
in either running time or accuracy so I chose a compromise. */
double tophitsRefresh = 0.8; /* Refresh if fraction of top-hit-length drops to this */
double tophits2Mult = 1.0; /* Second-level top heuristic -- only with -fastest */
int tophits2Safety = 3; /* Safety factor for second level of top-hits heuristic */
double tophits2Refresh = 0.6; /* Refresh 2nd-level top hits if drops down to this fraction of length */
double staleOutLimit = 0.01; /* nActive changes by at most this amount before we recompute
an out-distance. (Only applies if using the top-hits heuristic) */
double fResetOutProfile = 0.02; /* Recompute out profile from scratch if nActive has changed
by more than this proportion, and */
int nResetOutProfile = 200; /* nActive has also changed more than this amount */
int nCodes=20; /* 20 if protein, 4 if nucleotide */
bool useMatrix=true; /* If false, use %different as the uncorrected distance */
bool logdist = true; /* If true, do a log-correction (scoredist-like or Jukes-Cantor)
but only during NNIs and support values, not during neighbor-joining */
double pseudoWeight = 0.0; /* The weight of pseudocounts to avoid artificial long branches when
nearby sequences in the tree have little or no overlap
(off by default). The prior distance is based on
all overlapping positions among the quartet or triplet under
consideration. The log correction takes place after the
pseudocount is used. */
double constraintWeight = 100.0;/* Cost of violation of a topological constraint in evolutionary distance
or likelihood */
double MEMinDelta = 1.0e-4; /* Changes of less than this in tree-length are discounted for
purposes of identifying fixed subtrees */
bool fastNNI = true;
bool gammaLogLk = false; /* compute gamma likelihood without reoptimizing branch lengths? */
/* Maximum likelihood options and constants */
/* These are used to rescale likelihood values and avoid taking a logarithm at each position */
const double LkUnderflow = 1.0e-4;
const double LkUnderflowInv = 1.0e4;
const double LogLkUnderflow = 9.21034037197618; /* -log(LkUnderflowInv) */
const double Log2 = 0.693147180559945;
/* These are used to limit the optimization of branch lengths.
Also very short branch lengths can create numerical problems.
In version 2.1.7, the minimum branch lengths (MLMinBranchLength and MLMinRelBranchLength)
were increased to prevent numerical problems in rare cases.
In version 2.1.8, to provide useful branch lengths for genome-wide alignments,
the minimum branch lengths were dramatically decreased if USE_DOUBLE is defined.
*/
#ifndef USE_DOUBLE
const double MLMinBranchLengthTolerance = 1.0e-4; /* absolute tolerance for optimizing branch lengths */
const double MLFTolBranchLength = 0.001; /* fractional tolerance for optimizing branch lengths */
const double MLMinBranchLength = 5.0e-4; /* minimum value for branch length */
const double MLMinRelBranchLength = 2.5e-4; /* minimum of rate * length */
const double fPostTotalTolerance = 1.0e-10; /* posterior vector must sum to at least this before rescaling */
#else
const double MLMinBranchLengthTolerance = 1.0e-9;
const double MLFTolBranchLength = 0.001;
const double MLMinBranchLength = 5.0e-9;
const double MLMinRelBranchLength = 2.5e-9;
const double fPostTotalTolerance = 1.0e-20;
#endif
int mlAccuracy = 1; /* Rounds of optimization of branch lengths; 1 means do 2nd round only if close */
double closeLogLkLimit = 5.0; /* If partial optimization of an NNI looks like it would decrease the log likelihood
by this much or more then do not optimize it further */
double treeLogLkDelta = 0.1; /* Give up if tree log-lk changes by less than this; NNIs that change
likelihood by less than this also are considered unimportant
by some heuristics */
bool exactML = true; /* Exact or approximate posterior distributions for a.a.s */
double approxMLminf = 0.95; /* Only try to approximate posterior distributions if max. value is at least this high */
double approxMLminratio = 2/3.0;/* Ratio of approximated/true posterior values must be at least this high */
double approxMLnearT = 0.2; /* 2nd component of near-constant posterior distribution uses this time scale */
const int nDefaultRateCats = 20;
/* Performance and memory usage */
long profileOps = 0; /* Full profile-based distance operations */
long outprofileOps = 0; /* How many of profileOps are comparisons to outprofile */
long seqOps = 0; /* Faster leaf-based distance operations */
long profileAvgOps = 0; /* Number of profile-average steps */
long nHillBetter = 0; /* Number of hill-climbing steps */
long nCloseUsed = 0; /* Number of "close" neighbors we avoid full search for */
long nClose2Used = 0; /* Number of "close" neighbors we use 2nd-level top hits for */
long nRefreshTopHits = 0; /* Number of full-blown searches (interior nodes) */
long nVisibleUpdate = 0; /* Number of updates of the visible set */
long nNNI = 0; /* Number of NNI changes performed */
long nSPR = 0; /* Number of SPR changes performed */
long nML_NNI = 0; /* Number of max-lik. NNI changes performed */
long nSuboptimalSplits = 0; /* # of splits that are rejected given final tree (during bootstrap) */
long nSuboptimalConstrained = 0; /* Bad splits that are due to constraints */
long nConstraintViolations = 0; /* Number of constraint violations */
long nProfileFreqAlloc = 0;
long nProfileFreqAvoid = 0;
long szAllAlloc = 0;
long mymallocUsed = 0; /* useful allocations by mymalloc */
long maxmallocHeap = 0; /* Maximum of mi.arena+mi.hblkhd from mallinfo (actual mem usage) */
long nLkCompute = 0; /* # of likelihood computations for pairs of probability vectors */
long nPosteriorCompute = 0; /* # of computations of posterior probabilities */
long nAAPosteriorExact = 0; /* # of times compute exact AA posterior */
long nAAPosteriorRough = 0; /* # of times use rough approximation */
long nStarTests = 0; /* # of times we use star test to avoid testing an NNI */
/* Protein character set */
unsigned char *codesStringAA = (unsigned char*) "ARNDCQEGHILKMFPSTWYV";
unsigned char *codesStringNT = (unsigned char*) "ACGT";
unsigned char *codesString = NULL;
distance_matrix_t *ReadDistanceMatrix(char *prefix);
void SetupDistanceMatrix(/*IN/OUT*/distance_matrix_t *); /* set eigentot, codeFreq, gapFreq */
void ReadMatrix(char *filename, /*OUT*/numeric_t codes[MAXCODES][MAXCODES], bool check_codes);
void ReadVector(char *filename, /*OUT*/numeric_t codes[MAXCODES]);
alignment_t *ReadAlignment(/*READ*/FILE *fp, bool bQuote); /* Returns a list of strings (exits on failure) */
alignment_t *FreeAlignment(alignment_t *); /* returns NULL */
void FreeAlignmentSeqs(/*IN/OUT*/alignment_t *);
/* Takes as input the transpose of the matrix V, with i -> j
This routine takes care of setting the diagonals
*/
transition_matrix_t *CreateTransitionMatrix(/*IN*/double matrix[MAXCODES][MAXCODES],
/*IN*/double stat[MAXCODES]);
transition_matrix_t *CreateGTR(double *gtrrates/*ac,ag,at,cg,ct,gt*/, double *gtrfreq/*ACGT*/);
transition_matrix_t *ReadAATransitionMatrix(/*IN*/char *filename);
/* For converting profiles from 1 rotation to another, or converts NULL to NULL */
distance_matrix_t *TransMatToDistanceMat(transition_matrix_t *transmat);
/* Allocates memory, initializes leaf profiles */
NJ_t *InitNJ(char **sequences, int nSeqs, int nPos,
/*IN OPTIONAL*/char **constraintSeqs, int nConstraints,
/*IN OPTIONAL*/distance_matrix_t *,
/*IN OPTIONAL*/transition_matrix_t *);
NJ_t *FreeNJ(NJ_t *NJ); /* returns NULL */
void FastNJ(/*IN/OUT*/NJ_t *NJ); /* Does the joins */
void ReliabilityNJ(/*IN/OUT*/NJ_t *NJ, int nBootstrap); /* Estimates the reliability of the joins */
/* nni_stats_t is meaningless for leaves and root, so all of those entries
will just be high (for age) or 0 (for delta)
*/
typedef struct {
int age; /* number of rounds since this node was modified by an NNI */
int subtreeAge; /* number of rounds since self or descendent had a significant improvement */
double delta; /* improvement in score for this node (or 0 if no change) */
double support; /* improvement of score for self over better of alternatives */
} nni_stats_t;
/* One round of nearest-neighbor interchanges according to the
minimum-evolution or approximate maximum-likelihood criterion.
If doing maximum likelihood then this modifies the branch lengths.
age is the # of rounds since a node was NNId
Returns the # of topological changes performed
*/
int NNI(/*IN/OUT*/NJ_t *NJ, int iRound, int nRounds, bool useML,
/*IN/OUT*/nni_stats_t *stats,
/*OUT*/double *maxDeltaCriterion);
nni_stats_t *InitNNIStats(NJ_t *NJ);
nni_stats_t *FreeNNIStats(nni_stats_t *, NJ_t *NJ); /* returns NULL */
/* One round of subtree-prune-regraft moves (minimum evolution) */
void SPR(/*IN/OUT*/NJ_t *NJ, int maxSPRLength, int iRound, int nRounds);
/* Recomputes all branch lengths by minimum evolution criterion*/
void UpdateBranchLengths(/*IN/OUT*/NJ_t *NJ);
/* Recomputes all branch lengths and, optionally, internal profiles */
double TreeLength(/*IN/OUT*/NJ_t *NJ, bool recomputeProfiles);
typedef struct {
int nBadSplits;
int nConstraintViolations;
int nBadBoth;
int nSplits;
/* How much length would be reduce or likelihood would be increased by the
best NNI we find (the worst "miss") */
double dWorstDeltaUnconstrained;
double dWorstDeltaConstrained;
} SplitCount_t;
void TestSplitsMinEvo(NJ_t *NJ, /*OUT*/SplitCount_t *splitcount);
/* Sets SH-like support values if nBootstrap>0 */