forked from mothur/mothur
-
Notifications
You must be signed in to change notification settings - Fork 0
/
clearcut.cpp
2159 lines (1664 loc) · 49.7 KB
/
clearcut.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* clearcut.c
*
* $Id$
*
*****************************************************************************
*
* Copyright (c) 2004, Luke Sheneman
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* + Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* + Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* + The names of its contributors may not be used to endorse or promote
* products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*****************************************************************************
*
* An implementation of the Relaxed Neighbor-Joining algorithm
* of Evans, J., Sheneman, L., and Foster, J.
*
*
* AUTHOR:
*
* Luke Sheneman
*
*/
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
#include <float.h>
#include "distclearcut.h"
#include "dmat.h"
#include "fasta.h"
#include "cmdargs.h"
#include "common.h"
#include "clearcut.h"
#include "prng.h"
/*
* clearcut_main() -
*
* The entry point to the Clearcut program.
*
*/
int clearcut_main(int argc, char *argv[]) {

  /*
   * Parses the command line, obtains a distance matrix (read directly or
   * built from an alignment), then builds and writes nj_args->ntrees trees.
   *
   * Returns 0 on success.  Every failure path terminates the process via
   * exit(-1) after (where a message exists) reporting on stderr.
   */

  DMAT *dmat = NULL;         /* The working distance matrix */
  DMAT *dmat_backup = NULL;  /* Untouched copy used to reset dmat between trees */
  NJ_TREE *tree;             /* The phylogenetic tree */
  NJ_ARGS *nj_args;          /* Structure for holding command-line arguments */
  long int i;

  /* some variables for tracking time */
  struct timeval tv;
  unsigned long long startUs, endUs;

  /* check and parse supplied command-line arguments */
  nj_args = NJ_handle_args(argc, argv);
  if(!nj_args) {
    fprintf(stderr, "Clearcut: Error processing command-line arguments.\n");
    exit(-1);
  }

  /* for verbose reporting, print the random number seed to stdout */
  if(nj_args->verbose_flag) {
    printf("PRNG SEED: %d\n", nj_args->seed);
  }

  /* Initialize Mersenne Twister PRNG */
  init_genrand(nj_args->seed);

  switch(nj_args->input_mode) {

  /* If the input type is a distance matrix */
  case NJ_INPUT_MODE_DISTANCE:

    /* parse the distance matrix */
    dmat = NJ_parse_distance_matrix(nj_args);
    if(!dmat) {
      exit(-1);
    }

    break;

  /* If the input type is a multiple sequence alignment */
  case NJ_INPUT_MODE_ALIGNED_SEQUENCES:

    /* build a distance matrix from a multiple sequence alignment */
    dmat = NJ_build_distance_matrix(nj_args);
    if(!dmat) {
      fprintf(stderr, "Clearcut: Failed to build distance matrix from alignment.\n");
      exit(-1);
    }

    break;

  default:

    fprintf(stderr, "Clearcut: Could not determine how to process input\n");
    exit(-1);
  }

  /*
   * Output the computed distance matrix,
   * if the user specified one.
   */
  if(nj_args->matrixout) {
    NJ_output_matrix(nj_args, dmat);
  }

  /*
   * If we are going to generate multiple trees from
   * the same distance matrix, we need to make a backup
   * of the original distance matrix.
   */
  if(nj_args->ntrees > 1) {
    dmat_backup = NJ_dup_dmat(dmat);
    /* NOTE(review): assumes NJ_dup_dmat() signals failure with NULL -- confirm */
    if(!dmat_backup) {
      fprintf(stderr, "Clearcut: Failed to duplicate distance matrix.\n");
      exit(-1);
    }
  }

  /* process n trees */
  for(i=0;i<nj_args->ntrees;i++) {

    /*
     * If the user has specified matrix shuffling, we need
     * to randomize the distance matrix
     */
    if(nj_args->shuffle) {
      NJ_shuffle_distance_matrix(dmat);
    }

    /* RECORD THE PRECISE TIME OF THE START OF THE NEIGHBOR-JOINING */
    gettimeofday(&tv, NULL);
    startUs = ((unsigned long long) tv.tv_sec * 1000000ULL)
      + ((unsigned long long) tv.tv_usec);

    /*
     * Invoke either the Relaxed Neighbor-Joining algorithm (default)
     * or the "traditional" Neighbor-Joining algorithm
     */
    if(nj_args->neighbor) {
      tree = NJ_neighbor_joining(nj_args, dmat);
    } else {
      tree = NJ_relaxed_nj(nj_args, dmat);
    }

    if(!tree) {
      fprintf(stderr, "Clearcut: Failed to construct tree.\n");
      /* a failed build must not report success to the caller */
      exit(-1);
    }

    /* RECORD THE PRECISE TIME OF THE END OF THE NEIGHBOR-JOINING */
    gettimeofday(&tv, NULL);
    endUs = ((unsigned long long) tv.tv_sec * 1000000ULL)
      + ((unsigned long long) tv.tv_usec);

    /* print the time taken to perform the neighbor join */
    if(nj_args->verbose_flag) {
      if(nj_args->neighbor) {
        fprintf(stderr, "NJ tree built in %llu.%06llu secs\n",
                (endUs - startUs) / 1000000ULL,
                (endUs - startUs) % 1000000ULL);
      } else {
        fprintf(stderr, "RNJ tree built in %llu.%06llu secs\n",
                (endUs - startUs) / 1000000ULL,
                (endUs - startUs) % 1000000ULL);
      }
    }

    /* Output the neighbor joining tree here */
    NJ_output_tree(nj_args, tree, dmat, i);

    NJ_free_tree(tree);  /* Free the tree */
    NJ_free_dmat(dmat);  /* Free the working distance matrix */
    dmat = NULL;         /* nothing is owned until it is re-duplicated below */

    /*
     * If we need to do another iteration, lets re-initialize
     * our working distance matrix.
     */
    if(nj_args->ntrees > 1 && i<(nj_args->ntrees-1) ) {
      dmat = NJ_dup_dmat(dmat_backup);
      if(!dmat) {
        fprintf(stderr, "Clearcut: Failed to duplicate distance matrix.\n");
        exit(-1);
      }
    }
  }

  /*
   * The working matrix is still owned here only when the loop above
   * never ran (ntrees < 1); release it so it does not leak.
   */
  if(dmat) {
    NJ_free_dmat(dmat);
  }

  /* Free the backup distance matrix */
  if(nj_args->ntrees > 1) {
    NJ_free_dmat(dmat_backup);
  }

  /* If verbosity, describe where the tree output is */
  if(nj_args->verbose_flag) {
    if(nj_args->neighbor) {
      printf("NJ tree(s) in %s\n", nj_args->outfilename);
    } else {
      printf("Relaxed NJ tree(s) in %s\n", nj_args->outfilename);
    }
  }

  return 0;
}
/*
* NJ_find_hmin() - Find minimum transformed values along horizontal
*
*
* INPUTS:
* -------
* dmat -- The distance matrix
* a -- The index of the specific taxon in the distance matrix
*
* RETURNS:
* --------
* <float> -- The value of the selected minimum
* min -- Used to transport the index of the minima out
* of the function (by reference)
* hmincount -- Return the number of minima along the horizontal
* (by reference)
*
*
* DESCRIPTION:
* ------------
*
* A fast, inline function to find the smallest transformed value
* along the "horizontal" portion of an entry in a distance matrix.
*
* Distance matrices are stored internally as contiguously-allocated
* upper-diagonal structures. With the exception of the taxa at
* row 0 of this upper-diagonal matrix, all taxa have both a horizontal
* and vertical component in the distance matrix. This function
* scans the horizontal portion of the entry in the distance matrix
* for the specified taxon and finds the minimum transformed value
* along that horizontal component.
*
* Since multiple minima can exist along the horizontal portion
* of the entry, I consider all minima and break ties
* stochastically to help avoid systematic bias.
*
* Just searching along the horizontal portion of a row is very fast
* since the data is stored linearly and contiguously in memory and
* cache locality is exploited in the distance matrix representation.
*
* Look at nj.h for more information on how the distance matrix
* is architected.
*
*/
static inline
float
NJ_find_hmin(DMAT *dmat,
             long int a,
             long int *min,
             long int *hmincount) {

  /*
   * Scans the transformed distances val[a][i] - (r2[a] + r2[i]) for
   * i in (a, size) and returns the smallest one.  *min receives the
   * column index chosen (0 if the row has no horizontal part) and
   * *hmincount receives how many approximately-equal minima were seen.
   *
   * size and mindex are long int so that they match the loop index,
   * the out-parameter type and NJ_find_vmin(); narrower types would
   * silently truncate indices taken from dmat->size.
   */

  long int i;              /* index variable for looping */
  long int size;           /* current size of distance matrix */
  long int mindex = 0;     /* holds the current index to the chosen minimum */
  float curval;            /* used to hold current transformed values */
  float hmin;              /* the value of the transformed minimum */
  float *ptr, *r2, *val;   /* pointers used to reduce dereferencing in inner loop */

  /* values used for stochastic selection among multiple minima */
  float p, x;
  long int smallcnt;

  /* initialize the min to something large */
  hmin = (float)HUGE_VAL;

  /* setup some pointers to limit dereferencing later */
  r2 = dmat->r2;
  val = dmat->val;
  size = dmat->size;

  /* no minima have been seen yet */
  smallcnt = 0;

  ptr = &(val[NJ_MAP(a, a+1, size)]);  /* at the start of the horiz. part */
  for(i=a+1;i<size;i++) {

    curval = *(ptr++) - (r2[a] + r2[i]);  /* compute transformed distance */

    if(NJ_FLT_EQ(curval, hmin)) {  /* approx. equal */

      smallcnt++;

      /*
       * Keep the k-th tied minimum with probability 1/k so that each
       * of the ties seen so far is equally likely to be the survivor.
       */
      p = 1.0/(float)smallcnt;
      x = genrand_real2();

      if( x < p ) {
        mindex = i;
      }

    } else if (curval < hmin) {

      /* strictly smaller value: restart the tie count */
      smallcnt = 1;
      hmin = curval;
      mindex = i;
    }
  }

  /* save off the minimum index to be returned via reference */
  *min = mindex;

  /* save off the number of minima */
  *hmincount = smallcnt;

  /* return the value of the smallest transformed distance */
  return(hmin);
}
/*
* NJ_find_vmin() - Find minimum transformed distance along vertical
*
*
* INPUTS:
* -------
* dmat -- The distance matrix
* a -- The index of the specific taxon in the distance matrix
*
*
* RETURNS:
* --------
* <float> -- The value of the selected minimum
* min -- Used to transport the index of the minima out
* of the function (by reference)
* vmincount -- The number of minima along the vertical
* return by reference.
*
* DESCRIPTION:
* ------------
*
* A fast, inline function to find the smallest transformed value
* along the "vertical" portion of an entry in a distance matrix.
*
* Distance matrices are stored internally as contiguously-allocated
* upper-diagonal matrices. With the exception of the taxa at
* row 0 of this upper-diagonal matrix, all taxa have both a horizontal
* and vertical component in the distance matrix. This function
* scans the vertical portion of the entry in the distance matrix
* for the specified taxon and finds the minimum transformed value
* along that vertical component.
*
* Since multiple minima can exist along the vertical portion
* of the entry, I consider all minima and break ties
* stochastically to help avoid systematic bias.
*
* Due to cache locality reasons, searching along the vertical
* component is going to be considerably slower than searching
* along the horizontal.
*
* Look at nj.h for more information on how the distance matrix
* is architected.
*
*/
static inline
float
NJ_find_vmin(DMAT *dmat,
             long int a,
             long int *min,
             long int *vmincount) {

  /*
   * Walks down column a (rows 0 .. a-1), evaluating the transformed
   * distance val[row][a] - (r2[row] + r2[a]) at each step, and returns
   * the smallest value met.  The winning row goes out through *min and
   * the number of approximately-equal minima through *vmincount.
   */

  const long int n = dmat->size;    /* matrix dimension */
  float *scaled_r  = dmat->r2;      /* scaled r vector */
  float *cell;                      /* current element of column a */
  float best       = (float)HUGE_VAL;
  float candidate;                  /* transformed distance under test */
  float keep_prob, draw;            /* stochastic tie-breaking */
  long int best_row = 0;
  long int ties     = 0;
  long int row      = 0;

  /* column a begins in row 0 */
  cell = &(dmat->val[NJ_MAP(0, a, n)]);

  while(row < a) {

    candidate = *cell - (scaled_r[row] + scaled_r[a]);

    if(NJ_FLT_EQ(candidate, best)) {

      /* another tie: it replaces the incumbent with probability 1/ties */
      ties++;
      keep_prob = 1.0/(float)ties;
      draw = genrand_real2();

      if(draw < keep_prob) {
        best_row = row;
      }

    } else if(candidate < best) {

      /* a strictly better value resets the tie count */
      ties     = 1;
      best     = candidate;
      best_row = row;
    }

    /* the same column one row further down is n-row-1 elements ahead */
    cell += n-row-1;
    row++;
  }

  *min       = best_row;
  *vmincount = ties;

  return(best);
}
/*
* NJ_permute() - Generate random permutation using the provably unbiased
* Fisher-Yates Shuffle.
*
* INPUTS:
* -------
* perm -- A pointer to the array of long ints which will be filled.
* size -- the length of the permutation vector
*
*
* OUTPUTS:
* -------
* NONE
*
*
* DESCRIPTION:
* ------------
*
* Fill perm with a random permutation of the integers 0 through size-1.
* This is accomplished by initializing the permutation
* as an ordered list of integers and then iterating
* through and swapping two values selected according to the
* Fisher-Yates method.
*
* This unbiased method for random permutation generation is
* discussed in:
*
* Donald E. Knuth, The Art of Computer Programming,
* Addison-Wesley, Volumes 1, 2, and 3, 3rd edition, 1998
*
*/
static inline
void
NJ_permute(long int *perm,
           long int size) {

  /*
   * Writes the identity sequence 0 .. size-1 into perm and then, for
   * each position in turn, swaps it with a position drawn from the
   * not-yet-fixed tail (which includes the position itself).
   * Terminates the process if perm is NULL.
   */

  long int pos;    /* position currently being fixed */
  long int pick;   /* position chosen to trade places with pos */
  long int held;   /* scratch value for the exchange */

  if(perm == NULL) {
    fprintf(stderr, "Clearcut: NULL permutation pointer in NJ_permute()\n");
    exit(-1);
  }

  /* start from the ordered list */
  pos = 0;
  while(pos < size) {
    perm[pos] = pos;
    pos++;
  }

  for(pos = 0; pos < size; pos++) {

    /* one draw is made for every position, including the last */
    pick = pos + NJ_genrand_int31_top(size-pos);

    if(pick == pos) {
      continue;   /* element stays where it is */
    }

    held       = perm[pick];
    perm[pick] = perm[pos];
    perm[pos]  = held;
  }
}
/*
* NJ_compute_r() - Compute post-join changes to r-vector. In this
* case, we decrement all of the accumulated distances
* in the r-vector for the two nodes that were
* recently joined (i.e. a, b)
*
* INPUTS:
* -------
* dmat -- The distance matrix
* a -- The index of one of the taxa that were joined
* b -- The index of the other taxa that was joined
*
* RETURNS:
* --------
* NONE
*
* DESCRIPTION:
* ------------
*
* This vector of floats is used as a summary of overall distances from
* each entry in the distance matrix to every other entry. These values
* are then used when computing the transformed distances from which
* decisions concerning joining are made.
*
* For speed, we don't recompute r from scratch. Instead, we decrement
* all entries in r by the appropriate amount. That is, r[i] -= dist(i, a)
* and r[i] -= dist(i, b).
*
* As a speed optimization, I process the rows altogether for cache locality
* purposes, and then process columns.
*
* The processing of the scaled r matrix (r2) is handled on-the-fly elsewhere.
*
*/
static inline
void
NJ_compute_r(DMAT *dmat,
             long int a,
             long int b) {

  /*
   * Subtracts from dmat->r the distances held in the rows and columns
   * of a and b.  Entries past a lose the row-a distance (and, past b,
   * the row-b distance); entries before b lose the column-b distance
   * (and, before a, the column-a distance).
   */

  const long int n = dmat->size;   /* matrix dimension */
  float *cells     = dmat->val;    /* packed distance matrix */
  float *sums      = dmat->r;      /* r vector being adjusted */
  float *from_a;                   /* walks the row / column of a */
  float *from_b;                   /* walks the row / column of b */
  long int k;

  /*
   * Horizontal pass: both rows are contiguous, so each pointer simply
   * steps one element at a time.
   */
  from_a = &(cells[NJ_MAP(a, a+1, n)]);
  from_b = &(cells[NJ_MAP(b, b+1, n)]);

  for(k = a+1; k < n; k++) {

    sums[k] -= *from_a;
    from_a++;

    /* row b only has entries for columns beyond b */
    if(k > b) {
      sums[k] -= *from_b;
      from_b++;
    }
  }

  /*
   * Vertical pass: going down a column, the next row's element is
   * n-k-1 positions further on.
   */
  from_a = &(cells[NJ_MAP(0, a, n)]);
  from_b = &(cells[NJ_MAP(0, b, n)]);

  for(k = 0; k < b; k++) {

    const long int stride = n-k-1;

    /* column a only has entries for rows above a */
    if(k < a) {
      sums[k] -= *from_a;
      from_a += stride;
    }

    sums[k] -= *from_b;
    from_b += stride;
  }
}
/*
* NJ_check_additivity() - Check that additivity is preserved by a join
*
*
* INPUTS:
* -------
* dmat -- distance matrix
* a -- index into dmat for one of the rows to be joined
* b -- index into dmat for another row to be joined
*
* OUTPUTS:
* --------
* int 1 if join adheres to additivity constraint
* 0 if join breaks additivity
*
* DESCRIPTION:
* ------------
*
* Here we perform the check to make sure that by joining a and b we do not
* also break consistency (i.e. preserves additivity) with the distances between
* the taxa in the new clade and other nodes in the tree. This is done quite
* efficiently by looking up the untransformed distance between node b and
* some other "target" taxa in the distance matrix (which is not a nor b) and
* comparing that distance to the distance computed by finding the distance
* from node a to the proposed internal node "x" which joins (a,b).
*
* If dist(b, x) + dist(x, target) == dist(b, target) then additivity is
* preserved, otherwise, additivity is not preserved. If we are in
* additivity mode, this join should be rejected.
*
*/
static inline
int
NJ_check_additivity(DMAT *dmat,
                    long int a,
                    long int b) {

  /*
   * Picks a third taxon ("target") distinct from a and b and tests
   * whether dist(b, target) equals dist(b, x) + dist(x, target), where
   * x is the proposed internal node joining a and b.
   *
   * Returns 1 when the two values are approximately equal (NJ_FLT_EQ),
   * 0 otherwise.
   *
   * The (a, b) lookup below passes a as the row and b as the column,
   * so callers are presumably expected to supply a < b.
   */

  float a2clade, b2clade;     /* distances from a and b to x */
  float clade_dist;           /* distance from x to the target */
  float a2target, b2target;   /* stored distances from a and b to target */
  long int target;
  long int size = dmat->size;

  /* determine target taxon here */
  if(b == size-1) {

    /* if we can't do a row here, lets do a column */
    if(a==0) {
      if(b==1) {
        /*
         * NOTE(review): b==1 together with b==size-1 means size==2, so
         * target 2 is not a valid index.  Presumably never reached with
         * so few taxa -- confirm against the callers.
         */
        target = 2;
      } else {
        target = 1;
      }
    } else {
      target = 0;
    }
  } else {
    target = b+1;
  }

  /* distance between a and the root of clade (a,b) */
  a2clade =
    ( (dmat->val[NJ_MAP(a, b, size)]) +
      (dmat->r2[a] - dmat->r2[b]) ) / 2.0;

  /* distance between b and the root of clade (a,b) */
  b2clade =
    ( (dmat->val[NJ_MAP(a, b, size)]) +
      (dmat->r2[b] - dmat->r2[a]) ) / 2.0;

  /*
   * Look up the stored distances from a and b to the target.  Each
   * NJ_MAP() call is given the smaller index as the row and the larger
   * as the column, which is how every other lookup in this file
   * addresses the upper-diagonal matrix.
   */
  if(b<target) {

    /* a < b < target */
    a2target = dmat->val[NJ_MAP(a, target, size)];
    b2target = dmat->val[NJ_MAP(b, target, size)];

  } else {

    /*
     * target < b, but target may lie on either side of a: with a==0
     * the target chosen above is 1, which is greater than a.
     */
    if(a<target) {
      a2target = dmat->val[NJ_MAP(a, target, size)];
    } else {
      a2target = dmat->val[NJ_MAP(target, a, size)];
    }

    b2target = dmat->val[NJ_MAP(target, b, size)];
  }

  /* compute the distance from the clade root to the target */
  clade_dist =
    ( (a2target - a2clade) +
      (b2target - b2clade) ) / 2.0;

  /*
   * Check to see that distance from clade root to target + distance from
   * b to clade root are equal to the distance from b to the target
   */
  if(NJ_FLT_EQ(b2target, (clade_dist + b2clade))) {
    return(1);  /* join is legitimate */
  } else {
    return(0);  /* join is illegitimate */
  }
}
/*
* NJ_check() - Check to see if two taxa can be joined
*
* INPUTS:
* -------
* nj_args -- Pointer to the data structure holding command-line args
* dmat -- distance matrix
* a -- index into dmat for one of the rows to be joined
* b -- index into dmat for another row to be joined
* min -- the minimum value found
* additivity -- a flag (0 = not additive mode, 1 = additive mode)
*
* OUTPUTS:
* --------
* int 1 if join is okay
* 0 if join is not okay
*
* DESCRIPTION:
* ------------
*
* This function ultimately takes two rows and makes sure that the
* intersection of those two rows, which has a transformed distance of
* "min", is actually the smallest (or equal to the smallest)
* transformed distance for both rows (a, b). If so, it returns
* 1, else it returns 0.
*
* Basically, we want to join two rows only if the minimum
* transformed distance on either row is at the intersection of
* those two rows.
*
*/
static inline
int
NJ_check(NJ_ARGS *nj_args,
         DMAT *dmat,
         long int a,
         long int b,
         float min,
         int additivity) {

  /*
   * Decides whether a and b may be joined.  Returns 0 as soon as the
   * additivity test fails (when requested) or any scanned transformed
   * distance is below min according to NJ_FLT_LT; returns 1 if every
   * scan completes.
   */

  const long int n = dmat->size;   /* matrix dimension */
  float *cells     = dmat->val;    /* packed distance matrix */
  float *scaled_r  = dmat->r2;     /* scaled r vector */
  float *cur;                      /* element currently examined */
  long int k;

  /* in additive mode the join must also keep distances consistent */
  if(additivity && !NJ_check_additivity(dmat, a, b)) {
    return(0);
  }

  /* row b, columns b+1 .. n-1 (contiguous in memory) */
  cur = &(cells[NJ_MAP(b, b+1, n)]);
  k = b+1;
  while(k < n) {
    if( NJ_FLT_LT( (*cur - (scaled_r[b] + scaled_r[k])), min) ) {
      return(0);
    }
    cur++;
    k++;
  }

  /*
   * column a, rows 0 .. a-1 -- only scanned when nj_args->norandom is
   * set; the original author notes the random-join path has already
   * checked it
   */
  if(nj_args->norandom) {
    cur = cells + a;
    for(k = 0; k < a; k++) {
      if( NJ_FLT_LT( (*cur - (scaled_r[k] + scaled_r[a])), min) ) {
        return(0);
      }
      cur += n-k-1;   /* same column, next row down */
    }
  }

  /* column b, rows 0 .. b-1, ignoring the (a, b) intersection itself */
  cur = cells + b;
  for(k = 0; k < b; k++) {
    if( k != a && NJ_FLT_LT( (*cur - (scaled_r[k] + scaled_r[b])), min) ) {
      return(0);
    }
    cur += n-k-1;
  }

  return(1);
}
/*
* NJ_collapse() - Collapse the distance matrix by removing
* rows a and b from the distance matrix and
* replacing them with a single new row which
* represents the internal node joining a and b
*
*
* INPUTS:
* -------
* dmat -- A pointer to the distance matrix
* vertex -- A pointer to the vertex vector (vector of tree nodes)
* which is used in constructing the tree
* a -- An index to a row in the distance matrix from which we
* joined. This row will be collapsed.
* b -- An index to a row in the distance matrix from which we
* joined. This row will be collapsed.
*
* RETURNS:
* --------
* NONE
*
*
* DESCRIPTION:
* ------------
*
* This function collapses the distance matrix in a way which optimizes
* cache locality and ultimately gives us a speed improvement due to
* cache. At this point, we've decided to join rows a and b from
* the distance matrix. We will remove rows a and b from the distance
* matrix and replace them with a new row which represents the internal
* node which joins rows a and b together.
*
* We always keep the matrix as compact as possible in order to
* get good performance from our cache in subsequent operations. Cache
* is the key to good performance here.
*
* Key Steps:
* ----------
*
* 1) Fill the "a" row with the new distances of the internal node
* joining a and b to all other rows.
* 2) Copy row 0 into what was row b
* 3) Increment the pointer to the start of the distance matrix
* by one row.
* 4) Decrement the size of the matrix by one row.
* 5) Do roughly the same thing to the r vector in order to
* keep it in sync with the distance matrix.
* 6) Compute the scaled r vector (r2) based on the updated
* r vector
*
* This keeps the distance matrix as compact as possible in memory, and
* is a relatively fast operation.
*
* This function requires that a < b
*
*/
static inline
void
NJ_collapse(DMAT *dmat,
NJ_VERTEX *vertex,
long int a,
long int b) {
long int i; /* index used for looping */
long int size; /* size of dmat --> reduce pointer dereferencing */
float a2clade; /* distance from a to the new node that joins a and b */
float b2clade; /* distance from b to the new node that joins a and b */
float cval; /* stores distance information during loop */
float *vptr; /* pointer to elements in first row of dist matrix */
float *ptra; /* pointer to elements in row a of distance matrix */
float *ptrb; /* pointer to elements in row b of distance matrix */
float *val, *r, *r2; /* simply used to limit pointer dereferencing */
/* We must assume that a < b */
if(a >= b) {
fprintf(stderr, "Clearcut: (a<b) constraint check failed in NJ_collapse()\n");
exit(0);
}
/* some shortcuts to help limit dereferencing */
val = dmat->val;
r = dmat->r;
r2 = dmat->r2;
size = dmat->size;
/* compute the distance from the clade components (a, b) to the new node */
a2clade =
( (val[NJ_MAP(a, b, size)]) + (dmat->r2[a] - dmat->r2[b]) ) / 2.0;
b2clade =
( (val[NJ_MAP(a, b, size)]) + (dmat->r2[b] - dmat->r2[a]) ) / 2.0;
r[a] = 0.0; /* we are removing row a, so clear dist. in r */
/*
* Fill the horizontal part of the "a" row and finish computing r and r2
* we handle the horizontal component first to maximize cache locality
*/
ptra = &(val[NJ_MAP(a, a+1, size)]); /* start ptra at the horiz. of a */
ptrb = &(val[NJ_MAP(a+1, b, size)]); /* start ptrb at comparable place */
for(i=a+1;i<size;i++) {
/*
* Compute distance from new internal node to others in
* the distance matrix.
*/
cval =
( (*ptra - a2clade) +
(*ptrb - b2clade) ) / 2.0;
/* incr. row b pointer differently depending on where i is in loop */
if(i<b) {
ptrb += size-i-1; /* traverse vertically by incrementing by row */
} else {
ptrb++; /* traverse horiz. by incrementing by column */
}
/* assign the newly computed distance and increment a ptr by a column */
*(ptra++) = cval;
/* accumulate the distance onto the r vector */
r[a] += cval;
r[i] += cval;
/* scale r2 on the fly here */
r2[i] = r[i]/(float)(size-3);
}
/* fill the vertical part of the "a" column and finish computing r and r2 */
ptra = val + a; /* start at the top of the columb for "a" */
ptrb = val + b; /* start at the top of the columb for "b" */
for(i=0;i<a;i++) {
/*
* Compute distance from new internal node to others in
* the distance matrix.
*/
cval =
( (*ptra - a2clade) +
(*ptrb - b2clade) ) / 2.0;