/* fastcluster: Fast hierarchical clustering routines for R and Python Copyright © 2011 Daniel Müllner This library implements various fast algorithms for hierarchical, agglomerative clustering methods: (1) Algorithms for the "stored matrix approach": the input is the array of pairwise dissimilarities. MST_linkage_core: single linkage clustering with the "minimum spanning tree algorithm (Rohlfs) NN_chain_core: nearest-neighbor-chain algorithm, suitable for single, complete, average, weighted and Ward linkage (Murtagh) generic_linkage: generic algorithm, suitable for all distance update formulas (Müllner) (2) Algorithms for the "stored data approach": the input are points in a vector space. MST_linkage_core_vector: single linkage clustering for vector data generic_linkage_vector: generic algorithm for vector data, suitable for the Ward, centroid and median methods. generic_linkage_vector_alternative: alternative scheme for updating the nearest neighbors. This method seems faster than "generic_linkage_vector" for the centroid and median methods but slower for the Ward method. All these implementation treat infinity values correctly. They throw an exception if a NaN distance value occurs. */ // Older versions of Microsoft Visual Studio do not have the fenv header. #ifdef _MSC_VER #if (_MSC_VER == 1500 || _MSC_VER == 1600) #define NO_INCLUDE_FENV #endif #endif // NaN detection via fenv might not work on systems with software // floating-point emulation (bug report for Debian armel). #ifdef __SOFTFP__ #define NO_INCLUDE_FENV #endif #ifdef NO_INCLUDE_FENV #pragma message("Do not use fenv header.") #else //#pragma message("Use fenv header. If there is a warning about unknown #pragma STDC FENV_ACCESS, this can be ignored.") //#pragma STDC FENV_ACCESS on #include #endif #include // for std::pow, std::sqrt #include // for std::ptrdiff_t #include // for std::numeric_limits<...>::infinity() #include // for std::fill_n #include // for std::runtime_error #include // for std::string #include // also for DBL_MAX, DBL_MIN #ifndef DBL_MANT_DIG #error The constant DBL_MANT_DIG could not be defined. #endif #define T_FLOAT_MANT_DIG DBL_MANT_DIG #ifndef LONG_MAX #include #endif #ifndef LONG_MAX #error The constant LONG_MAX could not be defined. #endif #ifndef INT_MAX #error The constant INT_MAX could not be defined. #endif #ifndef INT32_MAX #ifdef _MSC_VER #if _MSC_VER >= 1600 #define __STDC_LIMIT_MACROS #include #else typedef __int32 int_fast32_t; typedef __int64 int64_t; #endif #else #define __STDC_LIMIT_MACROS #include #endif #endif #define FILL_N std::fill_n #ifdef _MSC_VER #if _MSC_VER < 1600 #undef FILL_N #define FILL_N stdext::unchecked_fill_n #endif #endif // Suppress warnings about (potentially) uninitialized variables. #ifdef _MSC_VER #pragma warning (disable:4700) #endif #ifndef HAVE_DIAGNOSTIC #if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6)) #define HAVE_DIAGNOSTIC 1 #endif #endif #ifndef HAVE_VISIBILITY #if __GNUC__ >= 4 #define HAVE_VISIBILITY 1 #endif #endif /* Since the public interface is given by the Python respectively R interface, * we do not want other symbols than the interface initalization routines to be * visible in the shared object file. The "visibility" switch is a GCC concept. * Hiding symbols keeps the relocation table small and decreases startup time. * See http://gcc.gnu.org/wiki/Visibility */ #if HAVE_VISIBILITY #pragma GCC visibility push(hidden) #endif typedef int_fast32_t t_index; #ifndef INT32_MAX #define MAX_INDEX 0x7fffffffL #else #define MAX_INDEX INT32_MAX #endif #if (LONG_MAX < MAX_INDEX) #error The integer format "t_index" must not have a greater range than "long int". #endif #if (INT_MAX > MAX_INDEX) #error The integer format "int" must not have a greater range than "t_index". #endif typedef double t_float; /* Method codes. These codes must agree with the METHODS array in fastcluster.R and the dictionary mthidx in fastcluster.py. */ enum method_codes { // non-Euclidean methods METHOD_METR_SINGLE = 0, METHOD_METR_COMPLETE = 1, METHOD_METR_AVERAGE = 2, METHOD_METR_WEIGHTED = 3, METHOD_METR_WARD = 4, METHOD_METR_WARD_D = METHOD_METR_WARD, METHOD_METR_CENTROID = 5, METHOD_METR_MEDIAN = 6, METHOD_METR_WARD_D2 = 7, MIN_METHOD_CODE = 0, MAX_METHOD_CODE = 7 }; enum method_codes_vector { // Euclidean methods METHOD_VECTOR_SINGLE = 0, METHOD_VECTOR_WARD = 1, METHOD_VECTOR_CENTROID = 2, METHOD_VECTOR_MEDIAN = 3, MIN_METHOD_VECTOR_CODE = 0, MAX_METHOD_VECTOR_CODE = 3 }; // self-destructing array pointer template class auto_array_ptr{ private: type * ptr; auto_array_ptr(auto_array_ptr const &); // non construction-copyable auto_array_ptr& operator=(auto_array_ptr const &); // non copyable public: auto_array_ptr() : ptr(NULL) { } template auto_array_ptr(index const size) : ptr(new type[size]) { } template auto_array_ptr(index const size, value const val) : ptr(new type[size]) { FILL_N(ptr, size, val); } ~auto_array_ptr() { delete [] ptr; } void free() { delete [] ptr; ptr = NULL; } template void init(index const size) { ptr = new type [size]; } template void init(index const size, value const val) { init(size); FILL_N(ptr, size, val); } inline operator type *() const { return ptr; } }; struct node { t_index node1, node2; t_float dist; }; inline bool operator< (const node a, const node b) { return (a.dist < b.dist); } class cluster_result { private: auto_array_ptr Z; t_index pos; public: cluster_result(const t_index size) : Z(size) , pos(0) {} void append(const t_index node1, const t_index node2, const t_float dist) { Z[pos].node1 = node1; Z[pos].node2 = node2; Z[pos].dist = dist; ++pos; } node * operator[] (const t_index idx) const { return Z + idx; } /* Define several methods to postprocess the distances. All these functions are monotone, so they do not change the sorted order of distances. */ void sqrt() const { for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist = std::sqrt(ZZ->dist); } } void sqrt(const t_float) const { // ignore the argument sqrt(); } void sqrtdouble(const t_float) const { // ignore the argument for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist = std::sqrt(2*ZZ->dist); } } #ifdef R_pow #define my_pow R_pow #else #define my_pow std::pow #endif void power(const t_float p) const { t_float const q = 1/p; for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist = my_pow(ZZ->dist,q); } } void plusone(const t_float) const { // ignore the argument for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist += 1; } } void divide(const t_float denom) const { for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist /= denom; } } }; class doubly_linked_list { /* Class for a doubly linked list. Initially, the list is the integer range [0, size]. We provide a forward iterator and a method to delete an index from the list. Typical use: for (i=L.start; L succ; private: auto_array_ptr pred; // Not necessarily private, we just do not need it in this instance. public: doubly_linked_list(const t_index size) // Initialize to the given size. : start(0) , succ(size+1) , pred(size+1) { for (t_index i=0; i(2*N-3-(r_))*(r_)>>1)+(c_)-1] ) // Z is an ((N-1)x4)-array #define Z_(_r, _c) (Z[(_r)*4 + (_c)]) /* Lookup function for a union-find data structure. The function finds the root of idx by going iteratively through all parent elements until a root is found. An element i is a root if nodes[i] is zero. To make subsequent searches faster, the entry for idx and all its parents is updated with the root element. */ class union_find { private: auto_array_ptr parent; t_index nextparent; public: union_find(const t_index size) : parent(size>0 ? 2*size-1 : 0, 0) , nextparent(size) { } t_index Find (t_index idx) const { if (parent[idx] != 0 ) { // a → b t_index p = idx; idx = parent[idx]; if (parent[idx] != 0 ) { // a → b → c do { idx = parent[idx]; } while (parent[idx] != 0); do { t_index tmp = parent[p]; parent[p] = idx; p = tmp; } while (parent[p] != idx); } } return idx; } void Union (const t_index node1, const t_index node2) { parent[node1] = parent[node2] = nextparent++; } }; class nan_error{}; #ifdef FE_INVALID class fenv_error{}; #endif static void MST_linkage_core(const t_index N, const t_float * const D, cluster_result & Z2) { /* N: integer, number of data points D: condensed distance matrix N*(N-1)/2 Z2: output data structure The basis of this algorithm is an algorithm by Rohlf: F. James Rohlf, Hierarchical clustering using the minimum spanning tree, The Computer Journal, vol. 16, 1973, p. 93–95. */ t_index i; t_index idx2; doubly_linked_list active_nodes(N); auto_array_ptr d(N); t_index prev_node; t_float min; // first iteration idx2 = 1; min = std::numeric_limits::infinity(); for (i=1; i tmp) d[i] = tmp; else if (fc_isnan(tmp)) throw (nan_error()); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (d[i] < min) { min = d[i]; idx2 = i; } } Z2.append(prev_node, idx2, min); } } /* Functions for the update of the dissimilarity array */ inline static void f_single( t_float * const b, const t_float a ) { if (*b > a) *b = a; } inline static void f_complete( t_float * const b, const t_float a ) { if (*b < a) *b = a; } inline static void f_average( t_float * const b, const t_float a, const t_float s, const t_float t) { *b = s*a + t*(*b); #ifndef FE_INVALID #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } inline static void f_weighted( t_float * const b, const t_float a) { *b = (a+*b)*.5; #ifndef FE_INVALID #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } inline static void f_ward( t_float * const b, const t_float a, const t_float c, const t_float s, const t_float t, const t_float v) { *b = ( (v+s)*a - v*c + (v+t)*(*b) ) / (s+t+v); //*b = a+(*b)-(t*a+s*(*b)+v*c)/(s+t+v); #ifndef FE_INVALID #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } inline static void f_centroid( t_float * const b, const t_float a, const t_float stc, const t_float s, const t_float t) { *b = s*a - stc + t*(*b); #ifndef FE_INVALID if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } inline static void f_median( t_float * const b, const t_float a, const t_float c_4) { *b = (a+(*b))*.5 - c_4; #ifndef FE_INVALID #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } template static void NN_chain_core(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) { /* N: integer D: condensed distance matrix N*(N-1)/2 Z2: output data structure This is the NN-chain algorithm, described on page 86 in the following book: Fionn Murtagh, Multidimensional Clustering Algorithms, Vienna, Würzburg: Physica-Verlag, 1985. */ t_index i; auto_array_ptr NN_chain(N); t_index NN_chain_tip = 0; t_index idx1, idx2; t_float size1, size2; doubly_linked_list active_nodes(N); t_float min; for (t_float const * DD=D; DD!=D+(static_cast(N)*(N-1)>>1); ++DD) { #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*DD)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } #ifdef FE_INVALID if (feclearexcept(FE_INVALID)) throw fenv_error(); #endif for (t_index j=0; jidx2) { t_index tmp = idx1; idx1 = idx2; idx2 = tmp; } if (method==METHOD_METR_AVERAGE || method==METHOD_METR_WARD) { size1 = static_cast(members[idx1]); size2 = static_cast(members[idx2]); members[idx2] += members[idx1]; } // Remove the smaller index from the valid indices (active_nodes). active_nodes.remove(idx1); switch (method) { case METHOD_METR_SINGLE: /* Single linkage. Characteristic: new distances are never longer than the old distances. */ // Update the distance matrix in the range [start, idx1). for (i=active_nodes.start; i(members[i]); for (i=active_nodes.start; i(members[i]) ); // Update the distance matrix in the range (idx1, idx2). for (; i(members[i]) ); // Update the distance matrix in the range (idx2, N). for (i=active_nodes.succ[idx2]; i(members[i]) ); break; default: throw std::runtime_error(std::string("Invalid method.")); } } #ifdef FE_INVALID if (fetestexcept(FE_INVALID)) throw fenv_error(); #endif } class binary_min_heap { /* Class for a binary min-heap. The data resides in an array A. The elements of A are not changed but two lists I and R of indices are generated which point to elements of A and backwards. The heap tree structure is H[2*i+1] H[2*i+2] \ / \ / ≤ ≤ \ / \ / H[i] where the children must be less or equal than their parent. Thus, H[0] contains the minimum. The lists I and R are made such that H[i] = A[I[i]] and R[I[i]] = i. This implementation is not designed to handle NaN values. */ private: t_float * const A; t_index size; auto_array_ptr I; auto_array_ptr R; // no default constructor binary_min_heap(); // noncopyable binary_min_heap(binary_min_heap const &); binary_min_heap & operator=(binary_min_heap const &); public: binary_min_heap(t_float * const A_, const t_index size_) : A(A_), size(size_), I(size), R(size) { // Allocate memory and initialize the lists I and R to the identity. This // does not make it a heap. Call heapify afterwards! for (t_index i=0; i>1); idx>0; ) { --idx; update_geq_(idx); } } inline t_index argmin() const { // Return the minimal element. return I[0]; } void heap_pop() { // Remove the minimal element from the heap. --size; I[0] = I[size]; R[I[0]] = 0; update_geq_(0); } void remove(t_index idx) { // Remove an element from the heap. --size; R[I[size]] = R[idx]; I[R[idx]] = I[size]; if ( H(size)<=A[idx] ) { update_leq_(R[idx]); } else { update_geq_(R[idx]); } } void replace ( const t_index idxold, const t_index idxnew, const t_float val) { R[idxnew] = R[idxold]; I[R[idxnew]] = idxnew; if (val<=A[idxold]) update_leq(idxnew, val); else update_geq(idxnew, val); } void update ( const t_index idx, const t_float val ) const { // Update the element A[i] with val and re-arrange the indices to preserve // the heap condition. if (val<=A[idx]) update_leq(idx, val); else update_geq(idx, val); } void update_leq ( const t_index idx, const t_float val ) const { // Use this when the new value is not more than the old value. A[idx] = val; update_leq_(R[idx]); } void update_geq ( const t_index idx, const t_float val ) const { // Use this when the new value is not less than the old value. A[idx] = val; update_geq_(R[idx]); } private: void update_leq_ (t_index i) const { t_index j; for ( ; (i>0) && ( H(i)>1) ); i=j) heap_swap(i,j); } void update_geq_ (t_index i) const { t_index j; for ( ; (j=2*i+1)=H(i) ) { ++j; if ( j>=size || H(j)>=H(i) ) break; } else if ( j+1 static void generic_linkage(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) { /* N: integer, number of data points D: condensed distance matrix N*(N-1)/2 Z2: output data structure */ const t_index N_1 = N-1; t_index i, j; // loop variables t_index idx1, idx2; // row and column indices auto_array_ptr n_nghbr(N_1); // array of nearest neighbors auto_array_ptr mindist(N_1); // distances to the nearest neighbors auto_array_ptr row_repr(N); // row_repr[i]: node number that the // i-th row represents doubly_linked_list active_nodes(N); binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for // the distance to the nearest neighbor of each point t_index node1, node2; // node numbers in the output t_float size1, size2; // and their cardinalities t_float min; // minimum and row index for nearest-neighbor search t_index idx; for (i=0; ii} D(i,j) for i in range(N-1) t_float const * DD = D; for (i=0; i::infinity(); for (idx=j=i+1; ji} D(i,j) Normally, we have equality. However, this minimum may become invalid due to the updates in the distance matrix. The rules are: 1) If mindist[i] is equal to D(i, n_nghbr[i]), this is the correct minimum and n_nghbr[i] is a nearest neighbor. 2) If mindist[i] is smaller than D(i, n_nghbr[i]), this might not be the correct minimum. The minimum needs to be recomputed. 3) mindist[i] is never bigger than the true minimum. Hence, we never miss the true minimum if we take the smallest mindist entry, re-compute the value if necessary (thus maybe increasing it) and looking for the now smallest mindist entry until a valid minimal entry is found. This step is done in the lines below. The update process for D below takes care that these rules are fulfilled. This makes sure that the minima in the rows D(i,i+1:)of D are re-calculated when necessary but re-calculation is avoided whenever possible. The re-calculation of the minima makes the worst-case runtime of this algorithm cubic in N. We avoid this whenever possible, and in most cases the runtime appears to be quadratic. */ idx1 = nn_distances.argmin(); if (method != METHOD_METR_SINGLE) { while ( mindist[idx1] < D_(idx1, n_nghbr[idx1]) ) { // Recompute the minimum mindist[idx1] and n_nghbr[idx1]. n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1 min = D_(idx1,j); for (j=active_nodes.succ[j]; j(members[idx1]); size2 = static_cast(members[idx2]); members[idx2] += members[idx1]; } Z2.append(node1, node2, mindist[idx1]); // Remove idx1 from the list of active indices (active_nodes). active_nodes.remove(idx1); // Index idx2 now represents the new (merged) node with label N+i. row_repr[idx2] = N+i; // Update the distance matrix switch (method) { case METHOD_METR_SINGLE: /* Single linkage. Characteristic: new distances are never longer than the old distances. */ // Update the distance matrix in the range [start, idx1). for (j=active_nodes.start; j(members[j]) ); if (n_nghbr[j] == idx1) n_nghbr[j] = idx2; } // Update the distance matrix in the range (idx1, idx2). for (; j(members[j]) ); if (D_(j, idx2) < mindist[j]) { nn_distances.update_leq(j, D_(j, idx2)); n_nghbr[j] = idx2; } } // Update the distance matrix in the range (idx2, N). if (idx2(members[j]) ); min = D_(idx2,j); for (j=active_nodes.succ[j]; j(members[j]) ); if (D_(idx2,j) < min) { min = D_(idx2,j); n_nghbr[idx2] = j; } } nn_distances.update(idx2, min); } break; case METHOD_METR_CENTROID: { /* Centroid linkage. Shorter and longer distances can occur, not bigger than max(d1,d2) but maybe smaller than min(d1,d2). */ // Update the distance matrix in the range [start, idx1). t_float s = size1/(size1+size2); t_float t = size2/(size1+size2); t_float stc = s*t*mindist[idx1]; for (j=active_nodes.start; j static void MST_linkage_core_vector(const t_index N, t_dissimilarity & dist, cluster_result & Z2) { /* N: integer, number of data points dist: function pointer to the metric Z2: output data structure The basis of this algorithm is an algorithm by Rohlf: F. James Rohlf, Hierarchical clustering using the minimum spanning tree, The Computer Journal, vol. 16, 1973, p. 93–95. */ t_index i; t_index idx2; doubly_linked_list active_nodes(N); auto_array_ptr d(N); t_index prev_node; t_float min; // first iteration idx2 = 1; min = std::numeric_limits::infinity(); for (i=1; i tmp) d[i] = tmp; else if (fc_isnan(tmp)) throw (nan_error()); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (d[i] < min) { min = d[i]; idx2 = i; } } Z2.append(prev_node, idx2, min); } } template static void generic_linkage_vector(const t_index N, t_dissimilarity & dist, cluster_result & Z2) { /* N: integer, number of data points dist: function pointer to the metric Z2: output data structure This algorithm is valid for the distance update methods "Ward", "centroid" and "median" only! */ const t_index N_1 = N-1; t_index i, j; // loop variables t_index idx1, idx2; // row and column indices auto_array_ptr n_nghbr(N_1); // array of nearest neighbors auto_array_ptr mindist(N_1); // distances to the nearest neighbors auto_array_ptr row_repr(N); // row_repr[i]: node number that the // i-th row represents doubly_linked_list active_nodes(N); binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for // the distance to the nearest neighbor of each point t_index node1, node2; // node numbers in the output t_float min; // minimum and row index for nearest-neighbor search for (i=0; ii} D(i,j) for i in range(N-1) for (i=0; i::infinity(); t_index idx; for (idx=j=i+1; j(i,j); } if (tmp(idx1,j); for (j=active_nodes.succ[j]; j(idx1,j); if (tmp(j, idx2); if (tmp < mindist[j]) { nn_distances.update_leq(j, tmp); n_nghbr[j] = idx2; } else if (n_nghbr[j] == idx2) n_nghbr[j] = idx1; // invalidate } // Find the nearest neighbor for idx2. if (idx2(idx2,j); for (j=active_nodes.succ[j]; j(idx2, j); if (tmp < min) { min = tmp; n_nghbr[idx2] = j; } } nn_distances.update(idx2, min); } } } } template static void generic_linkage_vector_alternative(const t_index N, t_dissimilarity & dist, cluster_result & Z2) { /* N: integer, number of data points dist: function pointer to the metric Z2: output data structure This algorithm is valid for the distance update methods "Ward", "centroid" and "median" only! */ const t_index N_1 = N-1; t_index i, j=0; // loop variables t_index idx1, idx2; // row and column indices auto_array_ptr n_nghbr(2*N-2); // array of nearest neighbors auto_array_ptr mindist(2*N-2); // distances to the nearest neighbors doubly_linked_list active_nodes(N+N_1); binary_min_heap nn_distances(&*mindist, N_1, 2*N-2, 1); // minimum heap // structure for the distance to the nearest neighbor of each point t_float min; // minimum for nearest-neighbor searches // Initialize the minimal distances: // Find the nearest neighbor of each point. // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1) for (i=1; i::infinity(); t_index idx; for (idx=j=0; j(i,j); } if (tmp