/*
  fastcluster: Fast hierarchical clustering routines for R and Python

  Copyright © 2011 Daniel Müllner
  <http://danifold.net>

  This library implements various fast algorithms for hierarchical,
  agglomerative clustering methods:

  (1) Algorithms for the "stored matrix approach": the input is the array of
      pairwise dissimilarities.

      MST_linkage_core: single linkage clustering with the "minimum spanning
      tree" algorithm (Rohlf)
      NN_chain_core: nearest-neighbor-chain algorithm, suitable for single,
      complete, average, weighted and Ward linkage (Murtagh)
      generic_linkage: generic algorithm, suitable for all distance update
      formulas (Müllner)

  (2) Algorithms for the "stored data approach": the input are points in a
      vector space.

      MST_linkage_core_vector: single linkage clustering for vector data
      generic_linkage_vector: generic algorithm for vector data, suitable for
      the Ward, centroid and median methods.
      generic_linkage_vector_alternative: alternative scheme for updating the
      nearest neighbors. This method seems faster than "generic_linkage_vector"
      for the centroid and median methods but slower for the Ward method.

  All these implementations treat infinity values correctly. They throw an
  exception if a NaN distance value occurs.
*/
// Older versions of Microsoft Visual Studio do not have the fenv header.
# ifdef _MSC_VER
# if (_MSC_VER == 1500 || _MSC_VER == 1600)
# define NO_INCLUDE_FENV
# endif
# endif
// NaN detection via fenv might not work on systems with software
// floating-point emulation (bug report for Debian armel).
# ifdef __SOFTFP__
# define NO_INCLUDE_FENV
# endif
# ifdef NO_INCLUDE_FENV
# pragma message("Do not use fenv header.")
# else
//#pragma message("Use fenv header. If there is a warning about unknown #pragma STDC FENV_ACCESS, this can be ignored.")
//#pragma STDC FENV_ACCESS on
# include <fenv.h>
# endif
# include <cmath> // for std::pow, std::sqrt
# include <cstddef> // for std::ptrdiff_t
# include <limits> // for std::numeric_limits<...>::infinity()
# include <algorithm> // for std::fill_n
# include <stdexcept> // for std::runtime_error
# include <string> // for std::string
# include <cfloat> // also for DBL_MAX, DBL_MIN
# ifndef DBL_MANT_DIG
# error The constant DBL_MANT_DIG could not be defined.
# endif
# define T_FLOAT_MANT_DIG DBL_MANT_DIG
# ifndef LONG_MAX
# include <climits>
# endif
# ifndef LONG_MAX
# error The constant LONG_MAX could not be defined.
# endif
# ifndef INT_MAX
# error The constant INT_MAX could not be defined.
# endif
# ifndef INT32_MAX
# ifdef _MSC_VER
# if _MSC_VER >= 1600
# define __STDC_LIMIT_MACROS
# include <stdint.h>
# else
typedef __int32 int_fast32_t ;
typedef __int64 int64_t ;
# endif
# else
# define __STDC_LIMIT_MACROS
# include <stdint.h>
# endif
# endif
# define FILL_N std::fill_n
# ifdef _MSC_VER
# if _MSC_VER < 1600
# undef FILL_N
# define FILL_N stdext::unchecked_fill_n
# endif
# endif
// Suppress warnings about (potentially) uninitialized variables.
# ifdef _MSC_VER
# pragma warning (disable:4700)
# endif
# ifndef HAVE_DIAGNOSTIC
# if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6))
# define HAVE_DIAGNOSTIC 1
# endif
# endif
# ifndef HAVE_VISIBILITY
# if __GNUC__ >= 4
# define HAVE_VISIBILITY 1
# endif
# endif
/* Since the public interface is given by the Python respectively R interface,
 * we do not want other symbols than the interface initialization routines to
 * be visible in the shared object file. The "visibility" switch is a GCC
 * concept. Hiding symbols keeps the relocation table small and decreases
 * startup time. See http://gcc.gnu.org/wiki/Visibility
 */
# if HAVE_VISIBILITY
# pragma GCC visibility push(hidden)
# endif
typedef int_fast32_t t_index ;
# ifndef INT32_MAX
# define MAX_INDEX 0x7fffffffL
# else
# define MAX_INDEX INT32_MAX
# endif
# if (LONG_MAX < MAX_INDEX)
# error The integer format "t_index" must not have a greater range than "long int".
# endif
# if (INT_MAX > MAX_INDEX)
# error The integer format "int" must not have a greater range than "t_index".
# endif
typedef double t_float ;
/* Method codes.

   These codes must agree with the METHODS array in fastcluster.R and the
   dictionary mthidx in fastcluster.py.
*/
enum method_codes {
// non-Euclidean methods
METHOD_METR_SINGLE = 0 ,
METHOD_METR_COMPLETE = 1 ,
METHOD_METR_AVERAGE = 2 ,
METHOD_METR_WEIGHTED = 3 ,
METHOD_METR_WARD = 4 ,
METHOD_METR_WARD_D = METHOD_METR_WARD ,
METHOD_METR_CENTROID = 5 ,
METHOD_METR_MEDIAN = 6 ,
METHOD_METR_WARD_D2 = 7 ,
MIN_METHOD_CODE = 0 ,
MAX_METHOD_CODE = 7
} ;
enum method_codes_vector {
// Euclidean methods
METHOD_VECTOR_SINGLE = 0 ,
METHOD_VECTOR_WARD = 1 ,
METHOD_VECTOR_CENTROID = 2 ,
METHOD_VECTOR_MEDIAN = 3 ,
MIN_METHOD_VECTOR_CODE = 0 ,
MAX_METHOD_VECTOR_CODE = 3
} ;
// Self-destructing array pointer: minimal RAII owner of a heap array.
// Noncopyable; ownership is released automatically in the destructor.
template <typename type>
class auto_array_ptr {
private:
  type * data; // owned array, or NULL

  // Forbid copying (declared, never defined).
  auto_array_ptr(auto_array_ptr const &);
  auto_array_ptr & operator=(auto_array_ptr const &);

public:
  // Construct without an array.
  auto_array_ptr()
    : data(NULL)
  {}

  // Allocate an uninitialized array of the given size.
  template <typename index>
  auto_array_ptr(index const size)
    : data(new type[size])
  {}

  // Allocate an array of the given size, filled with val.
  template <typename index, typename value>
  auto_array_ptr(index const size, value const val)
    : data(new type[size])
  {
    FILL_N(data, size, val);
  }

  ~auto_array_ptr() {
    delete [] data;
  }

  // Release the array early and reset to the empty state.
  void free() {
    delete [] data;
    data = NULL;
  }

  // Late allocation (use only on an empty object; does not free a
  // previously held array).
  template <typename index>
  void init(index const size) {
    data = new type[size];
  }

  // Late allocation with fill value.
  template <typename index, typename value>
  void init(index const size, value const val) {
    init(size);
    FILL_N(data, size, val);
  }

  // Implicit conversion to a raw pointer for indexing/arithmetic.
  inline operator type *() const { return data; }
};
// One merge step of the linkage output: the two joined nodes and the
// dissimilarity at which they are joined.
struct node {
  t_index node1, node2;
  t_float dist;
};

// Order merge steps by dissimilarity (used to sort the stepwise output).
inline bool operator< (const node a, const node b) {
  return (a.dist < b.dist);
}
class cluster_result {
private :
auto_array_ptr < node > Z ;
t_index pos ;
public :
cluster_result ( const t_index size )
: Z ( size )
, pos ( 0 )
{ }
void append ( const t_index node1 , const t_index node2 , const t_float dist ) {
Z [ pos ] . node1 = node1 ;
Z [ pos ] . node2 = node2 ;
Z [ pos ] . dist = dist ;
+ + pos ;
}
node * operator [ ] ( const t_index idx ) const { return Z + idx ; }
/* Define several methods to postprocess the distances. All these functions
are monotone , so they do not change the sorted order of distances . */
void sqrt ( ) const {
for ( node * ZZ = Z ; ZZ ! = Z + pos ; + + ZZ ) {
ZZ - > dist = std : : sqrt ( ZZ - > dist ) ;
}
}
void sqrt ( const t_float ) const { // ignore the argument
sqrt ( ) ;
}
void sqrtdouble ( const t_float ) const { // ignore the argument
for ( node * ZZ = Z ; ZZ ! = Z + pos ; + + ZZ ) {
ZZ - > dist = std : : sqrt ( 2 * ZZ - > dist ) ;
}
}
# ifdef R_pow
# define my_pow R_pow
# else
# define my_pow std::pow
# endif
void power ( const t_float p ) const {
t_float const q = 1 / p ;
for ( node * ZZ = Z ; ZZ ! = Z + pos ; + + ZZ ) {
ZZ - > dist = my_pow ( ZZ - > dist , q ) ;
}
}
void plusone ( const t_float ) const { // ignore the argument
for ( node * ZZ = Z ; ZZ ! = Z + pos ; + + ZZ ) {
ZZ - > dist + = 1 ;
}
}
void divide ( const t_float denom ) const {
for ( node * ZZ = Z ; ZZ ! = Z + pos ; + + ZZ ) {
ZZ - > dist / = denom ;
}
}
} ;
class doubly_linked_list {
  /*
    Class for a doubly linked list. Initially, the list is the integer range
    [0, size]. We provide a forward iterator and a method to delete an index
    from the list.

    Typical use: for (i=L.start; i<size; i=L.succ[i])
    or
    for (i=somevalue; i<size; i=L.succ[i])
  */
public:
  t_index start;                 // first active index
  auto_array_ptr<t_index> succ;  // successor of each index
private:
  auto_array_ptr<t_index> pred;  // predecessor of each index
  // Not necessarily private, we just do not need it in this instance.

public:
  doubly_linked_list(const t_index size)
    // Initialize to the given size.
    : start(0)
    , succ(size+1)
    , pred(size+1)
  {
    for (t_index i=0; i<size; ++i) {
      pred[i+1] = i;
      succ[i] = i+1;
    }
    // pred[0] is never accessed!
    // succ[size] is never accessed!
  }

  ~doubly_linked_list() {}

  void remove(const t_index idx) {
    // Remove an index from the list by re-linking its neighbors.
    if (idx==start) {
      start = succ[idx];
    }
    else {
      succ[pred[idx]] = succ[idx];
      pred[succ[idx]] = pred[idx];
    }
    succ[idx] = 0; // Mark as inactive
  }

  bool is_inactive(t_index idx) const {
    // An index is inactive after it has been removed (succ entry cleared).
    return (succ[idx]==0);
  }
};
// Indexing functions
// D is the upper triangular part of a symmetric (NxN)-matrix
// We require r_ < c_ !
# define D_(r_,c_) ( D[(static_cast<std::ptrdiff_t>(2*N-3-(r_))*(r_)>>1)+(c_)-1] )
// Z is an ((N-1)x4)-array
# define Z_(_r, _c) (Z[(_r)*4 + (_c)])
/*
  Lookup function for a union-find data structure.

  The function finds the root of idx by going iteratively through all
  parent elements until a root is found. An element i is a root if
  nodes[i] is zero. To make subsequent searches faster, the entry for
  idx and all its parents is updated with the root element.
*/
class union_find {
private:
  auto_array_ptr<t_index> parent; // parent[i]==0 means i is a root
  t_index nextparent;             // next label to assign on Union

public:
  union_find(const t_index size)
    // 2*size-1 entries: size singletons plus size-1 merged nodes.
    : parent(size>0 ? 2*size-1 : 0, 0)
    , nextparent(size)
  {}

  t_index Find(t_index idx) const {
    // Return the root of idx, with path compression.
    if (parent[idx] != 0) { // a → b
      t_index p = idx;
      idx = parent[idx];
      if (parent[idx] != 0) { // a → b → c
        // First pass: follow the chain up to the root.
        do {
          idx = parent[idx];
        } while (parent[idx] != 0);
        // Second pass: re-parent every element on the chain to the root.
        do {
          t_index tmp = parent[p];
          parent[p] = idx;
          p = tmp;
        } while (parent[p] != idx);
      }
    }
    return idx;
  }

  void Union(const t_index node1, const t_index node2) {
    // Merge both sets under a freshly allocated node label.
    parent[node1] = parent[node2] = nextparent++;
  }
};
class nan_error { } ;
# ifdef FE_INVALID
class fenv_error { } ;
# endif
static void MST_linkage_core(const t_index N, const t_float * const D,
                             cluster_result & Z2) {
/*
    N: integer, number of data points
    D: condensed distance matrix N*(N-1)/2
    Z2: output data structure

    Single linkage clustering via Prim's minimum-spanning-tree construction.

    The basis of this algorithm is an algorithm by Rohlf:

    F. James Rohlf, Hierarchical clustering using the minimum spanning tree,
    The Computer Journal, vol. 16, 1973, p. 93–95.
*/
  t_index i;
  t_index idx2;
  doubly_linked_list active_nodes(N);
  auto_array_ptr<t_float> d(N); // d[i]: current distance of point i to the tree

  t_index prev_node;
  t_float min;

  // first iteration: seed the tree with point 0; its distances to all other
  // points are the first N-1 entries of the condensed matrix.
  idx2 = 1;
  min = std::numeric_limits<t_float>::infinity();
  for (i=1; i<N; ++i) {
    d[i] = D[i-1];
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
    if (d[i] < min) {
      min = d[i];
      idx2 = i;
    }
    else if (fc_isnan(d[i]))
      throw (nan_error());
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
  }
  Z2.append(0, idx2, min);

  // Each further iteration adds the closest remaining point to the tree.
  for (t_index j=1; j<N-1; ++j) {
    prev_node = idx2;
    active_nodes.remove(prev_node);

    idx2 = active_nodes.succ[0];
    min = d[idx2];

    // Two loops because the D_ macro needs row < column: indices below
    // prev_node use D_(i, prev_node), indices above use D_(prev_node, i).
    for (i=idx2; i<prev_node; i=active_nodes.succ[i]) {
      t_float tmp = D_(i, prev_node);
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
      if (tmp < d[i])
        d[i] = tmp;
      else if (fc_isnan(tmp))
        throw (nan_error());
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
      if (d[i] < min) {
        min = d[i];
        idx2 = i;
      }
    }
    for (; i<N; i=active_nodes.succ[i]) {
      t_float tmp = D_(prev_node, i);
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
      if (d[i] > tmp)
        d[i] = tmp;
      else if (fc_isnan(tmp))
        throw (nan_error());
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
      if (d[i] < min) {
        min = d[i];
        idx2 = i;
      }
    }
    Z2.append(prev_node, idx2, min);
  }
}
/* Functions for the update of the dissimilarity array */

// Single linkage update: keep the smaller of the two distances.
inline static void f_single( t_float * const b, const t_float a ) {
  if (*b > a) *b = a;
}
// Complete linkage update: keep the larger of the two distances.
inline static void f_complete( t_float * const b, const t_float a ) {
  if (*b < a) *b = a;
}
// Average linkage update: weighted mean with the relative cluster sizes s, t.
inline static void f_average( t_float * const b, const t_float a, const t_float s, const t_float t) {
  *b = s*a + t*(*b);
#ifndef FE_INVALID
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
  // Without fenv support, detect a NaN result explicitly.
  if (fc_isnan(*b)) {
    throw(nan_error());
  }
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
#endif
}
// Weighted linkage (WPGMA) update: plain average of the two distances.
inline static void f_weighted( t_float * const b, const t_float a) {
  *b = (a+*b)*.5;
#ifndef FE_INVALID
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
  // Without fenv support, detect a NaN result explicitly.
  if (fc_isnan(*b)) {
    throw(nan_error());
  }
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
#endif
}
// Ward linkage update (Lance–Williams formula with cluster sizes s, t, v).
inline static void f_ward( t_float * const b, const t_float a, const t_float c, const t_float s, const t_float t, const t_float v) {
  *b = ( (v+s)*a - v*c + (v+t)*(*b) ) / (s+t+v);
  //*b = a+(*b)-(t*a+s*(*b)+v*c)/(s+t+v);
#ifndef FE_INVALID
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
  // Without fenv support, detect a NaN result explicitly.
  if (fc_isnan(*b)) {
    throw(nan_error());
  }
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
#endif
}
// Centroid linkage update; stc carries the precomputed s*t*dist(I,J) term.
// Fix: the "#pragma GCC diagnostic push / ignored" lines before the NaN
// check were missing, leaving the later "pop" unbalanced — restored to
// match the sibling update functions (f_average, f_weighted, f_ward).
inline static void f_centroid( t_float * const b, const t_float a, const t_float stc, const t_float s, const t_float t) {
  *b = s*a - stc + t*(*b);
#ifndef FE_INVALID
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
  // Without fenv support, detect a NaN result explicitly.
  if (fc_isnan(*b)) {
    throw(nan_error());
  }
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
#endif
}
// Median linkage update; c_4 is the precomputed dist(I,J)/4 correction term.
inline static void f_median( t_float * const b, const t_float a, const t_float c_4) {
  *b = (a+(*b))*.5 - c_4;
#ifndef FE_INVALID
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
  // Without fenv support, detect a NaN result explicitly.
  if (fc_isnan(*b)) {
    throw(nan_error());
  }
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
#endif
}
template <method_codes method, typename t_members>
static void NN_chain_core(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) {
/*
    N: integer
    D: condensed distance matrix N*(N-1)/2
    Z2: output data structure

    This is the NN-chain algorithm, described on page 86 in the following book:

    Fionn Murtagh, Multidimensional Clustering Algorithms,
    Vienna, Würzburg: Physica-Verlag, 1985.
*/
  t_index i;

  auto_array_ptr<t_index> NN_chain(N); // stack of chained nearest neighbors
  t_index NN_chain_tip = 0;

  t_index idx1, idx2;

  t_float size1, size2;
  doubly_linked_list active_nodes(N);

  t_float min;

  // Reject any NaN in the input distances up front.
  for (t_float const * DD=D; DD!=D+(static_cast<std::ptrdiff_t>(N)*(N-1)>>1);
       ++DD) {
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfloat-equal"
#endif
    if (fc_isnan(*DD)) {
      throw(nan_error());
    }
#if HAVE_DIAGNOSTIC
#pragma GCC diagnostic pop
#endif
  }

#ifdef FE_INVALID
  if (feclearexcept(FE_INVALID)) throw fenv_error();
#endif

  // Main loop: N-1 merging steps.
  for (t_index j=0; j<N-1; ++j) {
    if (NN_chain_tip <= 3) {
      // (Re)start the chain from the first active node and its nearest
      // neighbor.
      NN_chain[0] = idx1 = active_nodes.start;
      NN_chain_tip = 1;

      idx2 = active_nodes.succ[idx1];
      min = D_(idx1,idx2);
      for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i]) {
        if (D_(idx1,i) < min) {
          min = D_(idx1,i);
          idx2 = i;
        }
      }
    }  // a: idx1   b: idx2
    else {
      // Resume the chain three entries below the last reciprocal pair.
      NN_chain_tip -= 3;
      idx1 = NN_chain[NN_chain_tip-1];
      idx2 = NN_chain[NN_chain_tip];
      min = idx1<idx2 ? D_(idx1,idx2) : D_(idx2,idx1);
    }  // a: idx1   b: idx2

    // Extend the chain until a pair of reciprocal nearest neighbors is found.
    do {
      NN_chain[NN_chain_tip] = idx2;

      for (i=active_nodes.start; i<idx2; i=active_nodes.succ[i]) {
        if (D_(i,idx2) < min) {
          min = D_(i,idx2);
          idx1 = i;
        }
      }
      for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i]) {
        if (D_(idx2,i) < min) {
          min = D_(idx2,i);
          idx1 = i;
        }
      }

      idx2 = idx1;
      idx1 = NN_chain[NN_chain_tip++];

    } while (idx2 != NN_chain[NN_chain_tip-2]);

    Z2.append(idx1, idx2, min);

    // Normalize so that idx1 < idx2 for the D_ macro.
    if (idx1>idx2) {
      t_index tmp = idx1;
      idx1 = idx2;
      idx2 = tmp;
    }

    if (method==METHOD_METR_AVERAGE ||
        method==METHOD_METR_WARD) {
      size1 = static_cast<t_float>(members[idx1]);
      size2 = static_cast<t_float>(members[idx2]);
      members[idx2] += members[idx1];
    }

    // Remove the smaller index from the valid indices (active_nodes).
    active_nodes.remove(idx1);

    switch (method) {
    case METHOD_METR_SINGLE:
      /*
        Single linkage.

        Characteristic: new distances are never longer than the old distances.
      */
      // Update the distance matrix in the range [start, idx1).
      for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
        f_single(&D_(i, idx2), D_(i, idx1));
      // Update the distance matrix in the range (idx1, idx2).
      for (; i<idx2; i=active_nodes.succ[i])
        f_single(&D_(i, idx2), D_(idx1, i));
      // Update the distance matrix in the range (idx2, N).
      for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
        f_single(&D_(idx2, i), D_(idx1, i));
      break;

    case METHOD_METR_COMPLETE:
      /*
        Complete linkage.

        Characteristic: new distances are never shorter than the old distances.
      */
      // Update the distance matrix in the range [start, idx1).
      for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
        f_complete(&D_(i, idx2), D_(i, idx1));
      // Update the distance matrix in the range (idx1, idx2).
      for (; i<idx2; i=active_nodes.succ[i])
        f_complete(&D_(i, idx2), D_(idx1, i));
      // Update the distance matrix in the range (idx2, N).
      for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
        f_complete(&D_(idx2, i), D_(idx1, i));
      break;

    case METHOD_METR_AVERAGE: {
      /*
        Average linkage.

        Shorter and longer distances can occur.
      */
      // Update the distance matrix in the range [start, idx1).
      t_float s = size1/(size1+size2);
      t_float t = size2/(size1+size2);
      for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
        f_average(&D_(i, idx2), D_(i, idx1), s, t);
      // Update the distance matrix in the range (idx1, idx2).
      for (; i<idx2; i=active_nodes.succ[i])
        f_average(&D_(i, idx2), D_(idx1, i), s, t);
      // Update the distance matrix in the range (idx2, N).
      for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
        f_average(&D_(idx2, i), D_(idx1, i), s, t);
      break;
    }

    case METHOD_METR_WEIGHTED:
      /*
        Weighted linkage.

        Shorter and longer distances can occur.
      */
      // Update the distance matrix in the range [start, idx1).
      for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
        f_weighted(&D_(i, idx2), D_(i, idx1));
      // Update the distance matrix in the range (idx1, idx2).
      for (; i<idx2; i=active_nodes.succ[i])
        f_weighted(&D_(i, idx2), D_(idx1, i));
      // Update the distance matrix in the range (idx2, N).
      for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
        f_weighted(&D_(idx2, i), D_(idx1, i));
      break;

    case METHOD_METR_WARD:
      /*
        Ward linkage.

        Shorter and longer distances can occur, not smaller than min(d1,d2)
        but maybe bigger than max(d1,d2).
      */
      // Update the distance matrix in the range [start, idx1).
      //t_float v = static_cast<t_float>(members[i]);
      for (i=active_nodes.start; i<idx1; i=active_nodes.succ[i])
        f_ward(&D_(i, idx2), D_(i, idx1), min,
               size1, size2, static_cast<t_float>(members[i]) );
      // Update the distance matrix in the range (idx1, idx2).
      for (; i<idx2; i=active_nodes.succ[i])
        f_ward(&D_(i, idx2), D_(idx1, i), min,
               size1, size2, static_cast<t_float>(members[i]) );
      // Update the distance matrix in the range (idx2, N).
      for (i=active_nodes.succ[idx2]; i<N; i=active_nodes.succ[i])
        f_ward(&D_(idx2, i), D_(idx1, i), min,
               size1, size2, static_cast<t_float>(members[i]) );
      break;

    default:
      throw std::runtime_error(std::string("Invalid method."));
    }
  }
#ifdef FE_INVALID
  if (fetestexcept(FE_INVALID)) throw fenv_error();
#endif
}
class binary_min_heap {
/*
  Class for a binary min-heap. The data resides in an array A. The elements of
  A are not changed but two lists I and R of indices are generated which point
  to elements of A and backwards.

  The heap tree structure is

     H[2*i+1]     H[2*i+2]
         \            /
          \          /
           ≤        ≤
            \      /
             \    /
              H[i]

  where the children must be less or equal than their parent. Thus, H[0]
  contains the minimum. The lists I and R are made such that H[i] = A[I[i]]
  and R[I[i]] = i.

  This implementation is not designed to handle NaN values.
*/
private:
  t_float * const A;          // external value array (not owned)
  t_index size;               // current number of heap elements
  auto_array_ptr<t_index> I;  // heap position -> index into A
  auto_array_ptr<t_index> R;  // index into A -> heap position

  // no default constructor
  binary_min_heap();
  // noncopyable
  binary_min_heap(binary_min_heap const &);
  binary_min_heap & operator=(binary_min_heap const &);

public:
  binary_min_heap(t_float * const A_, const t_index size_)
    : A(A_), size(size_), I(size), R(size)
  { // Allocate memory and initialize the lists I and R to the identity. This
    // does not make it a heap. Call heapify afterwards!
    for (t_index i=0; i<size; ++i)
      R[i] = I[i] = i;
  }

  binary_min_heap(t_float * const A_, const t_index size1, const t_index size2,
                  const t_index start)
    : A(A_), size(size1), I(size1), R(size2)
  { // Allocate memory and initialize the lists I and R to the identity. This
    // does not make it a heap. Call heapify afterwards!
    for (t_index i=0; i<size; ++i) {
      R[i+start] = i;
      I[i] = i + start;
    }
  }

  ~binary_min_heap() {}

  void heapify() {
    // Arrange the indices I and R so that H[i] := A[I[i]] satisfies the heap
    // condition H[i] < H[2*i+1] and H[i] < H[2*i+2] for each i.
    //
    // Complexity: Θ(size)
    // Reference: Cormen, Leiserson, Rivest, Stein, Introduction to Algorithms,
    // 3rd ed., 2009, Section 6.3 “Building a heap”
    t_index idx;
    for (idx=(size>>1); idx>0; ) {
      --idx;
      update_geq_(idx);
    }
  }

  inline t_index argmin() const {
    // Return the minimal element.
    return I[0];
  }

  void heap_pop() {
    // Remove the minimal element from the heap.
    --size;
    I[0] = I[size];
    R[I[0]] = 0;
    update_geq_(0);
  }

  void remove(t_index idx) {
    // Remove an element from the heap: move the last element into its slot
    // and restore the heap condition in whichever direction is needed.
    --size;
    R[I[size]] = R[idx];
    I[R[idx]] = I[size];
    if ( H(size) <= A[idx] ) {
      update_leq_(R[idx]);
    }
    else {
      update_geq_(R[idx]);
    }
  }

  void replace ( const t_index idxold, const t_index idxnew,
                 const t_float val) {
    // Replace element idxold by idxnew with the value val, keeping the
    // heap position.
    R[idxnew] = R[idxold];
    I[R[idxnew]] = idxnew;
    if (val<=A[idxold])
      update_leq(idxnew, val);
    else
      update_geq(idxnew, val);
  }

  void update ( const t_index idx, const t_float val ) const {
    // Update the element A[i] with val and re-arrange the indices to preserve
    // the heap condition.
    if (val<=A[idx])
      update_leq(idx, val);
    else
      update_geq(idx, val);
  }

  void update_leq ( const t_index idx, const t_float val ) const {
    // Use this when the new value is not more than the old value.
    A[idx] = val;
    update_leq_(R[idx]);
  }

  void update_geq ( const t_index idx, const t_float val ) const {
    // Use this when the new value is not less than the old value.
    A[idx] = val;
    update_geq_(R[idx]);
  }

private:
  void update_leq_ (t_index i) const {
    // Sift the element at heap position i upwards.
    t_index j;
    for ( ; (i>0) && ( H(i) < H(j=(i-1)>>1) ); i=j)
      heap_swap(i,j);
  }

  void update_geq_ (t_index i) const {
    // Sift the element at heap position i downwards.
    t_index j;
    for ( ; (j=2*i+1)<size; i=j) {
      if ( H(j)>=H(i) ) {
        ++j;
        if ( j>=size || H(j)>=H(i) ) break;
      }
      else if ( j+1<size && H(j+1)<H(j) ) ++j;
      heap_swap(i, j);
    }
  }

  void heap_swap(const t_index i, const t_index j) const {
    // Swap two heap positions and keep the back-references consistent.
    t_index tmp = I[i];
    I[i] = I[j];
    I[j] = tmp;
    R[I[i]] = i;
    R[I[j]] = j;
  }

  inline t_float H(const t_index i) const {
    // Value stored at heap position i.
    return A[I[i]];
  }
};
template < method_codes method , typename t_members >
static void generic_linkage ( const t_index N , t_float * const D , t_members * const members , cluster_result & Z2 ) {
/*
N : integer , number of data points
D : condensed distance matrix N * ( N - 1 ) / 2
Z2 : output data structure
*/
const t_index N_1 = N - 1 ;
t_index i , j ; // loop variables
t_index idx1 , idx2 ; // row and column indices
auto_array_ptr < t_index > n_nghbr ( N_1 ) ; // array of nearest neighbors
auto_array_ptr < t_float > mindist ( N_1 ) ; // distances to the nearest neighbors
auto_array_ptr < t_index > row_repr ( N ) ; // row_repr[i]: node number that the
// i-th row represents
doubly_linked_list active_nodes ( N ) ;
binary_min_heap nn_distances ( & * mindist , N_1 ) ; // minimum heap structure for
// the distance to the nearest neighbor of each point
t_index node1 , node2 ; // node numbers in the output
t_float size1 , size2 ; // and their cardinalities
t_float min ; // minimum and row index for nearest-neighbor search
t_index idx ;
for ( i = 0 ; i < N ; + + i )
// Build a list of row ↔ node label assignments.
// Initially i ↦ i
row_repr [ i ] = i ;
// Initialize the minimal distances:
// Find the nearest neighbor of each point.
// n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
t_float const * DD = D ;
for ( i = 0 ; i < N_1 ; + + i ) {
min = std : : numeric_limits < t_float > : : infinity ( ) ;
for ( idx = j = i + 1 ; j < N ; + + j , + + DD ) {
# if HAVE_DIAGNOSTIC
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wfloat-equal"
# endif
if ( * DD < min ) {
min = * DD ;
idx = j ;
}
else if ( fc_isnan ( * DD ) )
throw ( nan_error ( ) ) ;
}
# if HAVE_DIAGNOSTIC
# pragma GCC diagnostic pop
# endif
mindist [ i ] = min ;
n_nghbr [ i ] = idx ;
}
// Put the minimal distances into a heap structure to make the repeated
// global minimum searches fast.
nn_distances . heapify ( ) ;
# ifdef FE_INVALID
if ( feclearexcept ( FE_INVALID ) ) throw fenv_error ( ) ;
# endif
// Main loop: We have N-1 merging steps.
for ( i = 0 ; i < N_1 ; + + i ) {
/*
Here is a special feature that allows fast bookkeeping and updates of the
minimal distances .
mindist [ i ] stores a lower bound on the minimum distance of the point i to
all points of higher index :
mindist [ i ] ≥ min_ { j > i } D ( i , j )
Normally , we have equality . However , this minimum may become invalid due
to the updates in the distance matrix . The rules are :
1 ) If mindist [ i ] is equal to D ( i , n_nghbr [ i ] ) , this is the correct
minimum and n_nghbr [ i ] is a nearest neighbor .
2 ) If mindist [ i ] is smaller than D ( i , n_nghbr [ i ] ) , this might not be the
correct minimum . The minimum needs to be recomputed .
3 ) mindist [ i ] is never bigger than the true minimum . Hence , we never
miss the true minimum if we take the smallest mindist entry ,
re - compute the value if necessary ( thus maybe increasing it ) and
looking for the now smallest mindist entry until a valid minimal
entry is found . This step is done in the lines below .
The update process for D below takes care that these rules are
fulfilled . This makes sure that the minima in the rows D ( i , i + 1 : ) of D are
re - calculated when necessary but re - calculation is avoided whenever
possible .
The re - calculation of the minima makes the worst - case runtime of this
algorithm cubic in N . We avoid this whenever possible , and in most cases
the runtime appears to be quadratic .
*/
idx1 = nn_distances . argmin ( ) ;
if ( method ! = METHOD_METR_SINGLE ) {
while ( mindist [ idx1 ] < D_ ( idx1 , n_nghbr [ idx1 ] ) ) {
// Recompute the minimum mindist[idx1] and n_nghbr[idx1].
n_nghbr [ idx1 ] = j = active_nodes . succ [ idx1 ] ; // exists, maximally N-1
min = D_ ( idx1 , j ) ;
for ( j = active_nodes . succ [ j ] ; j < N ; j = active_nodes . succ [ j ] ) {
if ( D_ ( idx1 , j ) < min ) {
min = D_ ( idx1 , j ) ;
n_nghbr [ idx1 ] = j ;
}
}
/* Update the heap with the new true minimum and search for the
( possibly different ) minimal entry . */
nn_distances . update_geq ( idx1 , min ) ;
idx1 = nn_distances . argmin ( ) ;
}
}
nn_distances . heap_pop ( ) ; // Remove the current minimum from the heap.
idx2 = n_nghbr [ idx1 ] ;
// Write the newly found minimal pair of nodes to the output array.
node1 = row_repr [ idx1 ] ;
node2 = row_repr [ idx2 ] ;
if ( method = = METHOD_METR_AVERAGE | |
method = = METHOD_METR_WARD | |
method = = METHOD_METR_CENTROID ) {
size1 = static_cast < t_float > ( members [ idx1 ] ) ;
size2 = static_cast < t_float > ( members [ idx2 ] ) ;
members [ idx2 ] + = members [ idx1 ] ;
}
Z2 . append ( node1 , node2 , mindist [ idx1 ] ) ;
// Remove idx1 from the list of active indices (active_nodes).
active_nodes . remove ( idx1 ) ;
// Index idx2 now represents the new (merged) node with label N+i.
row_repr [ idx2 ] = N + i ;
// Update the distance matrix
switch ( method ) {
case METHOD_METR_SINGLE :
/*
Single linkage .
Characteristic : new distances are never longer than the old distances .
*/
// Update the distance matrix in the range [start, idx1).
for ( j = active_nodes . start ; j < idx1 ; j = active_nodes . succ [ j ] ) {
f_single ( & D_ ( j , idx2 ) , D_ ( j , idx1 ) ) ;
if ( n_nghbr [ j ] = = idx1 )
n_nghbr [ j ] = idx2 ;
}
// Update the distance matrix in the range (idx1, idx2).
for ( ; j < idx2 ; j = active_nodes . succ [ j ] ) {
f_single ( & D_ ( j , idx2 ) , D_ ( idx1 , j ) ) ;
// If the new value is below the old minimum in a row, update
// the mindist and n_nghbr arrays.
if ( D_ ( j , idx2 ) < mindist [ j ] ) {
nn_distances . update_leq ( j , D_ ( j , idx2 ) ) ;
n_nghbr [ j ] = idx2 ;
}
}
// Update the distance matrix in the range (idx2, N).
// Recompute the minimum mindist[idx2] and n_nghbr[idx2].
if ( idx2 < N_1 ) {
min = mindist [ idx2 ] ;
for ( j = active_nodes . succ [ idx2 ] ; j < N ; j = active_nodes . succ [ j ] ) {
f_single ( & D_ ( idx2 , j ) , D_ ( idx1 , j ) ) ;
if ( D_ ( idx2 , j ) < min ) {
n_nghbr [ idx2 ] = j ;
min = D_ ( idx2 , j ) ;
}
}
nn_distances . update_leq ( idx2 , min ) ;
}
break ;
case METHOD_METR_COMPLETE :
/*
Complete linkage .
Characteristic : new distances are never shorter than the old distances .
*/
// Update the distance matrix in the range [start, idx1).
for ( j = active_nodes . start ; j < idx1 ; j = active_nodes . succ [ j ] ) {
f_complete ( & D_ ( j , idx2 ) , D_ ( j , idx1 ) ) ;
if ( n_nghbr [ j ] = = idx1 )
n_nghbr [ j ] = idx2 ;
}
// Update the distance matrix in the range (idx1, idx2).
for ( ; j < idx2 ; j = active_nodes . succ [ j ] )
f_complete ( & D_ ( j , idx2 ) , D_ ( idx1 , j ) ) ;
// Update the distance matrix in the range (idx2, N).
for ( j = active_nodes . succ [ idx2 ] ; j < N ; j = active_nodes . succ [ j ] )
f_complete ( & D_ ( idx2 , j ) , D_ ( idx1 , j ) ) ;
break ;
case METHOD_METR_AVERAGE : {
/*
Average linkage .
Shorter and longer distances can occur .
*/
// Update the distance matrix in the range [start, idx1).
t_float s = size1 / ( size1 + size2 ) ;
t_float t = size2 / ( size1 + size2 ) ;
for ( j = active_nodes . start ; j < idx1 ; j = active_nodes . succ [ j ] ) {
f_average ( & D_ ( j , idx2 ) , D_ ( j , idx1 ) , s , t ) ;
if ( n_nghbr [ j ] = = idx1 )
n_nghbr [ j ] = idx2 ;
}
// Update the distance matrix in the range (idx1, idx2).
for ( ; j < idx2 ; j = active_nodes . succ [ j ] ) {
f_average ( & D_ ( j , idx2 ) , D_ ( idx1 , j ) , s , t ) ;
if ( D_ ( j , idx2 ) < mindist [ j ] ) {
nn_distances . update_leq ( j , D_ ( j , idx2 ) ) ;
n_nghbr [ j ] = idx2 ;
}
}
// Update the distance matrix in the range (idx2, N).
if ( idx2 < N_1 ) {
n_nghbr [ idx2 ] = j = active_nodes . succ [ idx2 ] ; // exists, maximally N-1
f_average ( & D_ ( idx2 , j ) , D_ ( idx1 , j ) , s , t ) ;
min = D_ ( idx2 , j ) ;
for ( j = active_nodes . succ [ j ] ; j < N ; j = active_nodes . succ [ j ] ) {
f_average ( & D_ ( idx2 , j ) , D_ ( idx1 , j ) , s , t ) ;
if ( D_ ( idx2 , j ) < min ) {
min = D_ ( idx2 , j ) ;
n_nghbr [ idx2 ] = j ;
}
}
nn_distances . update ( idx2 , min ) ;
}
break ;
}
case METHOD_METR_WEIGHTED :
/*
Weighted linkage .
Shorter and longer distances can occur .
*/
// Update the distance matrix in the range [start, idx1).
for ( j = active_nodes . start ; j < idx1 ; j = active_nodes . succ [ j ] ) {
f_weighted ( & D_ ( j , idx2 ) , D_ ( j , idx1 ) ) ;
if ( n_nghbr [ j ] = = idx1 )
n_nghbr [ j ] = idx2 ;
}
// Update the distance matrix in the range (idx1, idx2).
for ( ; j < idx2 ; j = active_nodes . succ [ j ] ) {
f_weighted ( & D_ ( j , idx2 ) , D_ ( idx1 , j ) ) ;
if ( D_ ( j , idx2 ) < mindist [ j ] ) {
nn_distances . update_leq ( j , D_ ( j , idx2 ) ) ;
n_nghbr [ j ] = idx2 ;
}
}
// Update the distance matrix in the range (idx2, N).
if ( idx2 < N_1 ) {
n_nghbr [ idx2 ] = j = active_nodes . succ [ idx2 ] ; // exists, maximally N-1
f_weighted ( & D_ ( idx2 , j ) , D_ ( idx1 , j ) ) ;
min = D_ ( idx2 , j ) ;
for ( j = active_nodes . succ [ j ] ; j < N ; j = active_nodes . succ [ j ] ) {
f_weighted ( & D_ ( idx2 , j ) , D_ ( idx1 , j ) ) ;
if ( D_ ( idx2 , j ) < min ) {
min = D_ ( idx2 , j ) ;
n_nghbr [ idx2 ] = j ;
}
}
nn_distances . update ( idx2 , min ) ;
}
break ;
case METHOD_METR_WARD :
/*
Ward linkage .
Shorter and longer distances can occur , not smaller than min ( d1 , d2 )
but maybe bigger than max ( d1 , d2 ) .
*/
// Update the distance matrix in the range [start, idx1).
for ( j = active_nodes . start ; j < idx1 ; j = active_nodes . succ [ j ] ) {
f_ward ( & D_ ( j , idx2 ) , D_ ( j , idx1 ) , mindist [ idx1 ] ,
size1 , size2 , static_cast < t_float > ( members [ j ] ) ) ;
if ( n_nghbr [ j ] = = idx1 )
n_nghbr [ j ] = idx2 ;
}
// Update the distance matrix in the range (idx1, idx2).
for ( ; j < idx2 ; j = active_nodes . succ [ j ] ) {
f_ward ( & D_ ( j , idx2 ) , D_ ( idx1 , j ) , mindist [ idx1 ] , size1 , size2 ,
static_cast < t_float > ( members [ j ] ) ) ;
if ( D_ ( j , idx2 ) < mindist [ j ] ) {
nn_distances . update_leq ( j , D_ ( j , idx2 ) ) ;
n_nghbr [ j ] = idx2 ;
}
}
// Update the distance matrix in the range (idx2, N).
if ( idx2 < N_1 ) {
n_nghbr [ idx2 ] = j = active_nodes . succ [ idx2 ] ; // exists, maximally N-1
f_ward ( & D_ ( idx2 , j ) , D_ ( idx1 , j ) , mindist [ idx1 ] ,
size1 , size2 , static_cast < t_float > ( members [ j ] ) ) ;
min = D_ ( idx2 , j ) ;
for ( j = active_nodes . succ [ j ] ; j < N ; j = active_nodes . succ [ j ] ) {
f_ward ( & D_ ( idx2 , j ) , D_ ( idx1 , j ) , mindist [ idx1 ] ,
size1 , size2 , static_cast < t_float > ( members [ j ] ) ) ;
if ( D_ ( idx2 , j ) < min ) {
min = D_ ( idx2 , j ) ;
n_nghbr [ idx2 ] = j ;
}
}
nn_distances . update ( idx2 , min ) ;
}
break ;
case METHOD_METR_CENTROID : {
/*
Centroid linkage .
Shorter and longer distances can occur , not bigger than max ( d1 , d2 )
but maybe smaller than min ( d1 , d2 ) .
*/
// Update the distance matrix in the range [start, idx1).
t_float s = size1 / ( size1 + size2 ) ;
t_float t = size2 / ( size1 + size2 ) ;
t_float stc = s * t * mindist [ idx1 ] ;
for ( j = active_nodes . start ; j < idx1 ; j = active_nodes . succ [ j ] ) {
f_centroid ( & D_ ( j , idx2 ) , D_ ( j , idx1 ) , stc , s , t ) ;
if ( D_ ( j , idx2 ) < mindist [ j ] ) {
nn_distances . update_leq ( j , D_ ( j , idx2 ) ) ;
n_nghbr [ j ] = idx2 ;
}
else if ( n_nghbr [ j ] = = idx1 )
n_nghbr [ j ] = idx2 ;
}
// Update the distance matrix in the range (idx1, idx2).
for ( ; j < idx2 ; j = active_nodes . succ [ j ] ) {
f_centroid ( & D_ ( j , idx2 ) , D_ ( idx1 , j ) , stc , s , t ) ;
if ( D_ ( j , idx2 ) < mindist [ j ] ) {
nn_distances . update_leq ( j , D_ ( j , idx2 ) ) ;
n_nghbr [ j ] = idx2 ;
}
}
// Update the distance matrix in the range (idx2, N).
if ( idx2 < N_1 ) {
n_nghbr [ idx2 ] = j = active_nodes . succ [ idx2 ] ; // exists, maximally N-1
f_centroid ( & D_ ( idx2 , j ) , D_ ( idx1 , j ) , stc , s , t ) ;
min = D_ ( idx2 , j ) ;
for ( j = active_nodes . succ [ j ] ; j < N ; j = active_nodes . succ [ j ] ) {
f_centroid ( & D_ ( idx2 , j ) , D_ ( idx1 , j ) , stc , s , t ) ;
if ( D_ ( idx2 , j ) < min ) {
min = D_ ( idx2 , j ) ;
n_nghbr [ idx2 ] = j ;
}
}
nn_distances . update ( idx2 , min ) ;
}
break ;
}
case METHOD_METR_MEDIAN : {
/*
Median linkage .
Shorter and longer distances can occur , not bigger than max ( d1 , d2 )
but maybe smaller than min ( d1 , d2 ) .
*/
// Update the distance matrix in the range [start, idx1).
t_float c_4 = mindist [ idx1 ] * .25 ;
for ( j = active_nodes . start ; j < idx1 ; j = active_nodes . succ [ j ] ) {
f_median ( & D_ ( j , idx2 ) , D_ ( j , idx1 ) , c_4 ) ;
if ( D_ ( j , idx2 ) < mindist [ j ] ) {
nn_distances . update_leq ( j , D_ ( j , idx2 ) ) ;
n_nghbr [ j ] = idx2 ;
}
else if ( n_nghbr [ j ] = = idx1 )
n_nghbr [ j ] = idx2 ;
}
// Update the distance matrix in the range (idx1, idx2).
for ( ; j < idx2 ; j = active_nodes . succ [ j ] ) {
f_median ( & D_ ( j , idx2 ) , D_ ( idx1 , j ) , c_4 ) ;
if ( D_ ( j , idx2 ) < mindist [ j ] ) {
nn_distances . update_leq ( j , D_ ( j , idx2 ) ) ;
n_nghbr [ j ] = idx2 ;
}
}
// Update the distance matrix in the range (idx2, N).
if ( idx2 < N_1 ) {
n_nghbr [ idx2 ] = j = active_nodes . succ [ idx2 ] ; // exists, maximally N-1
f_median ( & D_ ( idx2 , j ) , D_ ( idx1 , j ) , c_4 ) ;
min = D_ ( idx2 , j ) ;
for ( j = active_nodes . succ [ j ] ; j < N ; j = active_nodes . succ [ j ] ) {
f_median ( & D_ ( idx2 , j ) , D_ ( idx1 , j ) , c_4 ) ;
if ( D_ ( idx2 , j ) < min ) {
min = D_ ( idx2 , j ) ;
n_nghbr [ idx2 ] = j ;
}
}
nn_distances . update ( idx2 , min ) ;
}
break ;
}
default :
throw std : : runtime_error ( std : : string ( " Invalid method. " ) ) ;
}
}
# ifdef FE_INVALID
if ( fetestexcept ( FE_INVALID ) ) throw fenv_error ( ) ;
# endif
}
/*
Clustering methods for vector data
*/
template < typename t_dissimilarity >
static void MST_linkage_core_vector ( const t_index N ,
t_dissimilarity & dist ,
cluster_result & Z2 ) {
/*
N : integer , number of data points
dist : function pointer to the metric
Z2 : output data structure
The basis of this algorithm is an algorithm by Rohlf :
F . James Rohlf , Hierarchical clustering using the minimum spanning tree ,
The Computer Journal , vol . 16 , 1973 , p . 93 – 95.
*/
t_index i ;
t_index idx2 ;
doubly_linked_list active_nodes ( N ) ;
auto_array_ptr < t_float > d ( N ) ;
t_index prev_node ;
t_float min ;
// first iteration
idx2 = 1 ;
min = std : : numeric_limits < t_float > : : infinity ( ) ;
for ( i = 1 ; i < N ; + + i ) {
d [ i ] = dist ( 0 , i ) ;
# if HAVE_DIAGNOSTIC
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wfloat-equal"
# endif
if ( d [ i ] < min ) {
min = d [ i ] ;
idx2 = i ;
}
else if ( fc_isnan ( d [ i ] ) )
throw ( nan_error ( ) ) ;
# if HAVE_DIAGNOSTIC
# pragma GCC diagnostic pop
# endif
}
Z2 . append ( 0 , idx2 , min ) ;
for ( t_index j = 1 ; j < N - 1 ; + + j ) {
prev_node = idx2 ;
active_nodes . remove ( prev_node ) ;
idx2 = active_nodes . succ [ 0 ] ;
min = d [ idx2 ] ;
for ( i = idx2 ; i < N ; i = active_nodes . succ [ i ] ) {
t_float tmp = dist ( i , prev_node ) ;
# if HAVE_DIAGNOSTIC
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wfloat-equal"
# endif
if ( d [ i ] > tmp )
d [ i ] = tmp ;
else if ( fc_isnan ( tmp ) )
throw ( nan_error ( ) ) ;
# if HAVE_DIAGNOSTIC
# pragma GCC diagnostic pop
# endif
if ( d [ i ] < min ) {
min = d [ i ] ;
idx2 = i ;
}
}
Z2 . append ( prev_node , idx2 , min ) ;
}
}
template <method_codes_vector method, typename t_dissimilarity>
static void generic_linkage_vector(const t_index N,
                                   t_dissimilarity & dist,
                                   cluster_result & Z2) {
  /*
  Generic agglomerative clustering for vector data ("stored data approach").

  N    : integer, number of data points
  dist : function pointer to the metric
  Z2   : output data structure (stepwise dendrogram)

  This algorithm is valid for the distance update methods
  "Ward", "centroid" and "median" only!

  Invariant maintained throughout: for every active row j < N-1,
  mindist[j] is a lower bound on min_{k>j} D(j,k), and n_nghbr[j] is the
  candidate nearest neighbor. Stale candidates are detected lazily when a
  row reaches the top of the heap.
  */
  const t_index N_1 = N - 1;
  t_index i, j;                           // loop variables
  t_index idx1, idx2;                     // row and column indices
  auto_array_ptr<t_index> n_nghbr(N_1);   // array of nearest neighbors
  auto_array_ptr<t_float> mindist(N_1);   // distances to the nearest neighbors
  auto_array_ptr<t_index> row_repr(N);    // row_repr[i]: node number that the
                                          // i-th row represents
  doubly_linked_list active_nodes(N);
  binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for
  // the distance to the nearest neighbor of each point
  t_index node1, node2;                   // node numbers in the output
  t_float min;                            // minimum for nearest-neighbor search

  // Build a list of row ↔ node label assignments. Initially i ↦ i.
  for (i = 0; i < N; ++i)
    row_repr[i] = i;

  // Initialize the minimal distances:
  // find the nearest neighbor of each point.
  // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
  for (i = 0; i < N_1; ++i) {
    min = std::numeric_limits<t_float>::infinity();
    t_index idx;
    for (idx = j = i + 1; j < N; ++j) {
      t_float tmp;
      switch (method) {
      case METHOD_VECTOR_WARD:
        tmp = dist.ward_initial(i, j);
        break;
      default:
        tmp = dist.template sqeuclidean<true>(i, j);
      }
      if (tmp < min) {
        min = tmp;
        idx = j;
      }
    }
    switch (method) {
    case METHOD_VECTOR_WARD:
      // Ward distances are searched in a cheaper transformed form;
      // convert the winning value back before storing it.
      mindist[i] = t_dissimilarity::ward_initial_conversion(min);
      break;
    default:
      mindist[i] = min;
    }
    n_nghbr[i] = idx;
  }

  // Put the minimal distances into a heap structure to make the repeated
  // global minimum searches fast.
  nn_distances.heapify();

  // Main loop: We have N-1 merging steps.
  for (i = 0; i < N_1; ++i) {
    idx1 = nn_distances.argmin();
    // If the candidate neighbor has been merged away, the cached pair is
    // stale: recompute the true minimum for idx1 and re-examine the heap.
    while (active_nodes.is_inactive(n_nghbr[idx1])) {
      // Recompute the minimum mindist[idx1] and n_nghbr[idx1].
      n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1
      switch (method) {
      case METHOD_VECTOR_WARD:
        min = dist.ward(idx1, j);
        for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
          t_float const tmp = dist.ward(idx1, j);
          if (tmp < min) {
            min = tmp;
            n_nghbr[idx1] = j;
          }
        }
        break;
      default:
        min = dist.template sqeuclidean<true>(idx1, j);
        for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
          t_float const tmp = dist.template sqeuclidean<true>(idx1, j);
          if (tmp < min) {
            min = tmp;
            n_nghbr[idx1] = j;
          }
        }
      }
      /* Update the heap with the new true minimum and search for the
         (possibly different) minimal entry. */
      nn_distances.update_geq(idx1, min);
      idx1 = nn_distances.argmin();
    }

    nn_distances.heap_pop(); // Remove the current minimum from the heap.
    idx2 = n_nghbr[idx1];

    // Write the newly found minimal pair of nodes to the output array.
    node1 = row_repr[idx1];
    node2 = row_repr[idx2];
    Z2.append(node1, node2, mindist[idx1]);

    // Merge the cluster centers/representatives in the dist object.
    switch (method) {
    case METHOD_VECTOR_WARD:
    case METHOD_VECTOR_CENTROID:
      dist.merge_inplace(idx1, idx2);
      break;
    case METHOD_VECTOR_MEDIAN:
      dist.merge_inplace_weighted(idx1, idx2);
      break;
    default:
      throw std::runtime_error(std::string("Invalid method."));
    }

    // Index idx2 now represents the new (merged) node with label N+i.
    row_repr[idx2] = N + i;
    // Remove idx1 from the list of active indices (active_nodes).
    active_nodes.remove(idx1); // NOTE(review): marked "TBD later" upstream —
                               // removal order relative to the updates below
                               // is load-bearing; do not move.

    // Update the nearest-neighbor candidates of the remaining rows.
    switch (method) {
    case METHOD_VECTOR_WARD:
      /*
      Ward linkage.
      Shorter and longer distances can occur, not smaller than min(d1,d2)
      but maybe bigger than max(d1,d2).
      */
      // Update the distance matrix in the range [start, idx1).
      // Distances to idx2 may have grown, so a cached pointer to idx2 is
      // only invalidated (redirected to the now-inactive idx1), forcing a
      // lazy recomputation later.
      for (j = active_nodes.start; j < idx1; j = active_nodes.succ[j]) {
        if (n_nghbr[j] == idx2) {
          n_nghbr[j] = idx1; // invalidate
        }
      }
      // Update the distance matrix in the range (idx1, idx2).
      for (; j < idx2; j = active_nodes.succ[j]) {
        t_float const tmp = dist.ward(j, idx2);
        if (tmp < mindist[j]) {
          nn_distances.update_leq(j, tmp);
          n_nghbr[j] = idx2;
        }
        else if (n_nghbr[j] == idx2) {
          n_nghbr[j] = idx1; // invalidate
        }
      }
      // Find the nearest neighbor for idx2.
      if (idx2 < N_1) {
        n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
        min = dist.ward(idx2, j);
        for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
          t_float const tmp = dist.ward(idx2, j);
          if (tmp < min) {
            min = tmp;
            n_nghbr[idx2] = j;
          }
        }
        nn_distances.update(idx2, min);
      }
      break;

    default:
      /*
      Centroid and median linkage.
      Shorter and longer distances can occur, not bigger than max(d1,d2)
      but maybe smaller than min(d1,d2).
      */
      for (j = active_nodes.start; j < idx2; j = active_nodes.succ[j]) {
        t_float const tmp = dist.template sqeuclidean<true>(j, idx2);
        if (tmp < mindist[j]) {
          nn_distances.update_leq(j, tmp);
          n_nghbr[j] = idx2;
        }
        else if (n_nghbr[j] == idx2)
          n_nghbr[j] = idx1; // invalidate
      }
      // Find the nearest neighbor for idx2.
      if (idx2 < N_1) {
        n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
        min = dist.template sqeuclidean<true>(idx2, j);
        for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
          t_float const tmp = dist.template sqeuclidean<true>(idx2, j);
          if (tmp < min) {
            min = tmp;
            n_nghbr[idx2] = j;
          }
        }
        nn_distances.update(idx2, min);
      }
    }
  }
}
template <method_codes_vector method, typename t_dissimilarity>
static void generic_linkage_vector_alternative(const t_index N,
                                               t_dissimilarity & dist,
                                               cluster_result & Z2) {
  /*
  Alternative nearest-neighbor update scheme for vector data.

  N    : integer, number of data points
  dist : function pointer to the metric
  Z2   : output data structure (stepwise dendrogram)

  This algorithm is valid for the distance update methods
  "Ward", "centroid" and "median" only!
  */
  const t_index N_1 = N - 1;
  t_index i, j = 0;                         // loop variables
  t_index idx1, idx2;                       // row and column indices
  // Arrays are sized 2N-2 because merged nodes get fresh indices N..2N-2
  // (SciPy labeling convention) instead of reusing an old row.
  auto_array_ptr<t_index> n_nghbr(2 * N - 2); // array of nearest neighbors
  auto_array_ptr<t_float> mindist(2 * N - 2); // distances to the nearest neighbors
  doubly_linked_list active_nodes(N + N_1);
  binary_min_heap nn_distances(&*mindist, N_1, 2 * N - 2, 1); // minimum heap
  // structure for the distance to the nearest neighbor of each point
  t_float min; // minimum for nearest-neighbor searches

  // Initialize the minimal distances:
  // find the nearest neighbor of each point.
  // n_nghbr[i] = argmin_{j<i} D(i,j) for i in range(1,N)
  // (note: neighbors of *lower* index, unlike generic_linkage_vector).
  for (i = 1; i < N; ++i) {
    min = std::numeric_limits<t_float>::infinity();
    t_index idx;
    for (idx = j = 0; j < i; ++j) {
      t_float tmp;
      switch (method) {
      case METHOD_VECTOR_WARD:
        tmp = dist.ward_initial(i, j);
        break;
      default:
        tmp = dist.template sqeuclidean<true>(i, j);
      }
      if (tmp < min) {
        min = tmp;
        idx = j;
      }
    }
    switch (method) {
    case METHOD_VECTOR_WARD:
      // Convert the cheaper transformed Ward search value back to the
      // true distance before storing it.
      mindist[i] = t_dissimilarity::ward_initial_conversion(min);
      break;
    default:
      mindist[i] = min;
    }
    n_nghbr[i] = idx;
  }

  // Put the minimal distances into a heap structure to make the repeated
  // global minimum searches fast.
  nn_distances.heapify();

  // Main loop: We have N-1 merging steps.
  for (i = N; i < N + N_1; ++i) {
    /*
    The bookkeeping is different from the "stored matrix approach" algorithm
    generic_linkage.

    mindist[i] stores a lower bound on the minimum distance of the point i to
    all points of *lower* index:

      mindist[i] ≥ min_{j<i} D(i,j)

    Moreover, new nodes do not re-use one of the old indices, but they are
    given a new, unique index (SciPy convention: initial nodes are 0,…,N−1,
    new nodes are N,…,2N−2).

    Invalid nearest neighbors are not recognized by the fact that the stored
    distance is smaller than the actual distance, but the list active_nodes
    maintains a flag whether a node is inactive. If n_nghbr[i] points to an
    active node, the entries nn_distances[i] and n_nghbr[i] are valid,
    otherwise they must be recomputed.
    */
    idx1 = nn_distances.argmin();
    while (active_nodes.is_inactive(n_nghbr[idx1])) {
      // Recompute the minimum mindist[idx1] and n_nghbr[idx1]
      // by scanning all active nodes of lower index.
      n_nghbr[idx1] = j = active_nodes.start;
      switch (method) {
      case METHOD_VECTOR_WARD:
        min = dist.ward_extended(idx1, j);
        for (j = active_nodes.succ[j]; j < idx1; j = active_nodes.succ[j]) {
          t_float tmp = dist.ward_extended(idx1, j);
          if (tmp < min) {
            min = tmp;
            n_nghbr[idx1] = j;
          }
        }
        break;
      default:
        min = dist.sqeuclidean_extended(idx1, j);
        for (j = active_nodes.succ[j]; j < idx1; j = active_nodes.succ[j]) {
          t_float const tmp = dist.sqeuclidean_extended(idx1, j);
          if (tmp < min) {
            min = tmp;
            n_nghbr[idx1] = j;
          }
        }
      }
      /* Update the heap with the new true minimum and search for the
         (possibly different) minimal entry. */
      nn_distances.update_geq(idx1, min);
      idx1 = nn_distances.argmin();
    }

    idx2 = n_nghbr[idx1];
    active_nodes.remove(idx1);
    active_nodes.remove(idx2);
    Z2.append(idx1, idx2, mindist[idx1]);

    // Unless this was the final merge, create the new node with label i
    // and find its nearest neighbor among the remaining active nodes.
    if (i < 2 * N_1) {
      switch (method) {
      case METHOD_VECTOR_WARD:
      case METHOD_VECTOR_CENTROID:
        dist.merge(idx1, idx2, i);
        break;
      case METHOD_VECTOR_MEDIAN:
        dist.merge_weighted(idx1, idx2, i);
        break;
      default:
        throw std::runtime_error(std::string("Invalid method."));
      }

      n_nghbr[i] = active_nodes.start;
      if (method == METHOD_VECTOR_WARD) {
        /*
        Ward linkage.
        Shorter and longer distances can occur, not smaller than min(d1,d2)
        but maybe bigger than max(d1,d2).
        */
        min = dist.ward_extended(active_nodes.start, i);
        for (j = active_nodes.succ[active_nodes.start]; j < i;
             j = active_nodes.succ[j]) {
          t_float tmp = dist.ward_extended(j, i);
          if (tmp < min) {
            min = tmp;
            n_nghbr[i] = j;
          }
        }
      }
      else {
        /*
        Centroid and median linkage.
        Shorter and longer distances can occur, not bigger than max(d1,d2)
        but maybe smaller than min(d1,d2).
        */
        min = dist.sqeuclidean_extended(active_nodes.start, i);
        for (j = active_nodes.succ[active_nodes.start]; j < i;
             j = active_nodes.succ[j]) {
          t_float tmp = dist.sqeuclidean_extended(j, i);
          if (tmp < min) {
            min = tmp;
            n_nghbr[i] = j;
          }
        }
      }

      // Drop the larger of the two merged indices from the heap and let the
      // smaller one's slot carry the new node i with its fresh minimum.
      if (idx2 < active_nodes.start) {
        nn_distances.remove(active_nodes.start);
      } else {
        nn_distances.remove(idx2);
      }
      nn_distances.replace(idx1, i, min);
    }
  }
}
# if HAVE_VISIBILITY
# pragma GCC visibility pop
# endif